In [1]:
import pandas as pd
import os
import re
import xml.etree.cElementTree as et
from lxml import etree

import spacy
from spacy.lang.en import English

In [2]:
df=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/parsed_xml_clean.pkl')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990262 entries, 0 to 990261
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   TextLemmatized  990262 non-null  object        
 1   ID              990262 non-null  object        
 2   Title           990262 non-null  object        
 3   Type            990262 non-null  object        
 4   StartDate       990262 non-null  datetime64[ns]
 5   EndDate         990262 non-null  object        
 6   Text            990262 non-null  object        
 7   TextWordCount   990262 non-null  object        
 8   PubTitle        990262 non-null  object        
 9   SourceType      990262 non-null  object        
 10  Year            990262 non-null  float64       
 11  Month           990262 non-null  float64       
 12  Newspaper       990262 non-null  object        
 13  GroupNo         990262 non-null  int64         
dtypes: datetime64[ns](1), float64(2), in

In [3]:
# Check date range
print("Date range:",min(df['StartDate']), max(df['StartDate']))

Date range: 1985-01-01 00:00:00 2021-12-31 00:00:00


In [4]:
# Function to remove multiple spaces
def remove_spaces(text):
    text=re.sub(' +',' ',text).strip()
    text=text.replace('\n',' ').replace('\r',' ')
    return text

In [5]:
nlp = English()  # just the language with no model
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

In [6]:
# Function to print one XML example
def print_xml(ID):
    tree = etree.parse(filePath+ID+'.xml')
    xml = etree.tostring(tree, encoding="unicode", pretty_print=True)
    print(xml)

In [7]:
# Function to remove html tags from a string
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [15]:
# Function to identify the sentence with "*regulat*" and a sentence before and after (expanded regulatory sentences)
def extractSentenceBeforeAfter(text):
    sentSet=set()
    text=remove_spaces(text)
    doc=nlp(text)
    sentList=list(doc.sents)
    for i in range(0, len(sentList)):
        sent=sentList[i].text.strip()
        if len(re.findall('regulat',sent,re.IGNORECASE))>0:
            sentSet.add(sent)
            if i>0:
                sentSet.add(sentList[i-1].text.strip())
            if i<len(sentList)-1:
                sentSet.add(sentList[i+1].text.strip())
    sentText=' '.join(sentSet)
    return sentText

In [None]:
# Extract expanded regulatory sentences
regsents_expand=[]
for text in df['Text']:
    new=extractSentenceBeforeAfter(text)
    regsents_expand.append(new)
print(len(regsents_expand))

990262


In [None]:
print(regsents_expand[0])
print(regsents_expand[-1])

The second-biggest maker of heart devices said federal regulators cleared three new heart products, including two defibrillators. $6.36 8.9% Genentech Inc. stock surged to $77.96. $0.74 6% Shares of Boston Scientific Corp. increased to $13.05.
A9 THE WORLD  Nobel Peace Prize Ethiopia's reformer prime minister, Abiy Ahmed, ended his nation's border standoff with Eritrea. You have to look beyond his canvases to see why. Arts &amp; Style  THE ECONOMY Boeing, FAA are faulted Regulators had "inadequate awareness" of a key system on the 737 Max airliner, a report says.


In [18]:
df['RegSentsExpand']=regsents_expand

In [19]:
# Length of regulatory sections
df['RegSentExpandLength']=df['RegSentsExpand'].str.len()
print(df.sort_values('RegSentExpandLength',ascending=False)[['ID','RegSentExpandLength']].head(10))

                ID  RegSentExpandLength
623338  2222510549                46878
139357  1824047135                38644
379661  1824047069                38518
315438  1872743297                35818
139358  1798874942                28365
255238  1859383206                27524
447418  1845766676                25873
392328  1886296496                25038
379394  1867515064                24706
379444  1857676896                24410


In [29]:
print('# of articles with no "*regulat*" in full text:',df[df['RegSentExpandLength']==0]['ID'].nunique())

# of articles with no "*regulat*" in full text: 46496


In [22]:
# Sort df
df=df.sort_values(['Newspaper','StartDate','Title']).reset_index(drop=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990262 entries, 0 to 990261
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   TextLemmatized       990262 non-null  object        
 1   ID                   990262 non-null  object        
 2   Title                990262 non-null  object        
 3   Type                 990262 non-null  object        
 4   StartDate            990262 non-null  datetime64[ns]
 5   EndDate              990262 non-null  object        
 6   Text                 990262 non-null  object        
 7   TextWordCount        990262 non-null  object        
 8   PubTitle             990262 non-null  object        
 9   SourceType           990262 non-null  object        
 10  Year                 990262 non-null  float64       
 11  Month                990262 non-null  float64       
 12  Newspaper            990262 non-null  object        
 13  GroupNo       

In [25]:
df.drop(['TextLemmatized','Text','GroupNo'],axis=1).to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/allRegSentsExpand.pkl')