In [1]:
import pandas as pd
import os
import re
import xml.etree.cElementTree as et
from lxml import etree

import spacy
from spacy.lang.en import English

In [2]:
nlp = English()  # just the language with no model
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

## Import news articles

In [None]:
# All news articles
df=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/parsed_xml.pkl')
print(df.info())

In [None]:
print(df['Newspaper'].value_counts())

## 1. Identify "regulatory sections" (a sentence with "regulat*" and its neighbor sentences)

In [3]:
# Function to remove multiple spaces
def remove_spaces(text):
    text=re.sub(' +',' ',text).strip()
    text=text.replace('\n',' ').replace('\r',' ')
    return text

In [6]:
# Function to identify the sentence with "*regulat*" and a sentence before and after (expanded regulatory sentences)
def extractSentenceBeforeAfter(text):
    sentSet=set()
    text=remove_spaces(text)
    doc=nlp(text)
    sentList=list(doc.sents)
    for i in range(0, len(sentList)):
        sent=sentList[i].text.strip()
        if len(re.findall('regulat',sent.lower()))>0:
            sentSet.add(sent)
            if i>0:
                sentSet.add(sentList[i-1].text.strip())
            if i<len(sentList)-1:
                sentSet.add(sentList[i+1].text.strip())
    sentText=' '.join(sentSet)
    return sentText

In [7]:
# Extract expanded regulatory sentences
regsents_expand=[]
for text in df['Text']:
    new=extractSentenceBeforeAfter(text)
    regsents_expand.append(new)
print(len(regsents_expand))

822737


In [8]:
print(regsents_expand[-1])

Mr. Abe's critics say he failed to address structural problems such as a declining population and onerous government regulation in order to boost growth potential. Without those changes -- which Mr. Abe termed the "third arrow" of Abenomics -- BOJ policy wasn't enough, they say. Japan has been dealing with all of those problems since the 1990s -- and the BOJ already set the same goal of overshooting the 2% target four years ago, with little effect.


In [9]:
df['RegSentsExpand']=regsents_expand

In [10]:
df['RegSentExpandLen']=df['RegSentsExpand'].str.len()
print(df.sort_values('RegSentExpandLen',ascending=False)[['ID','RegSentExpandLen']].head(10))

                ID  RegSentExpandLen
785285  1824047135             38644
785284  1824047069             38518
780573  1798874942             28365
790351  1859383206             27524
788561  1845766676             25873
793733  1886296496             25038
791594  1867515064             24706
790140  1857676896             24410
698039   847586480             24403
792784  1877776183             24136


In [11]:
print(df.head())

          ID                                            Title  Type  \
0  294326637             1984: IT WAS THE YEAR OF THE BIG LIE  News   
1  294308147  CONTROLS LIFTED ON ABOUT HALF OF US NATURAL GAS  News   
2  294323196              COURT UPHOLDS DIABLO CANYON LICENSE  News   
3  294311708                      HUNT'S IMMACULATE RECEPTION  News   
4  294262284                          LEGISLATIVE REPORT CARD  News   

   StartDate     EndDate                                               Text  \
0 1985-01-01  1985-01-01  ART BUCHWALD Art Buchwald is a syndicated colu...   
1 1985-01-01  1985-01-01  After 30 years of strict federal control, pric...   
2 1985-01-01  1985-01-01  A federal appeals court yesterday upheld the N...   
3 1985-01-01  1985-01-01  COTTON BOWL '85 / JOHN ROBINSON John Robinson ...   
4 1985-01-01  1985-01-01  Much of Beacon Hill's 1984 legislative activit...   

  TextWordCount                          PubTitle  SourceType  Year  Month  \
0           422  Bos

In [12]:
df[['ID','RegSentsExpand']].to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Reg Relevance/allRegSentsExpand.pkl')

## A1. Alternative approach 1: identify sentences with "regulat*"

In [5]:
# Function to identify sentences with "*regulat*"
def extractSentence(text):
    sentSet=set()    # use set to avoid duplicated sentences
    text=remove_spaces(text)
    doc=nlp(text)
    for item in doc.sents:
        sent=item.text.strip()
        if len(re.findall('regulat',sent.lower()))>0:
            sentSet.add(sent)
    sentText=' '.join(sentSet)
    return sentText

In [None]:
# Extract regulatory sentences
regsents=[]
for text in df['Text']:
    new=extractSentence(text)
    regsents.append(new)
print(len(regsents))

In [None]:
df['RegSents']=regsents

In [None]:
print(df.info())

In [None]:
print(df['RegSents'][200000])

In [None]:
# Function to identify titles with "*regulat*"
def extractTitle(title):
    title=remove_spaces(str(title))
    if len(re.findall('regulat',title.lower()))>0:
        regTitle=title
    else:
        regTitle=""
    return regTitle

In [None]:
# Extract regulatory titles
regtitles=[]
for title in df['Title']:
    new=extractTitle(title)
    regtitles.append(new)

print(len(regtitles), regtitles[0])

In [None]:
df['RegTitles']=regtitles

In [None]:
print(df[df['RegTitles']!='']['ID'].nunique())

In [None]:
print(df[df['RegTitles']!=''][['ID','RegTitles']][0:5])

In [None]:
print(df.info())

In [None]:
df[['ID','RegSents','RegTitles']].to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Reg Relevance/allRegSents.pkl')

In [None]:
df.to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Reg Relevance/allRegNews_RegSents.pkl')

## A2. Alternative approach 2: identify sections with "regulat*"

In [None]:
# Function to print one XML example
def print_xml(ID):
    tree = etree.parse(filePath+ID+'.xml')
    xml = etree.tostring(tree, encoding="unicode", pretty_print=True)
    print(xml)

In [None]:
filePath='/home/ec2-user/SageMaker/data/corpus/regnews/'
filePath2='/home/ec2-user/SageMaker/data/corpus/regnews2/'

In [None]:
# See an example
print_xml('398770257')

In [1]:
# Function to remove html tags from a string
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [None]:
# Function to extract sections with "regulat"
def extractSection(text):
    secSet=set()
    secStartPos=[m.end() for m in re.finditer('&lt;p&gt',text)]
    secStartPos=secStartPos+[m.end() for m in re.finditer('<p>',text)]
    secStartPos=secStartPos+[m.end() for m in re.finditer('<ul>',text)]
    secEndPos=[m.start() for m in re.finditer('&lt;/p&gt',text)]
    secEndPos=secEndPos+[m.start() for m in re.finditer('</p>',text)]
    secEndPos=secEndPos+[m.start() for m in re.finditer('</ul>',text)]
    regPos=[m.start() for m in re.finditer('regulat',text.lower())]

    if len(regPos)>0:
        for reg in regPos:
            start=-99999
            end=99999
            for sec in secStartPos:
                if (sec-reg<=0) & (sec>start):
                    start=sec
            for sec in secEndPos:
                    if (sec-reg>=0) & (sec<end):
                        end=sec
            secSet.add(text[start:end])
        secText=" ".join(secSet)
    else:
        secText=''
    return secText

In [None]:
# Extract sections
sections=[] 
for ID in df['ID']:
    xmlp = et.XMLParser(encoding="UTF-8")
    try:
        file=filePath+ID+'.xml'
        parsed_xml = et.parse(file,parser=xmlp)
    except:
        file=filePath2+ID+'.xml'
        parsed_xml = et.parse(file,parser=xmlp)
    root = parsed_xml.getroot()
    for child in root.findall('Obj'):
        if root.find('TextInfo')!=None:
            for node in root.iter('Text'):
                text=node.text
                section=extractSection(text) 
                section=remove_html_tags(section)
                sections.append(section)

In [None]:
print(len(sections))
print(sections[50000])

In [None]:
df['RegSections']=sections
df['RegSectionLen']=df['RegSections'].str.len()

In [None]:
print(df.sort_values('RegSectionLen',ascending=False)[df['RegSectionLen']>0][['ID','RegSectionLen']].head(10))

In [None]:
df[['ID','RegSections','RegSectionLen']].to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Reg Relevance/allRegSections.pkl')