In [37]:
import pandas as pd
import os
import datetime
import pickle
import re
import time
from collections import Counter
import numpy as np
import random

In [38]:
import nltk
nltk.data.path
from nltk.tokenize import sent_tokenize, word_tokenize

In [39]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline with the 'parser' and 'ner'
# components disabled; my_preprocessor only reads lemma_, is_punct and
# is_space from the tokens it produces.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

## 3.1 Match regulatory noun chunks

In [18]:
# Define a text preprocessor (lemmatizer)
def my_preprocessor(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Punctuation and whitespace tokens are dropped; the lemmas of the
    remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw text to lemmatize.

    Returns
    -------
    str
        Space-separated lemmas (an empty string when no token survives).
    """
    # Logical `or` replaces the original bitwise `|`: `not a | b` only gave
    # the intended result because `|` binds tighter than `not`.
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)

In [19]:
# Load the dictionary of regulatory noun chunks
df_nounchunks = pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/DictionaryOfRegulatoryNounChunks.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
df_nounchunks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10645 entries, 0 to 10644
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   nc_code      10645 non-null  int64 
 1   noun_chunks  10645 non-null  object
 2   rin          10645 non-null  object
 3   area         10185 non-null  object
 4   area_no      10645 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 415.9+ KB
None


In [20]:
# Noun chunks as a plain Python list; show the total and the first 20 entries
nounchunks = df_nounchunks['noun_chunks'].to_list()
print(len(nounchunks), nounchunks[:20])

10645 ['180-day exclusivity', '1983 amendment', '1988 amendment', '1990 farm bill', '1993 provision', '1994 amendment', '1995 amendment', '1996 agreement', '1996 amendment', '1996 farm bill', '1996 regulation', '1996 telecom act', '1996 telecommunication act', '1999 deadline', '1999 quota', '2002 revision', '2005 amendment', '2005 rule', '2005 transportation bill', '2007 implementation']


In [29]:
# Import expanded reg sentences
# NOTE: read_pickle can execute code embedded in the file; only load
# pickles that this project produced itself.
df_regSentsExpand = pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/allRegSentsExpand.pkl')
# info() prints the summary itself and returns None; the print() around it
# only added a "None" line.
df_regSentsExpand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990262 entries, 0 to 990261
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ID                   990262 non-null  object        
 1   Title                990262 non-null  object        
 2   Type                 990262 non-null  object        
 3   StartDate            990262 non-null  datetime64[ns]
 4   EndDate              990262 non-null  object        
 5   TextWordCount        990262 non-null  object        
 6   PubTitle             990262 non-null  object        
 7   SourceType           990262 non-null  object        
 8   Year                 990262 non-null  float64       
 9   Month                990262 non-null  float64       
 10  Newspaper            990262 non-null  object        
 11  RegSentsExpand       990262 non-null  object        
 12  RegSentExpandLength  990262 non-null  int64         
dtypes: datetime64[

In [30]:
# Pull the expanded reg sentences out of the frame as a plain Python list,
# then show how many there are and what the first one looks like
regSentsExpand = df_regSentsExpand['RegSentsExpand'].to_list()
print(len(regSentsExpand), regSentsExpand[0])

990262 "You can move in tomorrow and you won't have to do one thing to fix it up." "Deregulation of natural gas will lower your heating bills." "Our latest model will give you 40 miles to the gallon."


In [31]:
# Show one raw text (position 4031) followed by its lemmatized form
sample_text = regSentsExpand[4031]
print(sample_text)
print(my_preprocessor(sample_text))

Suddenly, with new Federal Communications Commission regulations promising even greater resurgence, New England hams like Carp find their avocation no longer looks like the faintly outdated fiddling of science nerds and aging Cub Scouts. The FCC announced last January that, effective, as hams say, 0001 March 21, 1987, it would permit substantially enhanced privileges for novice operators, including the transmission of voice. Since a loosening of regulations in late March, the FCC office in Gettysburg, Pa., has reported a three-fold increase in monthly license applications. Known to his radio friends as K-1-Hotel-Lima- Zulu, Carp operates as one of thousands of amateur radio enthusiasts in New England, where by all indications the hobby is flourishing. That removed a key impediment, especially to the involvement of younger aspirants, who previously had bridled at stringent ham licensing regulations requiring new radio operators to use Morse code. Now even novices can send their voice ou

In [11]:
# Lemmatize every expanded reg sentence with my_preprocessor (input order is kept)
regSentsExpand_lemmatized = list(map(my_preprocessor, regSentsExpand))

In [15]:
# Sanity check: number of lemmatized texts, then the first raw text next to
# its lemmatized form
print(len(regSentsExpand_lemmatized),regSentsExpand[0], regSentsExpand_lemmatized[0])

990262 "You can move in tomorrow and you won't have to do one thing to fix it up." "Deregulation of natural gas will lower your heating bills." "Our latest model will give you 40 miles to the gallon." -PRON- can move in tomorrow and -PRON- will not have to do one thing to fix -PRON- up deregulation of natural gas will lower -PRON- heating bill -PRON- late model will give -PRON- 40 mile to the gallon


In [24]:
# Compile one case-insensitive alternation over all regulatory noun chunks.
# Each chunk is escaped so that it is matched literally.  Lookarounds are used
# instead of \b because \b only asserts a boundary next to a word character:
# with \b, a chunk that starts or ends with a non-word character (e.g. ")" or
# "%") could not match when surrounded by spaces.  For chunks that start and
# end with word characters both forms accept exactly the same matches.
# The lookarounds are non-capturing, so findall() still returns whole matches.
# NOTE(review): alternation is leftmost-first, so where one chunk is a
# word-bounded prefix of another, list order decides which one is reported.
pattern = re.compile(
    "|".join(r"(?<!\w)" + re.escape(nc) + r"(?!\w)" for nc in nounchunks),
    re.IGNORECASE,
)

In [None]:
# Match noun chunks in all expanded reg sentences
start_time = time.time()

# One findall per lemmatized text yields the matched strings; the match
# count is the length of that list (an empty list means no match).
nounchunk_match_words = [pattern.findall(text) for text in regSentsExpand_lemmatized]
nounchunk_match = [len(found) for found in nounchunk_match_words]

print("--- %s seconds ---" % (time.time() - start_time))

In [20]:
# Both result lists get one entry per input text, so their lengths should
# agree; also show the matches and match count of the last text
print(len(nounchunk_match), len(nounchunk_match_words))
print(nounchunk_match_words[-1], nounchunk_match[-1])

990262 990262
['public health'] 1


In [21]:
# Attach the results to the frame: per row, the number of matched noun
# chunks and the list of matched strings (the frame is written to disk in
# a later cell). Relies on the lists being in the same order as the rows.
df_regSentsExpand['NounChunkMatchFiltered']=nounchunk_match
df_regSentsExpand['NounChunkMatchWordsFiltered']=nounchunk_match_words

In [22]:
# Preview the first five rows of the frame
print(df_regSentsExpand.head())

          ID                                            Title  Type  \
0  294326637             1984: IT WAS THE YEAR OF THE BIG LIE  News   
1  294308147  CONTROLS LIFTED ON ABOUT HALF OF US NATURAL GAS  News   
2  294323196              COURT UPHOLDS DIABLO CANYON LICENSE  News   
3  294311708                      HUNT'S IMMACULATE RECEPTION  News   
4  294262284                          LEGISLATIVE REPORT CARD  News   

   StartDate     EndDate TextWordCount                          PubTitle  \
0 1985-01-01  1985-01-01           422  Boston Globe (pre-1997 Fulltext)   
1 1985-01-01  1985-01-01           751  Boston Globe (pre-1997 Fulltext)   
2 1985-01-01  1985-01-01           238  Boston Globe (pre-1997 Fulltext)   
3 1985-01-01  1985-01-01          1142  Boston Globe (pre-1997 Fulltext)   
4 1985-01-01  1985-01-01          1570  Boston Globe (pre-1997 Fulltext)   

   SourceType    Year  Month     Newspaper  \
0  Newspapers  1985.0    1.0  Boston Globe   
1  Newspapers  1985.0   

In [24]:
# Number of distinct article IDs with at least one noun-chunk match.
# .loc with a boolean mask replaces the chained df[mask]['ID'] lookup, and
# the "artciles" typo in the printed label is corrected.
has_match = df_regSentsExpand['NounChunkMatchFiltered'] > 0
print('# of reg relevant articles:', df_regSentsExpand.loc[has_match, 'ID'].nunique())

# of reg relevant artciles: 608172


In [27]:
# Distribution of match counts among rows whose RegSentExpandLength is 0
is_empty = df_regSentsExpand['RegSentExpandLength'] == 0
empty_rows = df_regSentsExpand[is_empty]
print(empty_rows['NounChunkMatchFiltered'].value_counts())

0    46496
Name: NounChunkMatchFiltered, dtype: int64


In [25]:
# Examine some examples: among the first 100 rows, print each expanded
# sentence that matched at least one noun chunk, followed by its matches.
# head(100) replaces per-element label lookups over range(0, 100), which
# raised KeyError on a frame with fewer than 100 rows.
preview = df_regSentsExpand.head(100)
for sent, n_match, words in zip(preview['RegSentsExpand'],
                                preview['NounChunkMatchFiltered'],
                                preview['NounChunkMatchWordsFiltered']):
    if n_match > 0:
        print(sent, words)

"You can move in tomorrow and you won't have to do one thing to fix it up." "Deregulation of natural gas will lower your heating bills." "Our latest model will give you 40 miles to the gallon." ['natural gas']
For one thing, federal rules governing the pipeline companies that move gas from producer to consumer forbid rate adjustments based on anticipated changes in the price of decontrolled gas. Because rates can be adjusted only after a price change by producers, any increase or decrease will not show up until next spring or summer when the pipelines file their "purchased gas adjustments" with the Federal Energy Regulatory Administration. But even then, hardly anyone involved in the $65-billion-a-year industry that provides one-fourth of the nation's energy expects any significant price change. ['federal rule', 'price change', 'price change']
A federal appeals court yesterday upheld the Nuclear Regulatory Commission's issuance of an operating license for the Diablo Canyon nuclear powe

In [26]:
# Persist the frame, including the noun-chunk match columns, as a pickle;
# later sections reload it from this path
df_regSentsExpand.to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegSentsExpand_NounChunks.pkl')

## 3.2 Get monthly relevant article counts

In [31]:
# Reload the frame with noun-chunk match columns that section 3.1 wrote to
# this path.
# NOTE: read_pickle can execute code embedded in the file; only load
# pickles that this project produced itself.
df_regSentsExpand=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegSentsExpand_NounChunks.pkl')

In [32]:
# Reg relevant articles: keep rows with at least one noun-chunk match
has_match = df_regSentsExpand['NounChunkMatchFiltered'] > 0
df_reg = df_regSentsExpand[has_match].reset_index(drop=True)
# info() prints the summary itself and returns None; the print() around it
# only added a "None" line.
df_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           608172 non-null  object        
 1   Title                        608172 non-null  object        
 2   Type                         608172 non-null  object        
 3   StartDate                    608172 non-null  datetime64[ns]
 4   EndDate                      608172 non-null  object        
 5   TextWordCount                608172 non-null  object        
 6   PubTitle                     608172 non-null  object        
 7   SourceType                   608172 non-null  object        
 8   Year                         608172 non-null  float64       
 9   Month                        608172 non-null  float64       
 10  Newspaper                    608172 non-null  object        
 11  RegSentsExpand            

In [29]:
# Get monthly count: number of non-null IDs per newspaper, year and month,
# ordered by those same keys
monthly_keys = ['Newspaper', 'Year', 'Month']
df_monthly = (
    df_reg.groupby(monthly_keys)['ID']
    .count()
    .reset_index(name="RegRelevantCount")
    .sort_values(monthly_keys)
    .reset_index(drop=True)
)
print(df_monthly)

                Newspaper    Year  Month  RegRelevantCount
0            Boston Globe  1985.0    1.0                83
1            Boston Globe  1985.0    2.0                91
2            Boston Globe  1985.0    3.0               103
3            Boston Globe  1985.0    4.0                92
4            Boston Globe  1985.0    5.0               112
...                   ...     ...    ...               ...
3052  Wall Street Journal  2021.0    8.0               433
3053  Wall Street Journal  2021.0    9.0               464
3054  Wall Street Journal  2021.0   10.0               503
3055  Wall Street Journal  2021.0   11.0               478
3056  Wall Street Journal  2021.0   12.0               488

[3057 rows x 4 columns]


In [30]:
# Write the monthly counts per newspaper to CSV, without the index column
df_monthly.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegRelevant_MonthlyArticleCount_Dec2021.csv',index=False)

### 3.2.1. Original coverage

In [33]:
# Row counts per publication title (PubTitle) in df_reg
print(df_reg['PubTitle'].value_counts())

Wall Street Journal                        99838
New York Times (Online)                    89010
New York Times                             75616
Wall Street Journal (Online)               69036
The Washington Post                        41440
Los Angeles Times                          36955
Los Angeles Times (pre-1997 Fulltext)      36798
Chicago Tribune                            28868
Boston Globe                               27113
Chicago Tribune (pre-1997 Fulltext)        23236
The Washington Post (pre-1997 Fulltext)    20734
Boston Globe (pre-1997 Fulltext)           13366
USA TODAY                                  11175
The Washington Post (Online)               10531
USA TODAY (pre-1997 Fulltext)               6079
USA Today (Online)                          5259
Los Angeles Times (Online)                  4685
Boston Globe (Online)                       4383
Chicago Tribune (Online)                    4050
Name: PubTitle, dtype: int64


In [34]:
# Drop the rows of the four listed "(Online)" publication titles, then
# re-check the remaining counts per title
excluded_pub_titles = [
    'Chicago Tribune (Online)',
    'Los Angeles Times (Online)',
    'New York Times (Online)',
    'The Washington Post (Online)',
]
keep_rows = ~df_reg['PubTitle'].isin(excluded_pub_titles)
df_reg = df_reg[keep_rows].reset_index(drop=True)
print(df_reg['PubTitle'].value_counts())

Wall Street Journal                        99838
New York Times                             75616
Wall Street Journal (Online)               69036
The Washington Post                        41440
Los Angeles Times                          36955
Los Angeles Times (pre-1997 Fulltext)      36798
Chicago Tribune                            28868
Boston Globe                               27113
Chicago Tribune (pre-1997 Fulltext)        23236
The Washington Post (pre-1997 Fulltext)    20734
Boston Globe (pre-1997 Fulltext)           13366
USA TODAY                                  11175
USA TODAY (pre-1997 Fulltext)               6079
USA Today (Online)                          5259
Boston Globe (Online)                       4383
Name: PubTitle, dtype: int64


In [35]:
# Get monthly count on the filtered frame: number of non-null IDs per
# newspaper, year and month, ordered by those same keys
monthly_keys = ['Newspaper', 'Year', 'Month']
df_monthly2 = (
    df_reg.groupby(monthly_keys)['ID']
    .count()
    .reset_index(name="RegRelevantCount")
    .sort_values(monthly_keys)
    .reset_index(drop=True)
)
print(df_monthly2)

                Newspaper    Year  Month  RegRelevantCount
0            Boston Globe  1985.0    1.0                83
1            Boston Globe  1985.0    2.0                91
2            Boston Globe  1985.0    3.0               103
3            Boston Globe  1985.0    4.0                92
4            Boston Globe  1985.0    5.0               112
...                   ...     ...    ...               ...
3052  Wall Street Journal  2021.0    8.0               433
3053  Wall Street Journal  2021.0    9.0               464
3054  Wall Street Journal  2021.0   10.0               503
3055  Wall Street Journal  2021.0   11.0               478
3056  Wall Street Journal  2021.0   12.0               488

[3057 rows x 4 columns]


In [36]:
# Write the monthly counts computed after the PubTitle filter to CSV,
# without the index column
df_monthly2.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegRelevant_MonthlyArticleCount_Dec2021_OriginalCoverage.csv',index=False)

## 3.3 Noun Chunk Occurrences across Regulation-related Articles

In [41]:
# Reload the frame with noun-chunk match columns.
# NOTE: read_pickle can execute code embedded in the file; only load
# pickles that this project produced itself.
df_regSentsExpand = pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegSentsExpand_NounChunks.pkl')
# info() prints the summary itself and returns None; the print() around it
# only added a "None" line.
df_regSentsExpand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990262 entries, 0 to 990261
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           990262 non-null  object        
 1   Title                        990262 non-null  object        
 2   Type                         990262 non-null  object        
 3   StartDate                    990262 non-null  datetime64[ns]
 4   EndDate                      990262 non-null  object        
 5   TextWordCount                990262 non-null  object        
 6   PubTitle                     990262 non-null  object        
 7   SourceType                   990262 non-null  object        
 8   Year                         990262 non-null  float64       
 9   Month                        990262 non-null  float64       
 10  Newspaper                    990262 non-null  object        
 11  RegSentsExpand            

In [42]:
# Regulation-related articles: keep rows with at least one noun-chunk match
has_match = df_regSentsExpand['NounChunkMatchFiltered'] > 0
df_reg = df_regSentsExpand[has_match].reset_index(drop=True)
# info() prints the summary itself and returns None; the print() around it
# only added a "None" line.
df_reg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           608172 non-null  object        
 1   Title                        608172 non-null  object        
 2   Type                         608172 non-null  object        
 3   StartDate                    608172 non-null  datetime64[ns]
 4   EndDate                      608172 non-null  object        
 5   TextWordCount                608172 non-null  object        
 6   PubTitle                     608172 non-null  object        
 7   SourceType                   608172 non-null  object        
 8   Year                         608172 non-null  float64       
 9   Month                        608172 non-null  float64       
 10  Newspaper                    608172 non-null  object        
 11  RegSentsExpand            

In [None]:
# Collect every matched noun chunk, lower-cased, across all rows of df_reg.
# A single flattening comprehension replaces the repeated
# `allMatchWords = allMatchWords + ...`, which copied the whole accumulated
# list again for every row (quadratic time in the number of matches).
allMatchWords = [
    nc.lower()
    for nc_list in df_reg['NounChunkMatchWordsFiltered']
    for nc in nc_list
]
# Guard the preview so that an empty result does not raise IndexError.
print(len(allMatchWords), allMatchWords[0] if allMatchWords else None)

In [None]:
# Count each noun chunk
allMatchWordsCount = Counter(allMatchWords)
# Show the number of distinct chunks and only the 20 most frequent ones;
# printing the whole Counter would dump one entry per distinct chunk.
print(len(allMatchWordsCount), allMatchWordsCount.most_common(20))

In [None]:
# Convert to dataframe: one row per noun chunk with its total count.
# list(...) hands the constructor an ordinary list of (chunk, count) pairs
# rather than relying on dict-view support in pd.DataFrame.
df_MatchWords = pd.DataFrame(list(allMatchWordsCount.items()), columns=['Noun Chunks', 'Occurences'])
# info() prints the summary itself and returns None; the print() around it
# only added a "None" line.
df_MatchWords.info()

In [None]:
# Order the chunks from most to least frequent, renumber the rows, and
# show the top ten
df_MatchWords = (
    df_MatchWords
    .sort_values(by='Occurences', ascending=False)
    .reset_index(drop=True)
)
print(df_MatchWords.head(10))

In [None]:
# Export noun chunk occurrences to CSV, without the index column
df_MatchWords.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegSentsExpand_FilteredNounChunkOccurences.csv',index=False)