In [1]:
import pandas as pd
import os
import datetime
import pickle
import re
import time
from collections import Counter
import numpy as np
from ast import literal_eval

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

## 1. Import Regulatory Sections and Noun Chunks with Areas

In [3]:
# Noun chunks with areas
nounchunks_area=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/DictionaryOfRegulatoryNounChunks.csv')
print(nounchunks_area.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10645 entries, 0 to 10644
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   nc_code      10645 non-null  int64 
 1   noun_chunks  10645 non-null  object
 2   rin          10645 non-null  object
 3   area         10185 non-null  object
 4   area_no      10645 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 415.9+ KB
None


In [4]:
nounchunks_area.head()

Unnamed: 0,nc_code,noun_chunks,rin,area,area_no
0,0,180-day exclusivity,0910-AC11,{1},1
1,1,1983 amendment,2115-AB72,{2},1
2,2,1988 amendment,1205-AB05,{4},1
3,3,1990 farm bill,0584-AB28,{1},1
4,4,1993 provision,"0970-AB32,3206-AG31",{1},1


In [5]:
# Convert to dictionary
nounchunks_area=nounchunks_area[nounchunks_area['area_no']>0].set_index('noun_chunks')
nounchunks_area_dict=nounchunks_area.to_dict()['area']
print(len(nounchunks_area_dict))

10185


In [6]:
# Expanded reg sentences with matched noun chunks
df_regSentsExpand=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegSentsExpand_NounChunks.pkl')
print(df_regSentsExpand.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990262 entries, 0 to 990261
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           990262 non-null  object        
 1   Title                        990262 non-null  object        
 2   Type                         990262 non-null  object        
 3   StartDate                    990262 non-null  datetime64[ns]
 4   EndDate                      990262 non-null  object        
 5   TextWordCount                990262 non-null  object        
 6   PubTitle                     990262 non-null  object        
 7   SourceType                   990262 non-null  object        
 8   Year                         990262 non-null  float64       
 9   Month                        990262 non-null  float64       
 10  Newspaper                    990262 non-null  object        
 11  RegSentsExpand            

In [7]:
# Refine to reg relevant articles
df_regSentsExpandRelevant=df_regSentsExpand[df_regSentsExpand['NounChunkMatchFiltered']>0].reset_index(drop=True)
print(df_regSentsExpandRelevant.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           608172 non-null  object        
 1   Title                        608172 non-null  object        
 2   Type                         608172 non-null  object        
 3   StartDate                    608172 non-null  datetime64[ns]
 4   EndDate                      608172 non-null  object        
 5   TextWordCount                608172 non-null  object        
 6   PubTitle                     608172 non-null  object        
 7   SourceType                   608172 non-null  object        
 8   Year                         608172 non-null  float64       
 9   Month                        608172 non-null  float64       
 10  Newspaper                    608172 non-null  object        
 11  RegSentsExpand            

## 2. Link Expanded Reg Sentences to Areas

### Approach 4: dominant distinct area (dda): use the dominant areas from area-specific noun chunks (approach adopted in paper)

In [20]:
# Get all areas associated with area-specific noun chunks
df_regSentsExpandRelevant['AllDistinctAreas']=''
for i in range(0, len(df_regSentsExpandRelevant)):
    nounchunks=df_regSentsExpandRelevant['NounChunkMatchWordsFiltered'][i]
    area_list=[]
    for nc in nounchunks:
        if nc in nounchunks_area_dict:
            area=sorted(literal_eval(nounchunks_area_dict[nc]))
            if len(area)==1:
                area_list=area_list+area
    df_regSentsExpandRelevant['AllDistinctAreas'][i]=area_list

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [21]:
# Get the dominant area(s)
df_regSentsExpandRelevant['DistinctAreaCount']=''
df_regSentsExpandRelevant['DominantDistinctArea']=''
for i in range(0, len(df_regSentsExpandRelevant)):
    area_list=df_regSentsExpandRelevant['AllDistinctAreas'][i]
    area_count=Counter(area_list).most_common()
    dominant_area=[j for j in Counter(area_list).keys() if area_list.count(j)==max(Counter(area_list).values())]
    df_regSentsExpandRelevant['DistinctAreaCount'][i]=area_count
    df_regSentsExpandRelevant['DominantDistinctArea'][i]=dominant_area
print(df_regSentsExpandRelevant.info())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 18 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           608172 non-null  object        
 1   Title                        608172 non-null  object        
 2   Type                         608172 non-null  object        
 3   StartDate                    608172 non-null  datetime64[ns]
 4   EndDate                      608172 non-null  object        
 5   TextWordCount                608172 non-null  object        
 6   PubTitle                     608172 non-null  object        
 7   SourceType                   608172 non-null  object        
 8   Year                         608172 non-null  float64       
 9   Month                        608172 non-null  float64       
 10  Newspaper                    608172 non-null  object        
 11  RegSentsExpand            

In [24]:
print(df_regSentsExpandRelevant[['DistinctAreaCount','DominantDistinctArea','AllDistinctAreas']].head())

   DistinctAreaCount DominantDistinctArea AllDistinctAreas
0                 []                   []               []
1  [(10, 2), (1, 1)]                 [10]      [1, 10, 10]
2           [(6, 1)]                  [6]              [6]
3           [(7, 1)]                  [7]              [7]
4                 []                   []               []


### Create dummies for areas

In [28]:
area_range=15
for i in range(1,area_range):
    var='DominantDistinctArea'+str(i)
    df_regSentsExpandRelevant[var]=0
    for j in range(0, len(df_regSentsExpandRelevant)):
        if i in df_regSentsExpandRelevant['DominantDistinctArea'][j]:
            df_regSentsExpandRelevant[var][j]=1
print(df_regSentsExpandRelevant.info())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 32 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           608172 non-null  object        
 1   Title                        608172 non-null  object        
 2   Type                         608172 non-null  object        
 3   StartDate                    608172 non-null  datetime64[ns]
 4   EndDate                      608172 non-null  object        
 5   TextWordCount                608172 non-null  object        
 6   PubTitle                     608172 non-null  object        
 7   SourceType                   608172 non-null  object        
 8   Year                         608172 non-null  float64       
 9   Month                        608172 non-null  float64       
 10  Newspaper                    608172 non-null  object        
 11  RegSentsExpand            

## 3. Merge with sentiment scores

In [29]:
# Merge with sentiment scores
sentimentScores=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegRelevant_ArticleSentimentScores.csv')
print(sentimentScores.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                608172 non-null  int64  
 1   StartDate         608172 non-null  object 
 2   Newspaper         608172 non-null  object 
 3   PubTitle          608172 non-null  object 
 4   UncertaintyScore  608172 non-null  float64
 5   GIscore           608172 non-null  float64
 6   LMscore           608172 non-null  float64
 7   LSDscore          608172 non-null  float64
dtypes: float64(4), int64(1), object(3)
memory usage: 37.1+ MB
None


In [30]:
# Merge
sentimentScores['ID']=sentimentScores['ID'].astype('str')
df=df_regSentsExpandRelevant.merge(sentimentScores[['ID','UncertaintyScore','GIscore','LMscore','LSDscore']],on='ID',how='left')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 608172 entries, 0 to 608171
Data columns (total 36 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           608172 non-null  object        
 1   Title                        608172 non-null  object        
 2   Type                         608172 non-null  object        
 3   StartDate                    608172 non-null  datetime64[ns]
 4   EndDate                      608172 non-null  object        
 5   TextWordCount                608172 non-null  object        
 6   PubTitle                     608172 non-null  object        
 7   SourceType                   608172 non-null  object        
 8   Year                         608172 non-null  float64       
 9   Month                        608172 non-null  float64       
 10  Newspaper                    608172 non-null  object        
 11  RegSentsExpand            

In [31]:
df.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegArea_ArticleSentimentScores.csv',index=False)

## 4. Monthly article counts by area

In [None]:
df=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegArea_ArticleSentimentScores.csv',
               converters={'DominantDistinctArea': pd.eval})
print(df.info())

In [18]:
# Area count for each article
dda_list=df['DominantDistinctArea'].tolist()
area_counts=[]
for area in dda_list:
    count=len(area)
    area_counts.append(count)
df['DominantDistinctAreaCount']=area_counts

In [21]:
print(df[['DominantDistinctArea','DominantDistinctAreaCount']].tail())

       DominantDistinctArea  DominantDistinctAreaCount
608167                   []                          0
608168                  [3]                          1
608169                  [1]                          1
608170                   []                          0
608171                  [1]                          1


In [20]:
# Total article count
print("# of articles with an area classification:",len(df[df['DominantDistinctAreaCount']>0]))

# of articles with an area classification: 424791


In [5]:
# List of columns for different approaches
area_range=15    # Number of areas + 1
col_list=[]
for i in range(1,area_range):
    var='DominantDistinctArea'+str(i)
    col_list.append(var)
print(col_list)

['DominantDistinctArea1', 'DominantDistinctArea2', 'DominantDistinctArea3', 'DominantDistinctArea4', 'DominantDistinctArea5', 'DominantDistinctArea6', 'DominantDistinctArea7', 'DominantDistinctArea8', 'DominantDistinctArea9', 'DominantDistinctArea10', 'DominantDistinctArea11', 'DominantDistinctArea12', 'DominantDistinctArea13', 'DominantDistinctArea14']


In [6]:
# Aggregate monthly article counts
monthlyAreaCount=df[['Newspaper','Year','Month']+col_list].groupby(['Newspaper','Year','Month']).agg('sum').reset_index()
print(monthlyAreaCount.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3057 entries, 0 to 3056
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Newspaper               3057 non-null   object 
 1   Year                    3057 non-null   float64
 2   Month                   3057 non-null   float64
 3   DominantDistinctArea1   3057 non-null   int64  
 4   DominantDistinctArea2   3057 non-null   int64  
 5   DominantDistinctArea3   3057 non-null   int64  
 6   DominantDistinctArea4   3057 non-null   int64  
 7   DominantDistinctArea5   3057 non-null   int64  
 8   DominantDistinctArea6   3057 non-null   int64  
 9   DominantDistinctArea7   3057 non-null   int64  
 10  DominantDistinctArea8   3057 non-null   int64  
 11  DominantDistinctArea9   3057 non-null   int64  
 12  DominantDistinctArea10  3057 non-null   int64  
 13  DominantDistinctArea11  3057 non-null   int64  
 14  DominantDistinctArea12  3057 non-null   

In [7]:
monthlyAreaCount.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegArea_MonthlyArticleCountByNewspaper.csv',index=False)

## 5. Filtered Noun Chunk Occurences by Area

In [29]:
# Reg relevant articles
df=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegArea_ArticleSentimentScores.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 36 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   ID                           608172 non-null  int64  
 1   Title                        608172 non-null  object 
 2   Type                         608172 non-null  object 
 3   StartDate                    608172 non-null  object 
 4   EndDate                      608172 non-null  object 
 5   TextWordCount                608172 non-null  int64  
 6   PubTitle                     608172 non-null  object 
 7   SourceType                   608172 non-null  object 
 8   Year                         608172 non-null  float64
 9   Month                        608172 non-null  float64
 10  Newspaper                    608172 non-null  object 
 11  RegSentsExpand               608172 non-null  object 
 12  RegSentExpandLength          608172 non-null  int64  
 13 

In [30]:
# Filtered noun chunk occurences across regulation-related articles
df_nounchunk_occurences=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegSentsExpand_FilteredNounChunkOccurences.csv')
print(df_nounchunk_occurences.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10458 entries, 0 to 10457
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Noun Chunks  10458 non-null  object
 1   Occurences   10458 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 163.5+ KB
None


In [31]:
# An example
print(df[df['DominantDistinctArea1']==1]['NounChunkMatchWordsFiltered'])

15        ['Environmental Quality', 'new standard', 'env...
16                                             ['new drug']
19                          ['poison ivy', 'Public Health']
35        ['Federal Reserve', 'other security', 'federal...
36                        ['retail store', 'security firm']
                                ...                        
608146    ['suspicious order', 'witness testimony', 'dis...
608147    ['electronic health record', 'regulatory burden']
608161                ['research grant', 'federal funding']
608169    ['suspicious order', 'witness testimony', 'dis...
608171                                    ['public health']
Name: NounChunkMatchWordsFiltered, Length: 68291, dtype: object


In [35]:
print(literal_eval(df[df['DominantDistinctArea1']==1]['NounChunkMatchWordsFiltered'][15].lower()))

['environmental quality', 'new standard', 'environmental policy', 'agency regulation', 'new standard', 'wetland regulation']


In [36]:
# Filtered noun chunks across regulation-related articles by area
for i in range(1,15):
    allMatchWords=[]
    for list in df[df['DominantDistinctArea'+str(i)]==1]['NounChunkMatchWordsFiltered']:
        allMatchWords=allMatchWords+literal_eval(list.lower())
    allMatchWordsCount=Counter(allMatchWords)
    var_name='Occurences_dda'+str(i)
    df_MatchWords = pd.DataFrame(allMatchWordsCount.items(),columns = ['Noun Chunks',var_name])
    df_nounchunk_occurences=df_nounchunk_occurences.merge(df_MatchWords,on='Noun Chunks',how='outer')
print(df_nounchunk_occurences.head())

          Noun Chunks  Occurences  Occurences_dda1  Occurences_dda2  \
0      new regulation       36661           3690.0           2044.0   
1  federal regulation       26358           4375.0           1790.0   
2         health care       22508           5257.0           1524.0   
3     federal reserve       21068           1193.0            889.0   
4         real estate       20205           1161.0           1112.0   

   Occurences_dda3  Occurences_dda4  Occurences_dda5  Occurences_dda6  \
0          30395.0            631.0           4003.0            409.0   
1           2911.0            615.0           3428.0            624.0   
2           1758.0            529.0           1453.0            548.0   
3           1356.0            165.0           1362.0            118.0   
4           1213.0            233.0           1559.0            186.0   

   Occurences_dda7  Occurences_dda8  Occurences_dda9  Occurences_dda10  \
0           4724.0           1866.0            412.0        

In [37]:
df_nounchunk_occurences.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegArea_FilteredNounChunkOccurences.csv',index=False)