# Robustness Check: Remove General Terms from Top 100 Noun Chunks in Each Area

In [1]:
import pandas as pd
import os
import datetime
import pickle
import re
import time
from collections import Counter
import numpy as np

import nltk
nltk.data.path

['/home/ec2-user/nltk_data',
 '/home/ec2-user/SageMaker/.conda/envs/my_py/nltk_data',
 '/home/ec2-user/SageMaker/.conda/envs/my_py/share/nltk_data',
 '/home/ec2-user/SageMaker/.conda/envs/my_py/lib/nltk_data',
 '/usr/share/nltk_data',
 '/usr/local/share/nltk_data',
 '/usr/lib/nltk_data',
 '/usr/local/lib/nltk_data']

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [3]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [4]:
from ast import literal_eval
from collections import Counter

In [5]:
import statsmodels.formula.api as sm

## 1. Import Regulatory Sections and Noun Chunks with Areas

In [6]:
# Noun chunks with areas
nounchunks_area=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/DictionaryOfRegulatoryNounChunks.csv')
print(nounchunks_area.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10645 entries, 0 to 10644
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   nc_code      10645 non-null  int64 
 1   noun_chunks  10645 non-null  object
 2   rin          10645 non-null  object
 3   area         10185 non-null  object
 4   area_no      10645 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 415.9+ KB
None


In [7]:
nounchunks_area.head()

Unnamed: 0,nc_code,noun_chunks,rin,area,area_no
0,0,180-day exclusivity,0910-AC11,{1},1
1,1,1983 amendment,2115-AB72,{2},1
2,2,1988 amendment,1205-AB05,{4},1
3,3,1990 farm bill,0584-AB28,{1},1
4,4,1993 provision,"0970-AB32,3206-AG31",{1},1


In [8]:
# Remove area-specific general terms
nounchunks_remove=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/Top 100 Area-specific Noun Chunks Removed.csv')
print(nounchunks_remove.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1385 entries, 0 to 1384
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   noun_chunks      1385 non-null   object 
 1   area_no          1385 non-null   int64  
 2   area_name        1385 non-null   object 
 3   remove_dda_less  258 non-null    float64
 4   remove_dda_more  449 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 54.2+ KB
None


In [9]:
# Specify which removal appraoch to use
nounchunks_remove=nounchunks_remove.rename(columns={'remove_dda_less':'remove_dda'})
print(nounchunks_remove['remove_dda'].value_counts())

1.0    258
Name: remove_dda, dtype: int64


In [10]:
# Merge
nounchunks_area=nounchunks_area.merge(nounchunks_remove[['noun_chunks','remove_dda']],on='noun_chunks',how='left')
print(nounchunks_area.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10645 entries, 0 to 10644
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   nc_code      10645 non-null  int64  
 1   noun_chunks  10645 non-null  object 
 2   rin          10645 non-null  object 
 3   area         10185 non-null  object 
 4   area_no      10645 non-null  int64  
 5   remove_dda   258 non-null    float64
dtypes: float64(1), int64(2), object(3)
memory usage: 582.1+ KB
None


In [11]:
# Convert to dictionary
nounchunks_area=nounchunks_area[(nounchunks_area['area_no']>0) & (nounchunks_area['remove_dda']!=1)].set_index('noun_chunks')
nounchunks_area_dict=nounchunks_area.to_dict()['area']
print(len(nounchunks_area_dict))

9927


In [12]:
# Expanded reg sentences with matched noun chunks
df_regSentsExpand=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Reg Relevance/RegSentsExpand_NounChunks3.pkl')
print(df_regSentsExpand.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822737 entries, 0 to 822736
Data columns (total 6 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   ID                           822737 non-null  object
 1   RegSentsExpand               822737 non-null  object
 2   NounChunksMatch              822737 non-null  int64 
 3   NounChunkMatchWords          822737 non-null  object
 4   NounChunkMatchWordsFiltered  822737 non-null  object
 5   NounChunkMatchFiltered       822737 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 37.7+ MB
None


In [13]:
# Remove duplicated articles
IDs_nodup=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/IDs_no_duplicates.csv')
print(IDs_nodup.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788516 entries, 0 to 788515
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   ID      788516 non-null  int64
dtypes: int64(1)
memory usage: 6.0 MB
None


In [14]:
df_regSentsExpand['ID']=df_regSentsExpand['ID'].astype('int64')
df_regSentsExpand=IDs_nodup.merge(df_regSentsExpand,on='ID',how='left').reset_index(drop=True)
print(df_regSentsExpand.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 788516 entries, 0 to 788515
Data columns (total 6 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   ID                           788516 non-null  int64 
 1   RegSentsExpand               788516 non-null  object
 2   NounChunksMatch              788516 non-null  int64 
 3   NounChunkMatchWords          788516 non-null  object
 4   NounChunkMatchWordsFiltered  788516 non-null  object
 5   NounChunkMatchFiltered       788516 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 36.1+ MB
None


In [15]:
# Refine to reg relevant articles
df_regSentsExpandRelevant=df_regSentsExpand[df_regSentsExpand['NounChunkMatchFiltered']>0].reset_index(drop=True)
print(df_regSentsExpandRelevant.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 6 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   ID                           493418 non-null  int64 
 1   RegSentsExpand               493418 non-null  object
 2   NounChunksMatch              493418 non-null  int64 
 3   NounChunkMatchWords          493418 non-null  object
 4   NounChunkMatchWordsFiltered  493418 non-null  object
 5   NounChunkMatchFiltered       493418 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 22.6+ MB
None


## 2. Link Expanded Reg Sentences to Areas

### Approach 4: dominant distinct area (dda): use the dominant areas from area-specific noun chunks

In [16]:
df_regSentsExpandRelevant['AllDistinctAreas']=''
df_regSentsExpandRelevant['DistinctAreaCount']=''
df_regSentsExpandRelevant['DominantDistinctArea']=''
for i in range(0, len(df_regSentsExpandRelevant)):
    nounchunks=df_regSentsExpandRelevant['NounChunkMatchWordsFiltered'][i]
    area_list=[]
    for nc in nounchunks:
        if nc in nounchunks_area_dict:
            area=sorted(literal_eval(nounchunks_area_dict[nc]))
            if len(area)==1:
                area_list=area_list+area    
    
    area_count=Counter(area_list).most_common()
    dominant_area=[j for j in Counter(area_list).keys() if area_list.count(j)==max(Counter(area_list).values())]
    df_regSentsExpandRelevant['AllDistinctAreas'][i]=area_list
    df_regSentsExpandRelevant['DistinctAreaCount'][i]=area_count
    df_regSentsExpandRelevant['DominantDistinctArea'][i]=dominant_area
print(df_regSentsExpandRelevant.info())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 9 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   ID                           493418 non-null  int64 
 1   RegSentsExpand               493418 non-null  object
 2   NounChunksMatch              493418 non-null  int64 
 3   NounChunkMatchWords          493418 non-null  object
 4   NounChunkMatchWordsFiltered  493418 non-null  object
 5   NounChunkMatchFiltered       493418 non-null  int64 
 6   AllDistinctAreas             493418 non-null  object
 7   DistinctAreaCount            493418 non-null  object
 8   DominantDistinctArea         493418 non-null  object
dtypes: int64(3), object(6)
memory usage: 33.9+ MB
None


In [17]:
print(df_regSentsExpandRelevant.head())

           ID                                     RegSentsExpand  \
0   292620682  Now, one hopes, more attention will be paid to...   
1   307490698  His economic program as a candidate in the Rep...   
2   307420103  As the Bank of New England hearings suggest, t...   
3   307576786  The Justice Department contends that the Eight...   
4  1944990599  "The remittances from drivers alone reach almo...   

   NounChunksMatch                                NounChunkMatchWords  \
0                8  [land use, civil right, land use, land use, lo...   
1                2                                 [tax cut, tax cut]   
2               10  [new england, bank hold company, new england, ...   
3                4  [civil action, criminal activity, civil claim,...   
4                1                            [government regulation]   

                         NounChunkMatchWordsFiltered  NounChunkMatchFiltered  \
0  [land use, civil right, land use, land use, pu...                    

In [18]:
df_regSentsExpandRelevant.to_pickle('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegSentsExpand_NounChunks_Area_Robust1.pkl')

## 3. Filtered Noun Chunk Occurences by Area

In [19]:
# Reg relevant articles
df_regSentsExpandRelevant=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegSentsExpand_NounChunks_Area_Robust1.pkl')
print(df_regSentsExpandRelevant.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 9 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   ID                           493418 non-null  int64 
 1   RegSentsExpand               493418 non-null  object
 2   NounChunksMatch              493418 non-null  int64 
 3   NounChunkMatchWords          493418 non-null  object
 4   NounChunkMatchWordsFiltered  493418 non-null  object
 5   NounChunkMatchFiltered       493418 non-null  int64 
 6   AllDistinctAreas             493418 non-null  object
 7   DistinctAreaCount            493418 non-null  object
 8   DominantDistinctArea         493418 non-null  object
dtypes: int64(3), object(6)
memory usage: 33.9+ MB
None


In [20]:
# Filtered noun chunk occurences across regulation-related articles
df_nounchunk_occurences=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Reg Relevance/RegSentsExpand_FilteredNounChunkOccurences.csv')
print(df_nounchunk_occurences.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10645 entries, 0 to 10644
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Noun Chunks  10645 non-null  object
 1   Occurences   10645 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 166.5+ KB
None


In [21]:
# Dummies by dominant distinct area (dda)
for i in range(1,15):
    var='DominantDistinctArea'+str(i)
    df_regSentsExpandRelevant[var]=0
    for j in range(0, len(df_regSentsExpandRelevant)):
        if i in df_regSentsExpandRelevant['DominantDistinctArea'][j]:
            df_regSentsExpandRelevant[var][j]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [22]:
# Filtered noun chunks across regulation-related articles by area
for i in range(1,15):
    allDistinctAreas=[]
    for allArea in df_regSentsExpandRelevant[df_regSentsExpandRelevant['DominantDistinctArea'+str(i)]==1]['NounChunkMatchWordsFiltered']:
        distinctArea=[nc for nc in allArea if nc in nounchunks_area_dict]
        allDistinctAreas=allDistinctAreas+distinctArea
    allDistinctAreaCount=Counter(allDistinctAreas)
    var_name='Occurences_dda'+str(i)
    df_MatchWords = pd.DataFrame(allDistinctAreaCount.items(),columns = ['Noun Chunks',var_name])
    df_nounchunk_occurences=df_nounchunk_occurences.merge(df_MatchWords,on='Noun Chunks',how='outer')
print(df_nounchunk_occurences)

                       Noun Chunks  Occurences  Occurences_dda1  \
0                   new regulation       29880              NaN   
1               federal regulation       22168           3242.0   
2                      health care       17897           5083.0   
3                      real estate       17401            541.0   
4                  federal reserve       16590            189.0   
...                            ...         ...              ...   
10640  mercury contain thermometer           1              NaN   
10641                 abandon area           1              NaN   
10642                     deep gas           1              NaN   
10643               float facility           1              NaN   
10644              secondary payer           1              NaN   

       Occurences_dda2  Occurences_dda3  Occurences_dda4  Occurences_dda5  \
0                  NaN              NaN              NaN              NaN   
1               1271.0           1763.0  

In [23]:
df_nounchunk_occurences.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_FilteredNounChunkOccurences_Robust1.csv',index=False)

## 4. Aggregate sentiment scores

In [24]:
# Reg relevant articles
df_regSentsExpandRelevant=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegSentsExpand_NounChunks_Area_Robust1.pkl')
print(df_regSentsExpandRelevant.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 9 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   ID                           493418 non-null  int64 
 1   RegSentsExpand               493418 non-null  object
 2   NounChunksMatch              493418 non-null  int64 
 3   NounChunkMatchWords          493418 non-null  object
 4   NounChunkMatchWordsFiltered  493418 non-null  object
 5   NounChunkMatchFiltered       493418 non-null  int64 
 6   AllDistinctAreas             493418 non-null  object
 7   DistinctAreaCount            493418 non-null  object
 8   DominantDistinctArea         493418 non-null  object
dtypes: int64(3), object(6)
memory usage: 33.9+ MB
None


In [25]:
# All news data
allNews=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/parsed_xml.pkl')
print(allNews.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822737 entries, 0 to 822736
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   ID             822737 non-null  object        
 1   Title          822737 non-null  object        
 2   Type           822737 non-null  object        
 3   StartDate      822737 non-null  datetime64[ns]
 4   EndDate        822737 non-null  object        
 5   Text           822737 non-null  object        
 6   TextWordCount  822737 non-null  object        
 7   PubTitle       822737 non-null  object        
 8   SourceType     822737 non-null  object        
 9   Year           822737 non-null  int64         
 10  Month          822737 non-null  int64         
 11  Newspaper      822737 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(9)
memory usage: 75.3+ MB
None


In [26]:
# Merge with all news data
allNews['ID']=allNews['ID'].astype('int64')
df=df_regSentsExpandRelevant.merge(allNews[['ID','Title','Type','SourceType','StartDate','Newspaper','Year','Month']],
                                  on='ID',how='left')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 493418 entries, 0 to 493417
Data columns (total 16 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           493418 non-null  int64         
 1   RegSentsExpand               493418 non-null  object        
 2   NounChunksMatch              493418 non-null  int64         
 3   NounChunkMatchWords          493418 non-null  object        
 4   NounChunkMatchWordsFiltered  493418 non-null  object        
 5   NounChunkMatchFiltered       493418 non-null  int64         
 6   AllDistinctAreas             493418 non-null  object        
 7   DistinctAreaCount            493418 non-null  object        
 8   DominantDistinctArea         493418 non-null  object        
 9   Title                        493418 non-null  object        
 10  Type                         493418 non-null  object        
 11  SourceType                

In [27]:
# List of columns for different approaches
area_range=15    # Number of areas + 1
col_list=[]
for i in range(1,area_range):
    var='DominantDistinctArea'+str(i)
    col_list.append(var)

In [28]:
# Dummies by dominant distinct area (dda)
for i in range(1,area_range):
    var='DominantDistinctArea'+str(i)
    df[var]=0
    for j in range(0, len(df)):
        if i in df['DominantDistinctArea'][j]:
            df[var][j]=1
#print(df.info())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [29]:
# Merge with sentiment scores
sentimentScores=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_ArticleSentimentScores.csv')
print(sentimentScores.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                493418 non-null  int64  
 1   StartDate         493418 non-null  object 
 2   Newspaper         493418 non-null  object 
 3   UncertaintyScore  493418 non-null  float64
 4   GIscore           493418 non-null  float64
 5   LMscore           493418 non-null  float64
 6   LSDscore          493418 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 26.4+ MB
None


In [30]:
# Merge
df['ID']=df['ID'].astype('int64')
df2=df.merge(sentimentScores[['ID','UncertaintyScore','GIscore','LMscore','LSDscore']],on='ID',how='left')
#print(df2.info())

In [31]:
df2[['ID','StartDate','Newspaper','UncertaintyScore','GIscore','LMscore','LSDscore']+col_list].\
    to_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_ArticleSentimentScores_Robust1.csv',index=False)

## 5. Monthly article count by area

In [32]:
# Aggregate monthly article counts
monthlyAreaCount=df[['Newspaper','Year','Month']+col_list].groupby(['Newspaper','Year','Month']).agg('sum').reset_index()
print(monthlyAreaCount.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2945 entries, 0 to 2944
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Newspaper               2945 non-null   object
 1   Year                    2945 non-null   int64 
 2   Month                   2945 non-null   int64 
 3   DominantDistinctArea1   2945 non-null   int64 
 4   DominantDistinctArea2   2945 non-null   int64 
 5   DominantDistinctArea3   2945 non-null   int64 
 6   DominantDistinctArea4   2945 non-null   int64 
 7   DominantDistinctArea5   2945 non-null   int64 
 8   DominantDistinctArea6   2945 non-null   int64 
 9   DominantDistinctArea7   2945 non-null   int64 
 10  DominantDistinctArea8   2945 non-null   int64 
 11  DominantDistinctArea9   2945 non-null   int64 
 12  DominantDistinctArea10  2945 non-null   int64 
 13  DominantDistinctArea11  2945 non-null   int64 
 14  DominantDistinctArea12  2945 non-null   int64 
 15  Domi

In [33]:
monthlyAreaCount.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_MonthlyArticleCountByNewspaper_Robust1.csv',index=False)

## 6. Construct categorical sentiment indexes

In [34]:
df=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_ArticleSentimentScores_Robust1.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 21 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   ID                      493418 non-null  int64  
 1   StartDate               493418 non-null  object 
 2   Newspaper               493418 non-null  object 
 3   UncertaintyScore        493418 non-null  float64
 4   GIscore                 493418 non-null  float64
 5   LMscore                 493418 non-null  float64
 6   LSDscore                493418 non-null  float64
 7   DominantDistinctArea1   493418 non-null  int64  
 8   DominantDistinctArea2   493418 non-null  int64  
 9   DominantDistinctArea3   493418 non-null  int64  
 10  DominantDistinctArea4   493418 non-null  int64  
 11  DominantDistinctArea5   493418 non-null  int64  
 12  DominantDistinctArea6   493418 non-null  int64  
 13  DominantDistinctArea7   493418 non-null  int64  
 14  DominantDistinctArea

In [35]:
# Reformat data
df['StartDate']=df['StartDate'].astype('datetime64[ns]')
df['Year']=df['StartDate'].dt.year
df['Month']=df['StartDate'].dt.month
df['Newspaper']=df['Newspaper'].astype('category')
#print(df.info())

In [36]:
# Create year-month dataframe
df_ym=df[['Year','Month']].drop_duplicates().sort_values(['Year','Month']).reset_index(drop=True).reset_index()
df_ym['YM']=df_ym['index']+1
df_ym['YM']=df_ym['YM'].astype('str')
df_ym=df_ym.drop('index',axis=1)
print(df_ym,'\n',len(df_ym))

     Year  Month   YM
0    1985      1    1
1    1985      2    2
2    1985      3    3
3    1985      4    4
4    1985      5    5
..    ...    ...  ...
423  2020      4  424
424  2020      5  425
425  2020      6  426
426  2020      7  427
427  2020      8  428

[428 rows x 3 columns] 
 428


In [37]:
# Merge year-month dataframe
df=df.merge(df_ym[['Year','Month','YM']],on=['Year','Month'],how='left').sort_values(['Year','Month']).reset_index(drop=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 24 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   ID                      493418 non-null  int64         
 1   StartDate               493418 non-null  datetime64[ns]
 2   Newspaper               493418 non-null  category      
 3   UncertaintyScore        493418 non-null  float64       
 4   GIscore                 493418 non-null  float64       
 5   LMscore                 493418 non-null  float64       
 6   LSDscore                493418 non-null  float64       
 7   DominantDistinctArea1   493418 non-null  int64         
 8   DominantDistinctArea2   493418 non-null  int64         
 9   DominantDistinctArea3   493418 non-null  int64         
 10  DominantDistinctArea4   493418 non-null  int64         
 11  DominantDistinctArea5   493418 non-null  int64         
 12  DominantDistinctArea6   493418

In [38]:
df=df.rename(columns={'UncertaintyScore':'Uncertaintyscore'})
YM_list=df_ym['YM'].tolist()
#print(YM_list)

In [39]:
# Define a function (suppressing constant) to estimate categorical index
def estimate_categorical_index(score, area):
    df_area=df[df[area]==1].reset_index(drop=True)
    FE_OLS=sm.ols(formula=score + ' ~ 0+C(YM)+C(Newspaper)', data=df_area).fit()
    #print(FE_OLS.summary())

    FE_estimates=pd.DataFrame()
    new_var=score.split('score')[0]+'_'+area
    FE_estimates[new_var]=FE_OLS.params[0:len(df_ym)]
    FE_estimates=FE_estimates.reset_index().rename(columns={'index':'FE'})
    FE_estimates['YM']=FE_estimates['FE'].str.split("[",expand=True)[1].str.split("]",expand=True)[0]
    
    for value in FE_estimates['YM']:
        if value not in YM_list:
            FE_estimates=FE_estimates[FE_estimates['YM']!=value]
    FE_estimates=FE_estimates.drop('FE',axis=1)
    
    return FE_estimates

In [40]:
# List of columns for different classification approaches
area_range=15
area_list=[]
for i in range(1,area_range):
    var='DominantDistinctArea'+str(i)
    area_list.append(var)

In [41]:
# Categorical Uncertainty Index
CategoricalUncertaintyIndex=df_ym
for area in area_list:
    try:
        estimates=estimate_categorical_index('Uncertaintyscore', area)
        CategoricalUncertaintyIndex=CategoricalUncertaintyIndex.merge(estimates,on='YM',how='left')
    except:
        print("Failed:",area)

In [42]:
CategoricalUncertaintyIndex.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_MonthlyUncertaintyIndex_Robust1.csv',index=False)

In [43]:
# Categorical sentiment indexes
for dict in ['GI','LM','LSD']:
    CategoricalSentimentIndex=df_ym
    for area in area_list:
        try:
            estimates=estimate_categorical_index(dict+'score', area)
            CategoricalSentimentIndex=CategoricalSentimentIndex.merge(estimates,on='YM',how='left')
        except:
            print("Failed:",dict+":"+area)      
    CategoricalSentimentIndex.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_Monthly'+dict+'Index_Robust1.csv',index=False)