In [1]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

## 1. Import reg-relevant article sentiment scores

In [2]:
# Load the article sentiment scores CSV into the working dataframe.
df=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_ArticleSentimentScores.csv')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 63 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID                       493418 non-null  int64  
 1   StartDate                493418 non-null  object 
 2   Newspaper                493418 non-null  object 
 3   UncertaintyScore         493418 non-null  float64
 4   GIscore                  493418 non-null  float64
 5   LMscore                  493418 non-null  float64
 6   LSDscore                 493418 non-null  float64
 7   DominantNounChunkArea1   493418 non-null  int64  
 8   DominantArea1            493418 non-null  int64  
 9   UniqueDistinctArea1      493418 non-null  int64  
 10  DominantDistinctArea1    493418 non-null  int64  
 11  DominantNounChunkArea2   493418 non-null  int64  
 12  DominantArea2            493418 non-null  int64  
 13  UniqueDistinctArea2      493418 non-null  int64  
 14  Domi

In [3]:
# Reformat data: parse the date once, then derive the calendar fields from it
start_dates=df['StartDate'].astype('datetime64[ns]')
df['StartDate']=start_dates
df['Year']=start_dates.dt.year
df['Month']=start_dates.dt.month
# Newspaper is stored as a categorical column
df['Newspaper']=df['Newspaper'].astype('category')

In [4]:
# Create year-month dataframe: one row per distinct (Year, Month), in calendar order
df_ym=(
    df[['Year','Month']]
    .drop_duplicates()
    .sort_values(['Year','Month'])
    .reset_index(drop=True)
)
# YM is the 1-based position of the month in that ordering, stored as a string
month_position=pd.Series(df_ym.index+1,index=df_ym.index)
df_ym['YM']=month_position.astype('str')
print(df_ym,'\n',len(df_ym))

     Year  Month   YM
0    1985      1    1
1    1985      2    2
2    1985      3    3
3    1985      4    4
4    1985      5    5
..    ...    ...  ...
423  2020      4  424
424  2020      5  425
425  2020      6  426
426  2020      7  427
427  2020      8  428

[428 rows x 3 columns] 
 428


In [5]:
# Merge year-month dataframe: attach the YM label to every article row.
# validate='many_to_one' makes the merge fail loudly if the lookup ever holds a
# duplicated (Year, Month) key, which would otherwise silently duplicate articles.
df=(
    df.merge(df_ym[['Year','Month','YM']],on=['Year','Month'],how='left',validate='many_to_one')
    .sort_values(['Year','Month'])
    .reset_index(drop=True)
)
# info() prints its own report and returns None; print(df.info()) would add a "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 66 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   ID                       493418 non-null  int64         
 1   StartDate                493418 non-null  datetime64[ns]
 2   Newspaper                493418 non-null  category      
 3   UncertaintyScore         493418 non-null  float64       
 4   GIscore                  493418 non-null  float64       
 5   LMscore                  493418 non-null  float64       
 6   LSDscore                 493418 non-null  float64       
 7   DominantNounChunkArea1   493418 non-null  int64         
 8   DominantArea1            493418 non-null  int64         
 9   UniqueDistinctArea1      493418 non-null  int64         
 10  DominantDistinctArea1    493418 non-null  int64         
 11  DominantNounChunkArea2   493418 non-null  int64         
 12  DominantArea2   

## 2. Estimate categorical sentiment indexes

In [6]:
# Give the uncertainty column the same lower-case "score" suffix as GIscore,
# LMscore and LSDscore; the estimation functions split column names on 'score'.
df=df.rename({'UncertaintyScore':'Uncertaintyscore'},axis='columns')

In [7]:
# Plain Python list of every year-month label held in df_ym
YM_list=list(df_ym['YM'])

In [8]:
# Define a function (suppressing constant) to estimate categorical index
def estimate_categorical_index(score, area):
    """Estimate a monthly index for one area from year-month fixed effects.

    Rows of the module-level ``df`` whose ``area`` column equals 1 are regressed
    on year-month and newspaper dummies with the intercept suppressed; the
    year-month coefficients are returned as the index.

    Parameters
    ----------
    score : str
        Name of the dependent-variable column in ``df``. The text before
        ``'score'`` becomes the prefix of the output column name.
    area : str
        Name of the 0/1 indicator column used to select rows.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'<prefix>_<area>'`` holding the coefficients and ``'YM'``
        holding the year-month label each coefficient belongs to.
    """
    df_area=df[df[area]==1].reset_index(drop=True)
    FE_OLS=sm.ols(formula=score + ' ~ 0+C(YM)+C(Newspaper)', data=df_area).fit()

    new_var=score.split('score')[0]+'_'+area

    # Keep the leading len(df_ym) parameters (explicitly positional). Any of them
    # that are not year-month terms are removed by the label filter below.
    FE_estimates=pd.DataFrame()
    FE_estimates[new_var]=FE_OLS.params.iloc[0:len(df_ym)]
    FE_estimates=FE_estimates.reset_index().rename(columns={'index':'FE'})

    # Parameter names look like 'C(YM)[12]'; pull out the text between the brackets.
    FE_estimates['YM']=FE_estimates['FE'].str.split("[",expand=True)[1].str.split("]",expand=True)[0]

    # One vectorized membership test instead of re-filtering the frame once per
    # unwanted value.
    FE_estimates=FE_estimates[FE_estimates['YM'].isin(YM_list)]
    FE_estimates=FE_estimates.drop('FE',axis=1)

    return FE_estimates

In [9]:
# List of columns for different classification approaches
area_range=15  # exclusive upper bound: area numbers run from 1 to area_range-1
area_prefixes=['DominantNounChunkArea','DominantArea','UniqueDistinctArea','DominantDistinctArea']
# For each area number, emit the four classification columns in a fixed order
area_list=[prefix+str(i) for i in range(1,area_range) for prefix in area_prefixes]

In [10]:
# Define another function (with constant) to estimate categorical index
def estimate_categorical_index_constant(score, area):
    """Estimate a monthly index for one area from a model that keeps the intercept.

    Rows of the module-level ``df`` whose ``area`` column equals 1 are regressed
    on year-month and newspaper dummies with an intercept. The intercept is
    reported as the value of the omitted (baseline) year-month, and every other
    year-month is reported as its coefficient plus the intercept.

    Parameters
    ----------
    score : str
        Name of the dependent-variable column in ``df``. The text before
        ``'score'`` becomes the prefix of the output column name.
    area : str
        Name of the 0/1 indicator column used to select rows.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'<prefix>_<area>'`` holding the index values and ``'YM'``
        holding the year-month label, with the baseline month as the first row.
    """
    df_area=df[df[area]==1].reset_index(drop=True)
    FE_OLS=sm.ols(formula=score + ' ~ C(YM)+C(Newspaper)', data=df_area).fit()

    new_var=score.split('score')[0]+'_'+area
    params=FE_OLS.params
    intercept=params['Intercept']

    # Select the year-month terms by name. The earlier version filtered on a 'YM'
    # column that had not been created yet and raised KeyError whenever a
    # newspaper term fell inside its positional slice.
    ym_prefix='C(YM)[T.'
    ym_params=params[params.index.str.startswith(ym_prefix)]
    # 'C(YM)[T.12]' -> '12'
    ym_levels=ym_params.index.str[len(ym_prefix):-1]

    # The baseline month is the one present in the data but without its own
    # coefficient. It is '1' only when the subset has articles in the first month,
    # so it is derived rather than hard-coded; '1' stays as the fallback label.
    omitted=sorted(set(df_area['YM'])-set(ym_levels))
    baseline=omitted[0] if omitted else '1'

    FE_estimates=pd.DataFrame({
        new_var:[intercept]+(ym_params+intercept).tolist(),
        'YM':[baseline]+ym_levels.tolist(),
    })

    return FE_estimates

In [11]:
# Categorical Uncertainty Index: one column per area, one row per year-month
CategoricalUncertaintyIndex=df_ym
for area in area_list:
    try:
        estimates=estimate_categorical_index('Uncertaintyscore', area)
    except Exception as err:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt and
        # SystemExit still stop the run; report why the no-constant model failed
        # before falling back to the model with a constant.
        print("Failed:",area,"-",repr(err))
        estimates=estimate_categorical_index_constant('Uncertaintyscore', area)
    # Left merge keeps every month; months without an estimate for this area are NaN.
    CategoricalUncertaintyIndex=CategoricalUncertaintyIndex.merge(estimates,on='YM',how='left')

In [12]:
# info() prints its own report and returns None; wrapping it in print() adds a "None" line.
CategoricalUncertaintyIndex.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428 entries, 0 to 427
Data columns (total 59 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Year                                 428 non-null    int64  
 1   Month                                428 non-null    int64  
 2   YM                                   428 non-null    object 
 3   Uncertainty_DominantNounChunkArea1   428 non-null    float64
 4   Uncertainty_DominantArea1            428 non-null    float64
 5   Uncertainty_UniqueDistinctArea1      428 non-null    float64
 6   Uncertainty_DominantDistinctArea1    428 non-null    float64
 7   Uncertainty_DominantNounChunkArea2   428 non-null    float64
 8   Uncertainty_DominantArea2            428 non-null    float64
 9   Uncertainty_UniqueDistinctArea2      428 non-null    float64
 10  Uncertainty_DominantDistinctArea2    428 non-null    float64
 11  Uncertainty_DominantNounChunkAre

In [13]:
# Preview the first rows of the index for areas 1 and 2 under two classification approaches
preview_columns=['Year','Month','Uncertainty_DominantArea1','Uncertainty_DominantDistinctArea1',
                 'Uncertainty_DominantArea2','Uncertainty_DominantDistinctArea2']
print(CategoricalUncertaintyIndex.loc[:,preview_columns].head())

   Year  Month  Uncertainty_DominantArea1  Uncertainty_DominantDistinctArea1  \
0  1985      1                   0.708625                           0.673390   
1  1985      2                   0.530314                           0.586184   
2  1985      3                   0.685743                           0.651077   
3  1985      4                   0.616541                           0.597864   
4  1985      5                   0.663612                           0.546952   

   Uncertainty_DominantArea2  Uncertainty_DominantDistinctArea2  
0                   0.548945                           0.635044  
1                   0.677661                           0.668489  
2                   0.704475                           0.717031  
3                   0.621052                           0.613644  
4                   0.622346                           0.626820  


In [14]:
# Write the monthly categorical uncertainty index to CSV without the row index
uncertainty_index_path='/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_MonthlyUncertaintyIndex.csv'
CategoricalUncertaintyIndex.to_csv(uncertainty_index_path,index=False)

In [15]:
# Categorical sentiment indexes: build and save one monthly index file per lexicon.
# The loop variable is named `lexicon` rather than `dict`, which would shadow the
# builtin for the rest of the kernel session.
for lexicon in ['GI','LM','LSD']:
    CategoricalSentimentIndex=df_ym
    for area in area_list:
        try:
            estimates=estimate_categorical_index(lexicon+'score', area)
        except Exception as err:
            # Catch Exception rather than using a bare except, so KeyboardInterrupt
            # and SystemExit still stop the run; report why the no-constant model
            # failed before falling back to the model with a constant.
            print("Failed:",lexicon+":"+area,"-",repr(err))
            estimates=estimate_categorical_index_constant(lexicon+'score', area)
        # Left merge keeps every month; months without an estimate for this area are NaN.
        CategoricalSentimentIndex=CategoricalSentimentIndex.merge(estimates,on='YM',how='left')
    CategoricalSentimentIndex.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_Monthly'+lexicon+'Index.csv',index=False)

In [16]:
# Preview the first rows of the last index built by the loop above (LSD).
# NOTE(review): the two LSD columns are listed twice, so the preview repeats them;
# a second area was probably intended for the last pair - confirm.
lsd_preview_columns=['Year','Month','LSD_DominantArea8','LSD_DominantDistinctArea8',
                     'LSD_DominantArea8','LSD_DominantDistinctArea8']
print(CategoricalSentimentIndex.loc[:,lsd_preview_columns].head())

   Year  Month  LSD_DominantArea8  LSD_DominantDistinctArea8  \
0  1985      1           0.514461                   0.392215   
1  1985      2           0.491890                   0.684705   
2  1985      3           0.073573                  -0.478702   
3  1985      4           0.194892                   0.715342   
4  1985      5           0.487910                   0.665488   

   LSD_DominantArea8  LSD_DominantDistinctArea8  
0           0.514461                   0.392215  
1           0.491890                   0.684705  
2           0.073573                  -0.478702  
3           0.194892                   0.715342  
4           0.487910                   0.665488  
