In [21]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm

## 1. Article-level sentiment scores by area

In [22]:
df=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegArea_ArticleSentimentScores.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 63 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   ID                       493418 non-null  int64  
 1   StartDate                493418 non-null  object 
 2   Newspaper                493418 non-null  object 
 3   UncertaintyScore         493418 non-null  float64
 4   GIscore                  493418 non-null  float64
 5   LMscore                  493418 non-null  float64
 6   LSDscore                 493418 non-null  float64
 7   DominantNounChunkArea1   493418 non-null  int64  
 8   DominantArea1            493418 non-null  int64  
 9   UniqueDistinctArea1      493418 non-null  int64  
 10  DominantDistinctArea1    493418 non-null  int64  
 11  DominantNounChunkArea2   493418 non-null  int64  
 12  DominantArea2            493418 non-null  int64  
 13  UniqueDistinctArea2      493418 non-null  int64  
 14  Domi

In [23]:
df['StartDate']=df['StartDate'].astype('datetime64[ns]')
df['Year']=df['StartDate'].dt.year
df['Month']=df['StartDate'].dt.month
df['Newspaper']=df['Newspaper'].astype('category')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 65 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   ID                       493418 non-null  int64         
 1   StartDate                493418 non-null  datetime64[ns]
 2   Newspaper                493418 non-null  category      
 3   UncertaintyScore         493418 non-null  float64       
 4   GIscore                  493418 non-null  float64       
 5   LMscore                  493418 non-null  float64       
 6   LSDscore                 493418 non-null  float64       
 7   DominantNounChunkArea1   493418 non-null  int64         
 8   DominantArea1            493418 non-null  int64         
 9   UniqueDistinctArea1      493418 non-null  int64         
 10  DominantDistinctArea1    493418 non-null  int64         
 11  DominantNounChunkArea2   493418 non-null  int64         
 12  DominantArea2   

In [24]:
df_ym=df[['Year','Month']].drop_duplicates().sort_values(['Year','Month']).reset_index(drop=True).reset_index()
df_ym['YM']=df_ym['index']+1
df_ym=df_ym.drop('index',axis=1)
print(df_ym)

     Year  Month   YM
0    1985      1    1
1    1985      2    2
2    1985      3    3
3    1985      4    4
4    1985      5    5
..    ...    ...  ...
423  2020      4  424
424  2020      5  425
425  2020      6  426
426  2020      7  427
427  2020      8  428

[428 rows x 3 columns]


In [25]:
df=df.merge(df_ym[['Year','Month','YM']],on=['Year','Month'],how='left')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 493418 entries, 0 to 493417
Data columns (total 66 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   ID                       493418 non-null  int64         
 1   StartDate                493418 non-null  datetime64[ns]
 2   Newspaper                493418 non-null  category      
 3   UncertaintyScore         493418 non-null  float64       
 4   GIscore                  493418 non-null  float64       
 5   LMscore                  493418 non-null  float64       
 6   LSDscore                 493418 non-null  float64       
 7   DominantNounChunkArea1   493418 non-null  int64         
 8   DominantArea1            493418 non-null  int64         
 9   UniqueDistinctArea1      493418 non-null  int64         
 10  DominantDistinctArea1    493418 non-null  int64         
 11  DominantNounChunkArea2   493418 non-null  int64         
 12  DominantArea2   

## 2. Compute indexes

In [26]:
# Function to estimate index (suppressing constant)
def estimate_index(dataset,var_name):
    FE_OLS=sm.ols(formula=var_name + ' ~ 0+C(YM)+C(Newspaper)',
        data=dataset).fit()
    print(FE_OLS.summary())

    FE_estimates=pd.DataFrame()
    FE_estimates[var_name+'Index']=FE_OLS.params[0:max(df['YM'])]
    FE_estimates=FE_estimates.reset_index().rename(columns={'index':'FE'})
    FE_estimates['YM']=FE_estimates['FE'].str.split("[",expand=True)[1].str.split("]",expand=True)[0].astype('int64')
    
    return FE_estimates

### Uncertainty index excluding Areas 3 (transportation) and 4 (labor and workforce)

In [27]:
df_no34=df[(df['DominantDistinctArea3']!=1) & (df['DominantDistinctArea4']!=1)].sort_values('StartDate')
print("Remaining:",len(df_no34), "Removed:", len(df)-len(df_no34))

Remaining: 430748 Removed: 62670


In [28]:
UncertaintyIndex=estimate_index(df_no34,'UncertaintyScore')

                            OLS Regression Results                            
Dep. Variable:       UncertaintyScore   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     5.470
Date:                Wed, 26 May 2021   Prob (F-statistic):          1.57e-262
Time:                        16:44:55   Log-Likelihood:            -5.8943e+05
No. Observations:              430748   AIC:                         1.180e+06
Df Residuals:                  430314   BIC:                         1.184e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [29]:
UncertaintyIndex

Unnamed: 0,FE,UncertaintyScoreIndex,YM
0,C(YM)[1],0.605650,1
1,C(YM)[2],0.597292,2
2,C(YM)[3],0.621604,3
3,C(YM)[4],0.647136,4
4,C(YM)[5],0.661479,5
...,...,...,...
423,C(YM)[424],0.789668,424
424,C(YM)[425],0.695858,425
425,C(YM)[426],0.672160,426
426,C(YM)[427],0.658026,427


### Sentiment indexes excluding Areas 3 (transportation), 7 (finance and banking), 8 (general business and trade)

In [30]:
df_no378=df[(df['DominantDistinctArea3']!=1) & (df['DominantDistinctArea7']!=1) & (df['DominantDistinctArea8']!=1)]
print("Remaining:",len(df_no378), "Removed:", len(df)-len(df_no378))

Remaining: 294464 Removed: 198954


In [31]:
LMindex=estimate_index(df_no378,'LMscore')

                            OLS Regression Results                            
Dep. Variable:                LMscore   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     3.978
Date:                Wed, 26 May 2021   Prob (F-statistic):          2.48e-152
Time:                        16:45:14   Log-Likelihood:            -7.0249e+05
No. Observations:              294464   AIC:                         1.406e+06
Df Residuals:                  294030   BIC:                         1.410e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [32]:
GIindex=estimate_index(df_no378,'GIscore')

                            OLS Regression Results                            
Dep. Variable:                GIscore   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     4.765
Date:                Wed, 26 May 2021   Prob (F-statistic):          4.63e-209
Time:                        16:45:33   Log-Likelihood:            -8.3204e+05
No. Observations:              294464   AIC:                         1.665e+06
Df Residuals:                  294030   BIC:                         1.670e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [33]:
LSDindex=estimate_index(df_no378,'LSDscore')

                            OLS Regression Results                            
Dep. Variable:               LSDscore   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     5.094
Date:                Wed, 26 May 2021   Prob (F-statistic):          1.21e-233
Time:                        16:45:51   Log-Likelihood:            -7.8929e+05
No. Observations:              294464   AIC:                         1.579e+06
Df Residuals:                  294030   BIC:                         1.584e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [34]:
# Merge indexes
sentimentIndex=df_ym.merge(UncertaintyIndex,on='YM',how='outer').\
        merge(LMindex,on='YM',how='outer').\
        merge(GIindex,on='YM',how='outer').\
        merge(LSDindex,on='YM',how='outer').\
        sort_values(['Year','Month'])
print(sentimentIndex.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428 entries, 0 to 427
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   428 non-null    int64  
 1   Month                  428 non-null    int64  
 2   YM                     428 non-null    int64  
 3   FE_x                   428 non-null    object 
 4   UncertaintyScoreIndex  428 non-null    float64
 5   FE_y                   428 non-null    object 
 6   LMscoreIndex           428 non-null    float64
 7   FE_x                   428 non-null    object 
 8   GIscoreIndex           428 non-null    float64
 9   FE_y                   428 non-null    object 
 10  LSDscoreIndex          428 non-null    float64
dtypes: float64(4), int64(3), object(4)
memory usage: 40.1+ KB
None


In [35]:
sentimentIndex=sentimentIndex.drop(['FE_x','FE_y'],axis=1).\
        rename(columns={'UncertaintyScoreIndex':'UncertaintyIndex_ExArea34','LMscoreIndex':'LMindex_ExArea378',
                        'GIscoreIndex':'GIindex_ExArea378','LSDscoreIndex':'LSDindex_ExArea378'})

In [36]:
print(sentimentIndex.head())

   Year  Month  YM  UncertaintyIndex_ExArea34  LMindex_ExArea378  \
0  1985      1   1                   0.605650          -1.982825   
1  1985      2   2                   0.597292          -1.975764   
2  1985      3   3                   0.621604          -1.849796   
3  1985      4   4                   0.647136          -1.828774   
4  1985      5   5                   0.661479          -1.976950   

   GIindex_ExArea378  LSDindex_ExArea378  
0           1.173360            0.045982  
1           1.378060           -0.177901  
2           1.331696           -0.091253  
3           1.355680           -0.047734  
4           1.086896           -0.008463  


In [37]:
# Export
sentimentIndex.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Categorical Index/RegRelevant_MonthlySentimentIndex_ExcludeAreas.csv',index=False)