In [39]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
import re

## 1. Import reg-relevant article sentiment scores

In [40]:
# Import sentiment scores for reg relevant articles
df=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_ArticleSentimentScores.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                493418 non-null  int64  
 1   StartDate         493418 non-null  object 
 2   Newspaper         493418 non-null  object 
 3   UncertaintyScore  493418 non-null  float64
 4   GIscore           493418 non-null  float64
 5   LMscore           493418 non-null  float64
 6   LSDscore          493418 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 26.4+ MB
None


In [41]:
# Change data format
df['StartDate']=df['StartDate'].astype('datetime64[ns]')
df['Year']=df['StartDate'].dt.year
df['Month']=df['StartDate'].dt.month
df['Newspaper']=df['Newspaper'].astype('category')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                493418 non-null  int64         
 1   StartDate         493418 non-null  datetime64[ns]
 2   Newspaper         493418 non-null  category      
 3   UncertaintyScore  493418 non-null  float64       
 4   GIscore           493418 non-null  float64       
 5   LMscore           493418 non-null  float64       
 6   LSDscore          493418 non-null  float64       
 7   Year              493418 non-null  int64         
 8   Month             493418 non-null  int64         
dtypes: category(1), datetime64[ns](1), float64(4), int64(3)
memory usage: 30.6 MB
None


In [42]:
# Unique year-month dataframe
df_ym=df[['Year','Month']].drop_duplicates().sort_values(['Year','Month']).reset_index(drop=True).reset_index()
df_ym['YM']=df_ym['index']+1
df_ym=df_ym.drop('index',axis=1)
print(df_ym)

     Year  Month   YM
0    1985      1    1
1    1985      2    2
2    1985      3    3
3    1985      4    4
4    1985      5    5
..    ...    ...  ...
423  2020      4  424
424  2020      5  425
425  2020      6  426
426  2020      7  427
427  2020      8  428

[428 rows x 3 columns]


In [43]:
df=df.merge(df_ym[['Year','Month','YM']],on=['Year','Month'],how='left').sort_values(['Year','Month']).reset_index(drop=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                493418 non-null  int64         
 1   StartDate         493418 non-null  datetime64[ns]
 2   Newspaper         493418 non-null  category      
 3   UncertaintyScore  493418 non-null  float64       
 4   GIscore           493418 non-null  float64       
 5   LMscore           493418 non-null  float64       
 6   LSDscore          493418 non-null  float64       
 7   Year              493418 non-null  int64         
 8   Month             493418 non-null  int64         
 9   YM                493418 non-null  int64         
dtypes: category(1), datetime64[ns](1), float64(4), int64(4)
memory usage: 34.4 MB
None


In [44]:
print(df[['ID','UncertaintyScore','GIscore','LMscore','LSDscore']].head())

          ID  UncertaintyScore   GIscore   LMscore  LSDscore
0  292028440          0.000000  0.000000 -4.651163 -2.325581
1  290814345          0.881057 -5.726872 -3.083700 -2.202643
2  290815623          0.543478  4.347826 -1.630435  4.347826
3  292029822          2.542373  1.694915 -0.847458  0.847458
4  294225539          0.000000  6.140351 -0.877193  0.000000


## 2. Estimate sentiment and uncertainty indexes

In [7]:
# Function to estimate index (suppressing constant)
def estimate_index(var_name):
    FE_OLS=sm.ols(formula=var_name + ' ~ 0+C(YM)+C(Newspaper)',
        data=df).fit()
    print(FE_OLS.summary())

    FE_estimates=pd.DataFrame()
    FE_estimates[var_name+'Index']=FE_OLS.params[0:len(df_ym)]
    FE_estimates=FE_estimates.reset_index().rename(columns={'index':'FE'})
    FE_estimates['YM']=FE_estimates['FE'].str.split("[",expand=True)[1].str.split("]",expand=True)[0].astype('int64')
    
    return FE_estimates

In [10]:
# Uncertainty index
UncertaintyIndex=estimate_index('UncertaintyScore')

                            OLS Regression Results                            
Dep. Variable:       UncertaintyScore   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     5.747
Date:                Fri, 12 Mar 2021   Prob (F-statistic):          5.87e-284
Time:                        20:49:15   Log-Likelihood:            -6.7151e+05
No. Observations:              493418   AIC:                         1.344e+06
Df Residuals:                  492984   BIC:                         1.349e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [11]:
# LM index
LMindex=estimate_index('LMscore')

                            OLS Regression Results                            
Dep. Variable:                LMscore   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     6.791
Date:                Fri, 12 Mar 2021   Prob (F-statistic):               0.00
Time:                        20:49:46   Log-Likelihood:            -1.1660e+06
No. Observations:              493418   AIC:                         2.333e+06
Df Residuals:                  492984   BIC:                         2.338e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [12]:
# GI index
GIindex=estimate_index('GIscore')

                            OLS Regression Results                            
Dep. Variable:                GIscore   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     6.233
Date:                Fri, 12 Mar 2021   Prob (F-statistic):          6.87e-322
Time:                        20:50:18   Log-Likelihood:            -1.3824e+06
No. Observations:              493418   AIC:                         2.766e+06
Df Residuals:                  492984   BIC:                         2.771e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [13]:
# LSD index
LSDindex=estimate_index('LSDscore')

                            OLS Regression Results                            
Dep. Variable:               LSDscore   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     7.100
Date:                Fri, 12 Mar 2021   Prob (F-statistic):               0.00
Time:                        20:50:50   Log-Likelihood:            -1.3063e+06
No. Observations:              493418   AIC:                         2.614e+06
Df Residuals:                  492984   BIC:                         2.618e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [14]:
# Merge indexes
sentimentIndex=df_ym.merge(UncertaintyIndex,on='YM',how='outer').\
        merge(LMindex,on='YM',how='outer').\
        merge(GIindex,on='YM',how='outer').\
        merge(LSDindex,on='YM',how='outer').\
        sort_values(['Year','Month'])
print(sentimentIndex.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428 entries, 0 to 427
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   428 non-null    int64  
 1   Month                  428 non-null    int64  
 2   YM                     428 non-null    int64  
 3   FE_x                   428 non-null    object 
 4   UncertaintyScoreIndex  428 non-null    float64
 5   FE_y                   428 non-null    object 
 6   LMscoreIndex           428 non-null    float64
 7   FE_x                   428 non-null    object 
 8   GIscoreIndex           428 non-null    float64
 9   FE_y                   428 non-null    object 
 10  LSDscoreIndex          428 non-null    float64
dtypes: float64(4), int64(3), object(4)
memory usage: 40.1+ KB
None


In [15]:
sentimentIndex=sentimentIndex.drop(['FE_x','FE_y'],axis=1).\
        rename(columns={'UncertaintyScoreIndex':'UncertaintyIndex','LMscoreIndex':'LMindex',
                        'GIscoreIndex':'GIindex','LSDscoreIndex':'LSDindex'})

In [16]:
print(sentimentIndex)

     Year  Month   YM  UncertaintyIndex   LMindex   GIindex  LSDindex
0    1985      1    1          0.604631 -1.785589  1.351983  0.261399
1    1985      2    2          0.591508 -1.902501  1.473553  0.059051
2    1985      3    3          0.626842 -1.963374  1.210650 -0.045643
3    1985      4    4          0.649306 -1.849722  1.411521  0.080193
4    1985      5    5          0.652741 -1.778145  1.455811  0.277250
..    ...    ...  ...               ...       ...       ...       ...
423  2020      4  424          0.787698 -1.819517  1.837781  0.400182
424  2020      5  425          0.679267 -2.021910  1.733135  0.252922
425  2020      6  426          0.661661 -2.050625  1.472307  0.442631
426  2020      7  427          0.659169 -1.944160  1.471159  0.305342
427  2020      8  428          0.655902 -1.790790  1.488318  0.294350

[428 rows x 7 columns]


In [17]:
print(sentimentIndex.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428 entries, 0 to 427
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              428 non-null    int64  
 1   Month             428 non-null    int64  
 2   YM                428 non-null    int64  
 3   UncertaintyIndex  428 non-null    float64
 4   LMindex           428 non-null    float64
 5   GIindex           428 non-null    float64
 6   LSDindex          428 non-null    float64
dtypes: float64(4), int64(3)
memory usage: 26.8 KB
None


In [18]:
# Export
sentimentIndex.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_MonthlySentimentIndex.csv',index=False)

## 3. Estimate indexes for individual newspapers

In [45]:
# All newspapers
newspapers=list(df['Newspaper'].unique())
print(len(newspapers),newspapers)

7 ['Los Angeles Times', 'Chicago Tribune', 'Boston Globe', 'New York Times', 'Wall Street Journal', 'The Washington Post', 'USA Today']


In [46]:
# Function to estimate index for each newspaper
def estimate_index_newspaper(var_name, newspaper):
    df_newspaper=df[df['Newspaper']==newspaper].reset_index(drop=True)
    FE_OLS=sm.ols(formula=var_name + ' ~ 0+C(YM)',
        data=df_newspaper).fit()
    #print(FE_OLS.summary())

    FE_estimates=pd.DataFrame()
    new_var=re.split('score',var_name, flags=re.IGNORECASE)[0]+'Index_'+newspaper.replace(' ','')
    FE_estimates[new_var]=FE_OLS.params[0:len(df_ym)]
    FE_estimates=FE_estimates.reset_index().rename(columns={'index':'FE'})
    FE_estimates['YM']=FE_estimates['FE'].str.split("[",expand=True)[1].str.split("]",expand=True)[0].astype('int64')
    
    return FE_estimates[[new_var, 'YM']]

In [None]:
# Uncertainty indexes for individual newspapers
IndexByNewspaper=df_ym
for newspaper in newspapers:
    uncertainty=estimate_index_newspaper('UncertaintyScore',newspaper)
    IndexByNewspaper=IndexByNewspaper.merge(uncertainty,on='YM',how='outer')
    LM=estimate_index_newspaper('LMscore',newspaper)
    IndexByNewspaper=IndexByNewspaper.merge(LM,on='YM',how='outer')
    GI=estimate_index_newspaper('GIscore',newspaper)
    IndexByNewspaper=IndexByNewspaper.merge(GI,on='YM',how='outer')   
    LSD=estimate_index_newspaper('LSDscore',newspaper)
    IndexByNewspaper=IndexByNewspaper.merge(LSD,on='YM',how='outer')    
print(IndexByNewspaper.info())

In [38]:
IndexByNewspaper.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_MonthlySentimentIndex_ByNewspaper.csv',index=False)