In [40]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
import re

## 1. Identify articles containing "deregulat*"

In [41]:
# Import sentiment scores for reg relevant articles
df=pd.read_csv('/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_ArticleSentimentScores.csv')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493418 entries, 0 to 493417
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                493418 non-null  int64  
 1   StartDate         493418 non-null  object 
 2   Newspaper         493418 non-null  object 
 3   UncertaintyScore  493418 non-null  float64
 4   GIscore           493418 non-null  float64
 5   LMscore           493418 non-null  float64
 6   LSDscore          493418 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 26.4+ MB
None


In [42]:
df_regSentsExpand=pd.read_pickle('/home/ec2-user/SageMaker/New Uncertainty/Reg Relevance/RegSentsExpand_NounChunks3.pkl')
print(df_regSentsExpand.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 822737 entries, 0 to 822736
Data columns (total 6 columns):
 #   Column                       Non-Null Count   Dtype 
---  ------                       --------------   ----- 
 0   ID                           822737 non-null  object
 1   RegSentsExpand               822737 non-null  object
 2   NounChunksMatch              822737 non-null  int64 
 3   NounChunkMatchWords          822737 non-null  object
 4   NounChunkMatchWordsFiltered  822737 non-null  object
 5   NounChunkMatchFiltered       822737 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 37.7+ MB
None


In [43]:
df_regSentsExpand['ID']=df_regSentsExpand['ID'].astype('int64')

In [44]:
df=df.merge(df_regSentsExpand[['ID','RegSentsExpand']],on='ID',how='left')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 493418 entries, 0 to 493417
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                493418 non-null  int64  
 1   StartDate         493418 non-null  object 
 2   Newspaper         493418 non-null  object 
 3   UncertaintyScore  493418 non-null  float64
 4   GIscore           493418 non-null  float64
 5   LMscore           493418 non-null  float64
 6   LSDscore          493418 non-null  float64
 7   RegSentsExpand    493418 non-null  object 
dtypes: float64(4), int64(1), object(3)
memory usage: 33.9+ MB
None


In [45]:
# Identify reg sections with "deregulat*"
df['Dereg']=0
for i in range(0,len(df)):
    if len(re.findall(r'\bderegulat[a-zA-Z]+\b',df['RegSentsExpand'][i],flags=re.IGNORECASE))>0:
        df['Dereg'][i]=1
print(df['Dereg'].value_counts())        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


0    467351
1     26067
Name: Dereg, dtype: int64


In [47]:
df_dereg=df[df['Dereg']==1].sort_values(['StartDate','Newspaper']).reset_index(drop=True)
print(df_dereg[['StartDate','Newspaper']])

        StartDate            Newspaper
0      1985-01-01         Boston Globe
1      1985-01-01         Boston Globe
2      1985-01-01      Chicago Tribune
3      1985-01-01       New York Times
4      1985-01-02       New York Times
...           ...                  ...
26062  2020-08-29  Wall Street Journal
26063  2020-08-30  The Washington Post
26064  2020-08-30  The Washington Post
26065  2020-08-30  Wall Street Journal
26066  2020-08-31  Wall Street Journal

[26067 rows x 2 columns]


## 2. Clean data

In [23]:
df=df[df['Dereg']!=1].reset_index(drop=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467351 entries, 0 to 467350
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ID                467351 non-null  int64  
 1   StartDate         467351 non-null  object 
 2   Newspaper         467351 non-null  object 
 3   UncertaintyScore  467351 non-null  float64
 4   GIscore           467351 non-null  float64
 5   LMscore           467351 non-null  float64
 6   LSDscore          467351 non-null  float64
 7   RegSentsExpand    467351 non-null  object 
 8   Dereg             467351 non-null  int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 32.1+ MB
None


In [24]:
# Change data format
df['StartDate']=df['StartDate'].astype('datetime64[ns]')
df['Year']=df['StartDate'].dt.year
df['Month']=df['StartDate'].dt.month
df['Newspaper']=df['Newspaper'].astype('category')
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467351 entries, 0 to 467350
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                467351 non-null  int64         
 1   StartDate         467351 non-null  datetime64[ns]
 2   Newspaper         467351 non-null  category      
 3   UncertaintyScore  467351 non-null  float64       
 4   GIscore           467351 non-null  float64       
 5   LMscore           467351 non-null  float64       
 6   LSDscore          467351 non-null  float64       
 7   RegSentsExpand    467351 non-null  object        
 8   Dereg             467351 non-null  int64         
 9   Year              467351 non-null  int64         
 10  Month             467351 non-null  int64         
dtypes: category(1), datetime64[ns](1), float64(4), int64(4), object(1)
memory usage: 36.1+ MB
None


In [25]:
# Unique year-month dataframe
df_ym=df[['Year','Month']].drop_duplicates().sort_values(['Year','Month']).reset_index(drop=True).reset_index()
df_ym['YM']=df_ym['index']+1
df_ym=df_ym.drop('index',axis=1)
print(df_ym)

     Year  Month   YM
0    1985      1    1
1    1985      2    2
2    1985      3    3
3    1985      4    4
4    1985      5    5
..    ...    ...  ...
423  2020      4  424
424  2020      5  425
425  2020      6  426
426  2020      7  427
427  2020      8  428

[428 rows x 3 columns]


In [26]:
df=df.merge(df_ym[['Year','Month','YM']],on=['Year','Month'],how='left').sort_values(['Year','Month']).reset_index(drop=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467351 entries, 0 to 467350
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   ID                467351 non-null  int64         
 1   StartDate         467351 non-null  datetime64[ns]
 2   Newspaper         467351 non-null  category      
 3   UncertaintyScore  467351 non-null  float64       
 4   GIscore           467351 non-null  float64       
 5   LMscore           467351 non-null  float64       
 6   LSDscore          467351 non-null  float64       
 7   RegSentsExpand    467351 non-null  object        
 8   Dereg             467351 non-null  int64         
 9   Year              467351 non-null  int64         
 10  Month             467351 non-null  int64         
 11  YM                467351 non-null  int64         
dtypes: category(1), datetime64[ns](1), float64(4), int64(5), object(1)
memory usage: 39.7+ MB
None


In [27]:
print(df[['ID','UncertaintyScore','GIscore','LMscore','LSDscore']].head())

          ID  UncertaintyScore   GIscore   LMscore  LSDscore
0  292028440          0.000000  0.000000 -4.651163 -2.325581
1  290815623          0.543478  4.347826 -1.630435  4.347826
2  292029822          2.542373  1.694915 -0.847458  0.847458
3  294225539          0.000000  6.140351 -0.877193  0.000000
4  290808213          0.980392 -0.980392 -1.960784 -0.980392


## 3. Estimate sentiment and uncertainty indexes

In [28]:
# Function to estimate index (suppressing constant)
def estimate_index(var_name):
    FE_OLS=sm.ols(formula=var_name + ' ~ 0+C(YM)+C(Newspaper)',
        data=df).fit()
    print(FE_OLS.summary())

    FE_estimates=pd.DataFrame()
    FE_estimates[var_name+'Index']=FE_OLS.params[0:len(df_ym)]
    FE_estimates=FE_estimates.reset_index().rename(columns={'index':'FE'})
    FE_estimates['YM']=FE_estimates['FE'].str.split("[",expand=True)[1].str.split("]",expand=True)[0].astype('int64')
    
    return FE_estimates

In [29]:
# Uncertainty index
UncertaintyIndex=estimate_index('UncertaintyScore')

                            OLS Regression Results                            
Dep. Variable:       UncertaintyScore   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     5.571
Date:                Tue, 18 May 2021   Prob (F-statistic):          2.38e-270
Time:                        19:39:27   Log-Likelihood:            -6.3961e+05
No. Observations:              467351   AIC:                         1.280e+06
Df Residuals:                  466917   BIC:                         1.285e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [30]:
# LM index
LMindex=estimate_index('LMscore')

                            OLS Regression Results                            
Dep. Variable:                LMscore   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     6.471
Date:                Tue, 18 May 2021   Prob (F-statistic):               0.00
Time:                        19:40:10   Log-Likelihood:            -1.1079e+06
No. Observations:              467351   AIC:                         2.217e+06
Df Residuals:                  466917   BIC:                         2.221e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [31]:
# GI index
GIindex=estimate_index('GIscore')

                            OLS Regression Results                            
Dep. Variable:                GIscore   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     6.091
Date:                Tue, 18 May 2021   Prob (F-statistic):          1.21e-310
Time:                        19:40:53   Log-Likelihood:            -1.3129e+06
No. Observations:              467351   AIC:                         2.627e+06
Df Residuals:                  466917   BIC:                         2.631e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [32]:
# LSD index
LSDindex=estimate_index('LSDscore')

                            OLS Regression Results                            
Dep. Variable:               LSDscore   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     6.773
Date:                Tue, 18 May 2021   Prob (F-statistic):               0.00
Time:                        19:41:38   Log-Likelihood:            -1.2414e+06
No. Observations:              467351   AIC:                         2.484e+06
Df Residuals:                  466917   BIC:                         2.488e+06
Df Model:                         433                                         
Covariance Type:            nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(

In [33]:
# Merge indexes
sentimentIndex=df_ym.merge(UncertaintyIndex,on='YM',how='outer').\
        merge(LMindex,on='YM',how='outer').\
        merge(GIindex,on='YM',how='outer').\
        merge(LSDindex,on='YM',how='outer').\
        sort_values(['Year','Month'])
print(sentimentIndex.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428 entries, 0 to 427
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   428 non-null    int64  
 1   Month                  428 non-null    int64  
 2   YM                     428 non-null    int64  
 3   FE_x                   428 non-null    object 
 4   UncertaintyScoreIndex  428 non-null    float64
 5   FE_y                   428 non-null    object 
 6   LMscoreIndex           428 non-null    float64
 7   FE_x                   428 non-null    object 
 8   GIscoreIndex           428 non-null    float64
 9   FE_y                   428 non-null    object 
 10  LSDscoreIndex          428 non-null    float64
dtypes: float64(4), int64(3), object(4)
memory usage: 40.1+ KB
None


In [34]:
sentimentIndex=sentimentIndex.drop(['FE_x','FE_y'],axis=1).\
        rename(columns={'UncertaintyScoreIndex':'UncertaintyIndex','LMscoreIndex':'LMindex',
                        'GIscoreIndex':'GIindex','LSDscoreIndex':'LSDindex'})

In [35]:
print(sentimentIndex)

     Year  Month   YM  UncertaintyIndex   LMindex   GIindex  LSDindex
0    1985      1    1          0.593382 -1.873716  1.416218  0.229804
1    1985      2    2          0.565654 -2.026707  1.382028  0.014728
2    1985      3    3          0.615707 -2.035717  1.157062 -0.137810
3    1985      4    4          0.651001 -1.936667  1.449065  0.036833
4    1985      5    5          0.652354 -1.796473  1.494990  0.248346
..    ...    ...  ...               ...       ...       ...       ...
423  2020      4  424          0.781880 -1.813227  1.870549  0.422744
424  2020      5  425          0.684124 -2.008187  1.756084  0.272658
425  2020      6  426          0.663700 -2.049850  1.480211  0.454404
426  2020      7  427          0.660098 -1.953976  1.486845  0.316831
427  2020      8  428          0.659842 -1.774000  1.508842  0.291868

[428 rows x 7 columns]


In [36]:
print(sentimentIndex.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 428 entries, 0 to 427
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Year              428 non-null    int64  
 1   Month             428 non-null    int64  
 2   YM                428 non-null    int64  
 3   UncertaintyIndex  428 non-null    float64
 4   LMindex           428 non-null    float64
 5   GIindex           428 non-null    float64
 6   LSDindex          428 non-null    float64
dtypes: float64(4), int64(3)
memory usage: 26.8 KB
None


In [38]:
# Export
sentimentIndex.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Sentiment Analysis/RegRelevant_MonthlySentimentIndex_Robust1.csv',index=False)