In [1]:
import numpy as np
import pandas as pd
import datetime
import statsmodels.formula.api as sm

## 1. Import reg-relevant article sentiment scores

In [2]:
# Load the article-level scores; the path sits in one variable so it is easy to repoint
scores_path='/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegArea_ArticleSentimentScores.csv'
df=pd.read_csv(scores_path)
# DataFrame.info() writes its own report and returns None, so wrapping it in
# print() only appended a stray "None" line to the output
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 36 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   ID                           608172 non-null  int64  
 1   Title                        608172 non-null  object 
 2   Type                         608172 non-null  object 
 3   StartDate                    608172 non-null  object 
 4   EndDate                      608172 non-null  object 
 5   TextWordCount                608172 non-null  int64  
 6   PubTitle                     608172 non-null  object 
 7   SourceType                   608172 non-null  object 
 8   Year                         608172 non-null  float64
 9   Month                        608172 non-null  float64
 10  Newspaper                    608172 non-null  object 
 11  RegSentsExpand               608172 non-null  object 
 12  RegSentExpandLength          608172 non-null  int64  
 13 

In [3]:
# Reformat data: parse the start date once, then derive the calendar fields from it
start_dates=df['StartDate'].astype('datetime64[ns]')
df['StartDate']=start_dates
df['Year']=start_dates.dt.year
df['Month']=start_dates.dt.month
# Newspaper is converted to a categorical column
df['Newspaper']=df['Newspaper'].astype('category')

In [4]:
# Specify index start date and end date
start_date=datetime.datetime(1985,1,1)
end_date=datetime.datetime(2021,12,31)
end_month=end_date.strftime('%b%Y')

# Keep rows whose StartDate lies in [start_date, end_date] (both ends included),
# ordered by date with a fresh 0..n-1 index
on_or_after_start=df['StartDate']>=start_date
on_or_before_end=df['StartDate']<=end_date
df=df[on_or_after_start & on_or_before_end]
df=df.sort_values('StartDate').reset_index(drop=True)
print(df[['StartDate','Year','Month']])

        StartDate  Year  Month
0      1985-01-01  1985      1
1      1985-01-01  1985      1
2      1985-01-01  1985      1
3      1985-01-01  1985      1
4      1985-01-01  1985      1
...           ...   ...    ...
608167 2021-12-31  2021     12
608168 2021-12-31  2021     12
608169 2021-12-31  2021     12
608170 2021-12-31  2021     12
608171 2021-12-31  2021     12

[608172 rows x 3 columns]


In [5]:
# Create year-month dataframe: one row per distinct (Year, Month), in calendar order
df_ym=(
    df[['Year','Month']]
    .drop_duplicates()
    .sort_values(['Year','Month'])
    .reset_index(drop=True)
)
# YM is the 1-based position of the month in that ordering, stored as text
df_ym['YM']=(df_ym.index.to_series()+1).astype('str')
print(df_ym)

     Year  Month   YM
0    1985      1    1
1    1985      2    2
2    1985      3    3
3    1985      4    4
4    1985      5    5
..    ...    ...  ...
439  2021      8  440
440  2021      9  441
441  2021     10  442
442  2021     11  443
443  2021     12  444

[444 rows x 3 columns]


In [6]:
# Merge year-month dataframe: attach each row's YM label, then order by calendar month
df=(
    df.merge(df_ym[['Year','Month','YM']],on=['Year','Month'],how='left')
    .sort_values(['Year','Month'])
    .reset_index(drop=True)
)
# DataFrame.info() prints its own report and returns None, so print() around it
# only added a stray "None" line
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608172 entries, 0 to 608171
Data columns (total 37 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   ID                           608172 non-null  int64         
 1   Title                        608172 non-null  object        
 2   Type                         608172 non-null  object        
 3   StartDate                    608172 non-null  datetime64[ns]
 4   EndDate                      608172 non-null  object        
 5   TextWordCount                608172 non-null  int64         
 6   PubTitle                     608172 non-null  object        
 7   SourceType                   608172 non-null  object        
 8   Year                         608172 non-null  int64         
 9   Month                        608172 non-null  int64         
 10  Newspaper                    608172 non-null  category      
 11  RegSentsExpand            

## 2. Estimate categorical indexes

In [7]:
# Rename so the column follows the '<prefix>score' pattern passed to the estimation
# functions, which derive the output column prefix via score.split('score')[0]
df=df.rename(columns={'UncertaintyScore':'Uncertaintyscore'})

In [8]:
# Year-month labels as a plain Python list; estimate_categorical_index uses it to
# recognise which coefficient labels are year-month labels
YM_list=df_ym['YM'].tolist()

In [9]:
# Define a function (suppressing constant) to estimate categorical index
def estimate_categorical_index(score, area):
    """Estimate year-month fixed effects of `score` for one area (no constant).

    Fits an OLS of `score` on year-month and newspaper dummies with the
    constant suppressed (`0+`), using only the rows of the global `df` where
    column `area` equals 1, and returns the year-month coefficients.

    Relies on the notebook-level names `df`, `df_ym`, `YM_list` and `sm`.

    Parameters
    ----------
    score : str
        Name of the dependent-variable column in `df`. The text before
        'score' becomes the prefix of the output column name.
    area : str
        Name of the column in `df` used to select rows (value == 1).

    Returns
    -------
    pandas.DataFrame
        Two columns: '<prefix>_<area>' holding the coefficient and 'YM'
        holding the year-month label, one row per coefficient whose label is
        in `YM_list`.
    """
    df_area=df[df[area]==1].reset_index(drop=True)
    FE_OLS=sm.ols(formula=score + ' ~ 0+C(YM)+C(Newspaper)', data=df_area).fit()

    new_var=score.split('score')[0]+'_'+area
    # Only the leading len(df_ym) parameters are considered, as before
    FE_estimates=(
        FE_OLS.params.iloc[0:len(df_ym)]
        .to_frame(new_var)
        .reset_index()
        .rename(columns={'index':'FE'})
    )
    # The label is whatever sits between '[' and ']' in the parameter name
    FE_estimates['YM']=FE_estimates['FE'].str.split("[",expand=True)[1].str.split("]",expand=True)[0]

    # Keep only rows whose label is a known year-month label. One vectorised
    # filter replaces the earlier loop that re-filtered the whole frame for every
    # non-matching label; it also drops rows with a missing label, which the
    # loop could not remove.
    FE_estimates=FE_estimates[FE_estimates['YM'].isin(YM_list)]
    FE_estimates=FE_estimates.drop('FE',axis=1)

    return FE_estimates

In [10]:
# List of columns for all areas
area_range=15
# range() excludes its upper bound, so this names areas 1 through area_range-1
area_list=['DominantDistinctArea'+str(number) for number in range(1,area_range)]

In [11]:
# Define another function (with constant) to estimate categorical index
def estimate_categorical_index_constant(score, area):
    """Estimate year-month effects of `score` for one area (with constant).

    Fits an OLS of `score` on year-month and newspaper dummies including an
    intercept, using only the rows of the global `df` where column `area`
    equals 1. Each year-month value is reported as intercept + coefficient;
    the intercept itself is reported for the omitted (reference) year-month.

    Relies on the notebook-level names `df`, `df_ym` and `sm`.

    Parameters
    ----------
    score : str
        Name of the dependent-variable column in `df`. The text before
        'score' becomes the prefix of the output column name.
    area : str
        Name of the column in `df` used to select rows (value == 1).

    Returns
    -------
    pandas.DataFrame
        Two columns: '<prefix>_<area>' holding the value and 'YM' holding the
        year-month label.
    """
    df_area=df[df[area]==1].reset_index(drop=True)
    FE_OLS=sm.ols(formula=score + ' ~ C(YM)+C(Newspaper)', data=df_area).fit()

    new_var=score.split('score')[0]+'_'+area
    FE_estimates=(
        FE_OLS.params.iloc[0:len(df_ym)]
        .to_frame('coef')
        .reset_index()
        .rename(columns={'index':'FE'})
    )

    # Keep the intercept and the year-month terms only. The earlier loop filtered
    # on a 'YM' column that did not exist yet (KeyError as soon as any other term
    # was in the slice), and its substring test on 'YM' could also match a
    # newspaper name.
    keep=(FE_estimates['FE']=='Intercept') | FE_estimates['FE'].str.startswith('C(YM)[')
    FE_estimates=FE_estimates[keep].reset_index(drop=True)
    is_intercept=FE_estimates['FE']=='Intercept'

    # Year-month value = intercept + its coefficient; the intercept row stays as is
    intercept=FE_OLS.params['Intercept']
    FE_estimates[new_var]=FE_estimates['coef']+intercept
    FE_estimates.loc[is_intercept,new_var]=intercept

    # Labels come from names of the form 'C(YM)[T.<label>]'; the pattern is
    # anchored and escaped so '.' is a literal dot (missing for the intercept row)
    FE_estimates['YM']=FE_estimates['FE'].str.extract(r'^C\(YM\)\[T\.(.*)\]$',expand=False)

    # The intercept belongs to the year-month that has no coefficient of its own.
    # Identify it from the data instead of assuming it is always '1'; fall back
    # to '1' (the previous behaviour) if it cannot be pinned down uniquely.
    observed=set(df_area['YM'].astype('str'))
    without_coef=observed-set(FE_estimates['YM'].dropna())
    reference=without_coef.pop() if len(without_coef)==1 else '1'
    FE_estimates.loc[is_intercept,'YM']=reference

    return FE_estimates[[new_var,'YM']]

In [12]:
# Categorical Uncertainty Index
# Start from the year-month frame and add one column per area.
CategoricalUncertaintyIndex=df_ym
for area in area_list:
    try:
        estimates=estimate_categorical_index('Uncertaintyscore', area)
    except Exception as err:
        # Fall back to the with-constant specification and report why the first
        # one failed. `except Exception` (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop the loop as expected.
        print("Failed:",area,repr(err))
        estimates=estimate_categorical_index_constant('Uncertaintyscore', area)
    CategoricalUncertaintyIndex=CategoricalUncertaintyIndex.merge(estimates,on='YM',how='left')

In [13]:
# DataFrame.info() prints its own report and returns None; print() around it only added "None"
CategoricalUncertaintyIndex.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 444 entries, 0 to 443
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Year                                444 non-null    int64  
 1   Month                               444 non-null    int64  
 2   YM                                  444 non-null    object 
 3   Uncertainty_DominantDistinctArea1   444 non-null    float64
 4   Uncertainty_DominantDistinctArea2   444 non-null    float64
 5   Uncertainty_DominantDistinctArea3   444 non-null    float64
 6   Uncertainty_DominantDistinctArea4   444 non-null    float64
 7   Uncertainty_DominantDistinctArea5   444 non-null    float64
 8   Uncertainty_DominantDistinctArea6   444 non-null    float64
 9   Uncertainty_DominantDistinctArea7   444 non-null    float64
 10  Uncertainty_DominantDistinctArea8   444 non-null    float64
 11  Uncertainty_DominantDistinctArea9   444 non-n

In [14]:
# Preview the first rows of the index for the first two areas
preview_columns=['Year','Month','Uncertainty_DominantDistinctArea1','Uncertainty_DominantDistinctArea2']
print(CategoricalUncertaintyIndex[preview_columns].head())

   Year  Month  Uncertainty_DominantDistinctArea1  \
0  1985      1                           0.734827   
1  1985      2                           0.669878   
2  1985      3                           0.680366   
3  1985      4                           0.637691   
4  1985      5                           0.651844   

   Uncertainty_DominantDistinctArea2  
0                           0.670121  
1                           0.729419  
2                           0.747894  
3                           0.575533  
4                           0.601764  


In [15]:
# Save the monthly uncertainty index; the file name ends with the sample's end month
uncertainty_csv='/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegArea_MonthlyUncertaintyIndex_'+str(end_month)+'.csv'
CategoricalUncertaintyIndex.to_csv(uncertainty_csv,index=False)

In [16]:
# Categorical sentiment indexes
# One output file per score prefix ('GI', 'LM', 'LSD'). The loop variable is no
# longer called `dict`, so the built-in dict type stays usable in later cells.
for dict_name in ['GI','LM','LSD']:
    CategoricalSentimentIndex=df_ym
    for area in area_list:
        try:
            estimates=estimate_categorical_index(dict_name+'score', area)
        except Exception as err:
            # Fall back to the with-constant specification and report why the
            # first one failed; KeyboardInterrupt/SystemExit are not trapped.
            print("Failed:",dict_name+":"+area,repr(err))
            estimates=estimate_categorical_index_constant(dict_name+'score', area)
        CategoricalSentimentIndex=CategoricalSentimentIndex.merge(estimates,on='YM',how='left')
    CategoricalSentimentIndex.to_csv('/home/ec2-user/SageMaker/New Uncertainty/Jan1985-Dec2021/RegArea_Monthly'+dict_name+'Index_'+str(end_month)+'.csv',index=False)