In [1]:
import pandas as pd
import numpy as np
import time
from src.GWY_news_utils import aggregate_content,calculate_attitude_score,load_attitude_dictionary,regression_x_y

In [2]:
import statsmodels.api as sm

In [3]:
from statsmodels.iolib.summary2 import summary_col

### Data Import

define the directory

In [5]:
dictionary_directory = "data/dictionary/"
input_directory = "data/"
output_directory = "data/output/"
result_directory = "result/"

Import the 'important policy/news' file

In [6]:
premier_df = pd.read_csv(input_directory+"content.csv")
premier_df = premier_df.iloc[:,1:]
premier_df.index = pd.to_datetime(premier_df.date)
content = premier_df.content

Import the attitude lexicon

In [7]:
attitude_word_dict = load_attitude_dictionary(dictionary_directory)

reform_list = attitude_word_dict["reform"]
economy_list = attitude_word_dict["economy"]
regulation_list = attitude_word_dict["regulation"]
intervention_list = attitude_word_dict["intervention"]
market_list = attitude_word_dict["market"]

reform_list = reform_list + market_list

FileNotFoundError: [Errno 2] No such file or directory: 'data/dictionary/reform.txt'

In [None]:
reform_list

Import stock index return dataset

In [18]:
index_df = pd.read_csv("data/index_ret.csv")
index_df.index = pd.to_datetime(index_df.Trddt)
index_df = index_df.iloc[:,1:]

Import economic uncertainty index data

In [19]:
epu_df = pd.read_excel("data/cnepu_daily_2_september_2020_updated.xlsx")
epu_df.index = pd.to_datetime(epu_df.iloc[:,0],format="%Y-%m-%d")
epu_df = epu_df.iloc[:,1:]

### Variable Creation

Aggregate the policy/news by time (daily, weekly and monthly)

In [8]:
daily_series = aggregate_content("B",content)
weekly_series = aggregate_content("W",content)
monthly_series = aggregate_content("BM",content)

Calculate return for different frequency

In [9]:
index_m = index_df.asfreq("BM",method="ffill")
index_m = index_m.pct_change()

index_w = index_df.asfreq("W",method="ffill")
index_w = index_w.pct_change()

index_d = index_df.asfreq("B",method="ffill")
index_d = index_d.pct_change()

NameError: name 'index_df' is not defined

Calculate attitude score for different frequency

In [22]:
economy_score_list_m = calculate_attitude_score(monthly_series,economy_list)
reform_score_list_m = calculate_attitude_score(monthly_series,reform_list)
regulation_score_list_m = calculate_attitude_score(monthly_series,regulation_list)
intervention_score_list_m = calculate_attitude_score(monthly_series,intervention_list)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/1y/w6y_szxn0m12nsrrdnzgp4540000gn/T/jieba.cache
Loading model cost 1.665 seconds.
Prefix dict has been built successfully.


In [23]:
economy_score_list_w = calculate_attitude_score(weekly_series,economy_list)
reform_score_list_w = calculate_attitude_score(weekly_series,reform_list)
regulation_score_list_w = calculate_attitude_score(weekly_series,regulation_list)
intervention_score_list_w = calculate_attitude_score(weekly_series,intervention_list)

In [24]:
economy_score_list_d = calculate_attitude_score(daily_series,economy_list)
reform_score_list_d = calculate_attitude_score(daily_series,reform_list)
regulation_score_list_d = calculate_attitude_score(daily_series,regulation_list)
intervention_score_list_d = calculate_attitude_score(daily_series,intervention_list)

Calculate number of policy issued at different frequency

In [10]:
number_of_text_d = content.resample("B").apply(len)
number_of_text_w = content.resample("W").apply(len)
number_of_text_m = content.resample("BM").apply(len)

Calculate the length of the aggreagte text for different frequency

In [11]:
text_length_m = monthly_series.apply(len)
text_length_w = weekly_series.apply(len)
text_length_d = daily_series.apply(len)

Import China Economic Policy Uncertainty Index

In [12]:
epu_df_m = epu_df.asfreq("BM","ffill")
epu_df_w = epu_df.asfreq("W","ffill")
epu_df_d = epu_df.asfreq("B","ffill")

NameError: name 'epu_df' is not defined

In [28]:
epu_df_m_r = epu_df_m.pct_change()
epu_df_w_r = epu_df_w.pct_change()
epu_df_d_r = epu_df_d.pct_change()

In [29]:
epu_df_m_r.CNEPU_Daily[epu_df_m_r.CNEPU_Daily==np.inf] = epu_df_m_r.median()[0]
epu_df_w_r.CNEPU_Daily[epu_df_w_r.CNEPU_Daily==np.inf] = epu_df_w_r.median()[0]
epu_df_d_r.CNEPU_Daily[epu_df_d_r.CNEPU_Daily==np.inf] = epu_df_d_r.median()[0]

Aggregate different scores and predictors into dataframe

In [30]:
score_df_m = pd.concat([economy_score_list_m,
                        reform_score_list_m,
                        regulation_score_list_m,
                        intervention_score_list_m,
                        text_length_m,
                        number_of_text_m,
                        epu_df_m,
                        epu_df_m_r],axis=1)

score_df_m.columns = ["economy","reform","regulation","intervention","total_length","number_of_text","EPU","EPU_change"]



score_df_w = pd.concat([economy_score_list_w,
                        reform_score_list_w,
                        regulation_score_list_w,
                        intervention_score_list_w,
                        text_length_w,
                        number_of_text_w,
                        epu_df_w,
                        epu_df_w_r],axis=1)

score_df_w.columns = ["economy","reform","regulation","intervention","total_length","number_of_text","EPU","EPU_change"]







score_df_d = pd.concat([economy_score_list_d,
                        reform_score_list_d,
                        regulation_score_list_d,
                        intervention_score_list_d,
                        text_length_d,
                        number_of_text_d,
                        epu_df_d,
                        epu_df_d_r],axis=1)

score_df_d.columns = ["economy","reform","regulation","intervention","total_length","number_of_text","EPU","EPU_change"]











### Export the attitude score dataframe

In [58]:
score_df_m.to_csv(output_directory+"monthly_score_df_extra.csv")
score_df_w.to_csv(output_directory+"weekly_score_df_extra.csv")
score_df_d.to_csv(output_directory+"daily_score_df_extra.csv")

### Merge the X and Y variables

In [31]:
merge_df_m = index_m.merge(score_df_m,how="inner",right_index=True,left_index=True)
merge_df_w = index_w.merge(score_df_w,how="inner",right_index=True,left_index=True)
merge_df_d = index_d.merge(score_df_d,how="inner",right_index=True,left_index=True)

### Set up the regression dataframe

### Simple OLS

In [204]:
number = [7]

In [205]:
y = merge_df_d["I000002"]
x = merge_df_d.iloc[:,number]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=0,x_include_lag=0)
model = sm.OLS(Y,X)
result_1 = model.fit()

In [206]:
y = merge_df_d["I000020"]
x = merge_df_d.iloc[:,number]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=0,x_include_lag=0)
model = sm.OLS(Y,X)
result_2 = model.fit()

In [207]:
y = merge_df_d["I399005"]
x = merge_df_d.iloc[:,number]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=0,x_include_lag=0)
model = sm.OLS(Y,X)
result_3 = model.fit()

In [208]:
y = merge_df_d["I399106"]
x = merge_df_d.iloc[:,number]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=0,x_include_lag=0)
model = sm.OLS(Y,X)
result_4 = model.fit()

In [209]:
result_table = summary_col([result_1,result_2,result_3,result_4],
                             stars=True,
                             float_format="%0.3f",
                             info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                        'R2':lambda x: "{:.2f}".format(x.rsquared)})

In [210]:
result_table

0,1,2,3,4
,I000002,I000020,I399005,I399106
const,0.000,-0.000,0.000,-0.000
,(0.000),(0.001),(0.001),(0.000)
intervention,-0.000,-0.000,-0.001,-0.000
,(0.001),(0.001),(0.001),(0.001)
R-squared,-0.001,-0.001,-0.001,-0.001
,0.000,0.000,0.000,0.000
N,966,966,966,966
R2,0.00,0.00,0.00,0.00


In [60]:
merge_df_w.columns

Index(['I000002', 'I000020', 'I399005', 'I399106', 'economy', 'reform',
       'regulation', 'intervention', 'total_length', 'number_of_text', 'EPU',
       'EPU_change'],
      dtype='object')

In [39]:
y = merge_df_m["I000002"]
x = merge_df_m.iloc[:,4:8]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=1,x_include_lag=0)

model = sm.OLS(Y,X)
result_1 = model.fit()

In [40]:
y = merge_df_m["I000020"]
x = merge_df_m.iloc[:,4:8]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=1,x_include_lag=0)

model = sm.OLS(Y,X)
result_2 = model.fit()

In [41]:
y = merge_df_m["I399005"]
x = merge_df_m.iloc[:,4:8]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=1,x_include_lag=0)

model = sm.OLS(Y,X)
result_3 = model.fit()

In [42]:
y = merge_df_m["I399106"]
x = merge_df_m.iloc[:,4:8]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=1,x_include_lag=0)

model = sm.OLS(Y,X)
result_4 = model.fit()

In [43]:
result_table = summary_col([result_1,result_2,result_3,result_4],
                             stars=True,
                             float_format="%0.3f",
                             info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                        'R2':lambda x: "{:.2f}".format(x.rsquared)})

In [44]:
result_table

0,1,2,3,4
,I000002_y,I000020_y,I399005_y,I399106_y
I000002_x,0.069,,,
,(0.148),,,
I000020_x,,0.096,,
,,(0.159),,
I399005_x,,,0.213,
,,,(0.157),
I399106_x,,,,0.146
,,,,(0.158)
R-squared,0.145,-0.003,0.029,0.005


In [38]:
print(result_table.as_latex())

\begin{table}
\caption{}
\begin{center}
\begin{tabular}{lcccc}
\hline
             & I000002\_y & I000020\_y & I399005\_y & I399106\_y  \\
\midrule
I000002\_x   & -0.005     &            &            &             \\
             & (0.032)    &            &            &             \\
I000020\_x   &            & 0.031      &            &             \\
             &            & (0.032)    &            &             \\
I399005\_x   &            &            & -0.002     &             \\
             &            &            & (0.032)    &             \\
I399106\_x   &            &            &            & -0.001      \\
             &            &            &            & (0.032)     \\
R-squared    & 0.000      & 0.007      & 0.001      & 0.003       \\
             & 0.005      & 0.012      & 0.006      & 0.008       \\
const        & -0.000     & -0.001     & -0.000     & -0.000      \\
             & (0.000)    & (0.001)    & (0.001)    & (0.001)     \\
economy      & -0.001   

## Add Control Variable

In [None]:
merge_df_w.columns

In [233]:
index_ticker = "I399106"
merge_df = merge_df_d

In [234]:
merge_df.columns

Index(['I000002', 'I000020', 'I399005', 'I399106', 'economy', 'reform',
       'regulation', 'intervention', 'total_length', 'number_of_text', 'EPU',
       'EPU_change'],
      dtype='object')

In [235]:
y = merge_df[index_ticker]
x = merge_df.iloc[:,4:8]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=1,x_include_lag=0)

model = sm.OLS(Y,X)
result_1 = model.fit()

In [236]:
y = merge_df[index_ticker]
x = merge_df.iloc[:,4:10]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=1,x_include_lag=0)

model = sm.OLS(Y,X)
result_2 = model.fit()

In [237]:
y = merge_df[index_ticker]
x = merge_df.iloc[:,4:]

X,Y = regression_x_y(y,x,y_predict_lag=1,y_include_lag=1,x_include_lag=0)

model = sm.OLS(Y,X)
result_3 = model.fit()

In [238]:
result_table = summary_col([result_1,result_2,result_3],
                             stars=True,
                             float_format="%0.3f",
                             info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                        'R2':lambda x: "{:.2f}".format(x.rsquared)},
                          regressor_order=["const","economy","intervention","reform","regulation","I000002_x ",
                                           "total_length","number_of_text","EPU","EPU_change"])

In [239]:
print(result_table.as_latex())

\begin{table}
\caption{}
\begin{center}
\begin{tabular}{lccc}
\hline
                 & I399106\_y I & I399106\_y II & I399106\_y III  \\
\midrule
const            & -0.000       & -0.001        & -0.002          \\
                 & (0.001)      & (0.001)       & (0.001)         \\
economy          & -0.001       & -0.001*       & -0.001*         \\
                 & (0.001)      & (0.001)       & (0.001)         \\
intervention     & -0.001       & -0.001        & -0.001          \\
                 & (0.001)      & (0.001)       & (0.001)         \\
reform           & 0.001***     & 0.001**       & 0.001**         \\
                 & (0.000)      & (0.000)       & (0.000)         \\
regulation       & -0.001       & -0.001        & -0.001          \\
                 & (0.001)      & (0.001)       & (0.001)         \\
total\_length    &              & 0.000         & 0.000           \\
                 &              & (0.000)       & (0.000)         \\
number\_of\_text &       

In [240]:
result_table

0,1,2,3
,I399106_y I,I399106_y II,I399106_y III
const,-0.000,-0.001,-0.002
,(0.001),(0.001),(0.001)
economy,-0.001,-0.001*,-0.001*
,(0.001),(0.001),(0.001)
intervention,-0.001,-0.001,-0.001
,(0.001),(0.001),(0.001)
reform,0.001***,0.001**,0.001**
,(0.000),(0.000),(0.000)
regulation,-0.001,-0.001,-0.001
