In [22]:
import pandas as pd
import plotly.express as px
import numpy as np
import statsmodels.formula.api as sm

In [4]:
# Read final_data.csv, using the first CSV column as the row index.
final_data = pd.read_csv(
    'final_data.csv',
    index_col=0,
)

# Show the first rows as a quick look at the loaded columns.
final_data.head()

Unnamed: 0,Date,symbols,Categories,count_post,Percent_price_change,abs
0,2018-07-06,A,Other,0,1.332474,1.332474
1,2018-07-06,AAL,Other,0,3.202944,3.202944
2,2018-07-06,AAOI,Other,0,-1.499345,1.499345
3,2018-07-06,AAP,Other,0,2.755704,2.755704
4,2018-07-06,AAPL,Other,0,1.38852,1.38852


## Test Hypothesis 1

$H_0$: The number of posts does not affect the percentage change in stock price<br>
$H_1$: The number of posts affects the percentage change in stock price


From the result below, the p-value of count_post is 0.000. Thus, we can conclude that the number of posts affects the percentage change in stock price. A one-unit increase in the number of posts results in a -0.0090% change in the stock price on the next business day.

In [26]:
# Collapse final_data to one row per (Date, symbols, Percent_price_change, abs)
# combination, summing count_post across the rows that share those keys.
# The aggregation is given as the string 'sum' instead of np.sum: pandas maps
# np.sum to its own groupby sum anyway, and recent versions emit a FutureWarning
# when the NumPy callable is passed.
data_test1 = final_data.pivot_table(
    values='count_post',
    index=['Date', 'symbols', 'Percent_price_change', 'abs'],
    aggfunc='sum',
).reset_index()  # turn the grouping keys back into ordinary columns

data_test1.head()

Unnamed: 0,Date,symbols,Percent_price_change,abs,count_post
0,2018-07-06,A,1.332474,1.332474,0
1,2018-07-06,AAL,3.202944,3.202944,0
2,2018-07-06,AAOI,-1.499345,1.499345,0
3,2018-07-06,AAP,2.755704,2.755704,0
4,2018-07-06,AAPL,1.38852,1.38852,0


In [28]:
# Hypothesis 1: regress Percent_price_change on count_post (ordinary least squares).
result_1 = sm.ols(
    formula='Percent_price_change ~ count_post',
    data=data_test1,
).fit()

# Print the plain-text regression summary table.
print(result_1.summary())

                             OLS Regression Results                             
Dep. Variable:     Percent_price_change   R-squared:                       0.006
Model:                              OLS   Adj. R-squared:                  0.005
Method:                   Least Squares   F-statistic:                     22.17
Date:                  Wed, 01 Dec 2021   Prob (F-statistic):           2.59e-06
Time:                          21:20:59   Log-Likelihood:                -7290.2
No. Observations:                  3950   AIC:                         1.458e+04
Df Residuals:                      3948   BIC:                         1.460e+04
Df Model:                             1                                         
Covariance Type:              nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1052      0.027

## Test Hypothesis 2

$H_0$: The source category is not related to the percentage change in stock price<br>
$H_1$: The source category is related to the percentage change in stock price

From the result below, the only category that affects the percentage change in stock price is Organization, as its p-value is 0.01.

In [37]:
# Create 0/1 dummy variables for the source categories.
# Only three indicator columns are built; rows with any other Categories value
# (e.g. 'Other') get 0 in all three and therefore act as the baseline.

# Work on an explicit copy. `final_data.iloc[:, :]` is only a slice of the frame,
# so adding columns to it can raise SettingWithCopyWarning and makes it unclear
# whether `final_data` is affected; a copy keeps this cell safe to re-run.
data_test2 = final_data.copy()

# Vectorised comparison -> boolean Series -> 0/1 integers.
data_test2['Individual'] = (final_data['Categories'] == 'Individual').astype(int)
data_test2['News_Agency'] = (final_data['Categories'] == 'News Agency').astype(int)
data_test2['Organization'] = (final_data['Categories'] == 'Organization').astype(int)

data_test2.head()

Unnamed: 0,Date,symbols,Categories,count_post,Percent_price_change,abs,Individual,News Agency,Organization,News_Agency
0,2018-07-06,A,Other,0,1.332474,1.332474,0,0,0,0
1,2018-07-06,AAL,Other,0,3.202944,3.202944,0,0,0,0
2,2018-07-06,AAOI,Other,0,-1.499345,1.499345,0,0,0,0
3,2018-07-06,AAP,Other,0,2.755704,2.755704,0,0,0,0
4,2018-07-06,AAPL,Other,0,1.38852,1.38852,0,0,0,0


In [39]:
# Hypothesis 2: add the three source-category dummies to the count_post regression.
result_2 = sm.ols(
    formula='Percent_price_change ~ count_post + Individual + News_Agency + Organization',
    data=data_test2,
).fit()

# Print the plain-text regression summary table.
print(result_2.summary())

                             OLS Regression Results                             
Dep. Variable:     Percent_price_change   R-squared:                       0.006
Model:                              OLS   Adj. R-squared:                  0.005
Method:                   Least Squares   F-statistic:                     8.897
Date:                  Wed, 01 Dec 2021   Prob (F-statistic):           3.69e-07
Time:                          21:33:03   Log-Likelihood:                -10660.
No. Observations:                  5875   AIC:                         2.133e+04
Df Residuals:                      5870   BIC:                         2.136e+04
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.0907     

## Test Hypothesis 3

$H_0$: The number of posts from different source categories is not related to the percentage change in stock price<br>
$H_1$: The number of posts from different source categories is related to the percentage change in stock price

From the result below, at alpha = 0.05, all of the variables have p-values greater than 0.05. Thus, we cannot reject $H_0$ that the number of posts from different source categories is not related to the percentage change in stock price.

If we increase alpha to 0.1, two variables are statistically significant: count_post:News_Agency and Organization. We can interpret these two variables as follows:
- Controlling for other variables, if the number of news agency posts about the stock increases by one, the closing stock price will change by -0.0255% on the next business day.
- Controlling for other variables, if an organization mentions the stock in its posts at least once, the closing stock price will change by -0.1853% on the next business day.

In [40]:
# Hypothesis 3: let the effect of count_post differ by source category.
# In the formula syntax `a*b` stands for `a + b + a:b`, so every dummy enters
# both as a main effect and as an interaction with count_post.
# NOTE(review): the formula expands to 7 regressors, yet the recorded summary
# reports "Df Model: 6" -- possibly collinearity among the terms; confirm.
result_3 = sm.ols(
    formula='Percent_price_change ~ count_post + count_post*Individual + count_post*News_Agency + count_post*Organization',
    data=data_test2,
).fit()

# Print the plain-text regression summary table.
print(result_3.summary())

                             OLS Regression Results                             
Dep. Variable:     Percent_price_change   R-squared:                       0.007
Model:                              OLS   Adj. R-squared:                  0.006
Method:                   Least Squares   F-statistic:                     7.119
Date:                  Wed, 01 Dec 2021   Prob (F-statistic):           1.42e-07
Time:                          21:34:07   Log-Likelihood:                -10656.
No. Observations:                  5875   AIC:                         2.133e+04
Df Residuals:                      5868   BIC:                         2.137e+04
Df Model:                             6                                         
Covariance Type:              nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Interc