# Modeling
- NCCU PyDay 2022: https://www.facebook.com/groups/pythontw/posts/10162007038723438/
- Author: TENG-LIN YU
- Email: tlyu0419@gmail.com
- Github: https://github.com/TLYu0419/Stock-Prediction-Using-Facebook-Sentiment-with-Python

## 實驗設計
  - 1: 情感預測波動
  - 2: 情感預測股價 
  - 2: 過去的股價+情感 預測今天的股價
    - QA: 如何抓過去幾天的?
    - Stock Prediction Using Twitter Sentiment Analysis
  - 3: 預測股價還是預測波動

## Import packages

In [1]:
# !pip install statsmodels

In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import grangercausalitytests

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

from sklearn.model_selection import TimeSeriesSplit
import statsmodels.api as sm
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', 100)
# pd.set_option('display.max_rows', 300)

## Load Data

In [21]:
stock_price = pd.read_pickle('./stock_price_by_date.pickle')
stock_price['DATE'] = stock_price['DATE'].apply(lambda x: x.strftime('%Y-%m-%d'))
stock_price

Unnamed: 0,DATE,CLOSING_INDEX
0,2020-12-01,13885.67
1,2020-12-02,13989.14
2,2020-12-03,13977.09
3,2020-12-04,14132.44
4,2020-12-05,14194.52
...,...,...
503,2022-04-18,16898.87
504,2022-04-19,16993.40
505,2022-04-20,17148.88
506,2022-04-21,17127.95


In [22]:
sentiment = pd.read_pickle('./sentiment_by_date.pickle')
sentiment = sentiment.reset_index()
sentiment

Unnamed: 0,DATE,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD
0,2020-12-24,199,0,2,0,0,0,0
1,2020-12-28,1257,0,14,4,1,0,0
2,2020-12-29,3159,21,21,3,2,0,2
3,2020-12-30,9448,541,44,309,10,0,13
4,2020-12-31,14667,671,210,32,24,29,7
...,...,...,...,...,...,...,...,...
477,2022-04-18,120844,33551,721,2228,979,597,3694
478,2022-04-19,65164,4433,1197,1934,448,186,853
479,2022-04-20,75773,9032,464,1925,240,344,226
480,2022-04-21,31128,2935,504,1373,382,700,129


In [23]:
df = pd.merge(left=stock_price, right=sentiment, how='left', on='DATE')
df 

Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD
0,2020-12-01,13885.67,,,,,,,
1,2020-12-02,13989.14,,,,,,,
2,2020-12-03,13977.09,,,,,,,
3,2020-12-04,14132.44,,,,,,,
4,2020-12-05,14194.52,,,,,,,
...,...,...,...,...,...,...,...,...,...
503,2022-04-18,16898.87,120844.0,33551.0,721.0,2228.0,979.0,597.0,3694.0
504,2022-04-19,16993.40,65164.0,4433.0,1197.0,1934.0,448.0,186.0,853.0
505,2022-04-20,17148.88,75773.0,9032.0,464.0,1925.0,240.0,344.0,226.0
506,2022-04-21,17127.95,31128.0,2935.0,504.0,1373.0,382.0,700.0,129.0


In [24]:
# df = df.loc[df['DATE'].apply(lambda x: '2021-07-01' <= x <= '2022-04-20')]
df = df.loc[df['DATE'].apply(lambda x: '2021-01-01' <= x <= '2021-12-31')]
df
# # df = df.loc[df['DATE'].apply(lambda x: '2021-10-01' <= x <= '2022-04-17')]
# print(df['DATE'].min())
# print(df['DATE'].max())

Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD
31,2021-01-01,14817.2800,17495.0,175.0,112.0,125.0,9.0,24.0,5.0
32,2021-01-02,14859.6550,8691.0,291.0,73.0,79.0,14.0,7.0,13.0
33,2021-01-03,14880.8425,4966.0,142.0,36.0,103.0,6.0,2.0,12.0
34,2021-01-04,14902.0300,26926.0,2680.0,176.0,278.0,44.0,9.0,41.0
35,2021-01-05,15000.0300,9681.0,617.0,37.0,132.0,6.0,11.0,4.0
...,...,...,...,...,...,...,...,...,...
391,2021-12-27,18048.9400,14183.0,523.0,82.0,228.0,41.0,157.0,77.0
392,2021-12-28,18196.8100,37211.0,1896.0,172.0,220.0,547.0,483.0,111.0
393,2021-12-29,18248.2800,14600.0,1255.0,65.0,192.0,18.0,11.0,60.0
394,2021-12-30,18218.8400,22604.0,2276.0,173.0,899.0,62.0,165.0,169.0


In [25]:
df = df.set_index('DATE')
std = StandardScaler()
df = pd.DataFrame(std.fit_transform(df), columns=df.columns, index=df.index)
df

Unnamed: 0_level_0,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-01,-2.829343,-0.021187,-0.606136,-0.103150,-0.616975,-0.648357,-0.288258,-0.479160
2021-01-02,-2.772029,-1.047360,-0.548314,-0.304623,-0.818799,-0.581878,-0.453689,-0.420468
2021-01-03,-2.743373,-1.481538,-0.622585,-0.495764,-0.713500,-0.688244,-0.502345,-0.427805
2021-01-04,-2.714716,1.078068,0.642510,0.227473,0.054309,-0.183007,-0.434226,-0.215048
2021-01-05,-2.582167,-0.931968,-0.385816,-0.490598,-0.586263,-0.688244,-0.414764,-0.486496
...,...,...,...,...,...,...,...,...
2021-12-27,1.541598,-0.407226,-0.432671,-0.258129,-0.165065,-0.222894,1.005996,0.049064
2021-12-28,1.741598,2.276864,0.251716,0.206809,-0.200165,6.504734,4.178378,0.298503
2021-12-29,1.811213,-0.358621,-0.067798,-0.345950,-0.323014,-0.528696,-0.414764,-0.075656
2021-12-30,1.771394,0.574306,0.441131,0.211975,2.778930,0.056316,1.083846,0.724016


In [26]:
# df['N1D_CLOSING_INDEX'] = df['CLOSING_INDEX'].shift(-1)
# df['DIFF'] = df['N1D_CLOSING_INDEX'] - df['CLOSING_INDEX']
# df['DIRECT'] = df['DIFF'].apply(lambda x: int(x>0))
# df['LIKE'] = np.log(df['LIKE'])
# df['HAHA'] = np.log(df['HAHA'])
# df['LOVE'] = np.log(df['LOVE'])
# df['WOW'] = np.log(df['WOW'])
# df['CARE'] = np.log(df['CARE'])
# df['ANGRY'] = np.log(df['ANGRY']+1)
# df['SAD'] = np.log(df['SAD'])
# df

# df = df.loc[df['DATE'].apply(lambda x: '2021-12-01' <= x <= '2022-04-17')]
# # df = df.loc[df['DATE'].apply(lambda x: '2021-10-01' <= x <= '2022-04-17')]
# print(df['DATE'].min())
# print(df['DATE'].max())

In [27]:
# std = StandardScaler()
# columns = ['CLOSING_INDEX', 'LIKE', 'HAHA', 'LOVE', 'WOW', 'CARE', 'ANGRY', 'SAD']
# df = pd.DataFrame(std.fit_transform(df[columns]), columns=columns, index=df['DATE'])
# df

## Granger causality
- The Granger Causality test is used to determine whether or not one time series is useful for forecasting another.
- https://www.statology.org/granger-causality-test-in-python/
- https://www.youtube.com/watch?v=4TkNZviNJC0

In [28]:
#perform Granger-Causality test
print('======================= LIKE =======================')
result = grangercausalitytests(df[['CLOSING_INDEX', 'LIKE']], maxlag=5)
print('======================= HAHA =======================')
result = grangercausalitytests(df[['CLOSING_INDEX', 'HAHA']], maxlag=5)
print('======================= LOVE =======================')
result = grangercausalitytests(df[['CLOSING_INDEX', 'LOVE']], maxlag=5)
print('======================= WOW========================')
result = grangercausalitytests(df[['CLOSING_INDEX', 'WOW']], maxlag=5)
print('======================= CARE =======================')
result = grangercausalitytests(df[['CLOSING_INDEX', 'CARE']], maxlag=5)
print('======================= ANGRY =======================')
result = grangercausalitytests(df[['CLOSING_INDEX', 'ANGRY']], maxlag=5)
print('======================= SAD =======================')
result = grangercausalitytests(df[['CLOSING_INDEX', 'SAD']], maxlag=5)


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.2507  , p=0.6169  , df_denom=361, df_num=1
ssr based chi2 test:   chi2=0.2528  , p=0.6151  , df=1
likelihood ratio test: chi2=0.2527  , p=0.6152  , df=1
parameter F test:         F=0.2507  , p=0.6169  , df_denom=361, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=0.3054  , p=0.7370  , df_denom=358, df_num=2
ssr based chi2 test:   chi2=0.6193  , p=0.7337  , df=2
likelihood ratio test: chi2=0.6188  , p=0.7339  , df=2
parameter F test:         F=0.3054  , p=0.7370  , df_denom=358, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=0.3308  , p=0.8031  , df_denom=355, df_num=3
ssr based chi2 test:   chi2=1.0119  , p=0.7984  , df=3
likelihood ratio test: chi2=1.0105  , p=0.7987  , df=3
parameter F test:         F=0.3308  , p=0.8031  , df_denom=355, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=0.8230  , p=0.5112  

## Set of Features
- Feature: 
  - 0.05
    - LIKE > 3D 
    - WOW > 3D
  - ELSE
    - HAHA > 2D
    - SAD > 2D
    - LOVE > 1D
    - WOW > 1D
    - ANGET > 1D
-  To confirm the inverse dependence of other mood dimensions on TWSE we investigated a total of 4 different possibilities.
   - Module 1: $ L3D\_LIKE $
   - Module 2: $ L3D\_WOW $ 
   - Module 3: $ L3D\_LIKE + L3D\_WOW $
   - Module 4: $ L3D\_LIKE + L3D\_WOW + HAHA + LOVE + CARE + ANGRY + SAD$

In [12]:
stock_price = pd.read_pickle('./stock_price_by_date.pickle')
stock_price['DATE'] = stock_price['DATE'].apply(lambda x: x.strftime('%Y-%m-%d'))
sentiment = pd.read_pickle('./sentiment_by_date.pickle')
df = pd.merge(left=stock_price, right=sentiment, how='left', on='DATE')
df = df.loc[df['DATE'].apply(lambda x: '2021-07-01' <= x <= '2022-04-20')]
df

Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD
212,2021-07-01,17713.9400,6508.0,141.0,91.0,53.0,12.0,40.0,9.0
213,2021-07-02,17710.1500,25305.0,3227.0,185.0,386.0,36.0,61.0,33.0
214,2021-07-03,17814.7400,11057.0,113.0,53.0,48.0,11.0,9.0,4.0
215,2021-07-04,17867.0350,15265.0,442.0,50.0,60.0,22.0,12.0,6.0
216,2021-07-05,17919.3300,20673.0,469.0,126.0,992.0,53.0,12.0,833.0
...,...,...,...,...,...,...,...,...,...
501,2022-04-16,16951.5250,82358.0,10830.0,3662.0,2802.0,355.0,175.0,260.0
502,2022-04-17,16925.1975,92631.0,14661.0,3095.0,996.0,484.0,601.0,311.0
503,2022-04-18,16898.8700,120844.0,33551.0,721.0,2228.0,979.0,597.0,3694.0
504,2022-04-19,16993.4000,65164.0,4433.0,1197.0,1934.0,448.0,186.0,853.0


In [None]:
stock_price = pd.read_pickle('./stock_price_by_date.pickle')
stock_price['DATE'] = stock_price['DATE'].apply(lambda x: x.strftime('%Y-%m-%d'))
sentiment = pd.read_pickle('./sentiment_by_date.pickle')
df = pd.merge(left=stock_price, right=sentiment, how='left', on='DATE')
df

Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD
0,2021-07-01,17713.9400,6509,141,91,53,12,40,9
1,2021-07-02,17710.1500,25305,3227,185,386,36,61,33
2,2021-07-03,17814.7400,11057,113,53,48,11,9,4
3,2021-07-04,17867.0350,15266,442,50,60,22,12,6
4,2021-07-05,17919.3300,20674,469,126,992,53,12,833
...,...,...,...,...,...,...,...,...,...
287,2022-04-14,17245.6500,80054,9636,989,1610,1210,619,253
288,2022-04-15,17004.1800,60520,8634,451,1869,222,343,259
289,2022-04-16,16951.5250,68885,8800,3096,2266,284,158,229
290,2022-04-17,16925.1975,61989,9211,1811,845,308,560,222


In [35]:
df['LIKE'].rolling(2).sum()

0           NaN
1       31814.0
2       36362.0
3       26323.0
4       35940.0
         ...   
287    141368.0
288    140574.0
289    129405.0
290    130874.0
291     80202.0
Name: LIKE, Length: 292, dtype: float64

In [36]:
df['L3D_LIKE'] = df['LIKE'].rolling(3).sum()
df['L3D_WOW'] = df['WOW'].rolling(3).sum()

df['N1D_CLOSING_INDEX'] = df['CLOSING_INDEX'].shift(-1)
df['DIFF'] = df['N1D_CLOSING_INDEX'] - df['CLOSING_INDEX']
df['DIRECT'] = df['DIFF'].apply(lambda x: int(x>0))
df['L3D_LIKE'] = np.log(df['L3D_LIKE'])
df['HAHA'] = np.log(df['HAHA'])
df['LOVE'] = np.log(df['LOVE'])
df['L3D_WOW'] = np.log(df['L3D_WOW'])
df['CARE'] = np.log(df['CARE'])
df['ANGRY'] = np.log(df['ANGRY']+1)
df['SAD'] = np.log(df['SAD'])

training = df.loc[df['DATE'].apply(lambda x: '2021-12-01' <= x <= '2022-02-28')]
testing = df.loc[df['DATE'].apply(lambda x: '2022-03-01' <= x <= '2022-04-17')]

df

Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD,L3D_LIKE,L3D_WOW,N1D_CLOSING_INDEX,DIFF,DIRECT
0,2021-07-01,17713.9400,6509,4.948760,4.510860,53,2.484907,3.713572,2.197225,,,17710.1500,-3.7900,0
1,2021-07-02,17710.1500,25305,8.079308,5.220356,386,3.583519,4.127134,3.496508,,,17814.7400,104.5900,1
2,2021-07-03,17814.7400,11057,4.727388,3.970292,48,2.397895,2.302585,1.386294,10.665951,6.188264,17867.0350,52.2950,1
3,2021-07-04,17867.0350,15266,6.091310,3.912023,60,3.091042,2.564949,1.791759,10.851819,6.202536,17919.3300,52.2950,1
4,2021-07-05,17919.3300,20674,6.150603,4.836282,992,3.970292,2.564949,6.725034,10.757839,7.003065,17913.0700,-6.2600,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,2022-04-14,17245.6500,80054,9.173261,6.896694,1610,7.098376,6.429719,5.533389,12.288279,8.320205,17004.1800,-241.4700,0
288,2022-04-15,17004.1800,60520,9.063463,6.111467,1869,5.402677,5.840642,5.556828,12.215468,8.518592,16951.5250,-52.6550,0
289,2022-04-16,16951.5250,68885,9.082507,8.037866,2266,5.648974,5.068904,5.433722,12.252283,8.656085,16925.1975,-26.3275,0
290,2022-04-17,16925.1975,61989,9.128154,7.501634,845,5.730100,6.329721,5.402677,12.162089,8.513185,16898.8700,-26.3275,0


In [37]:
module1 = ['L3D_LIKE']
module2 = ['L3D_WOW']
module3 = ['L3D_LIKE', 'L3D_WOW']
module4 = ['L3D_LIKE', 'L3D_WOW', 'HAHA', 'LOVE', 'CARE', 'ANGRY', 'SAD']
target = ['DIFF']

## 模型
- Model
  - Regression
    - LR
    - SVR
    - MAP
  - Classfication
    - Logistic Regression
    - SVM
    - MAP
    - SOFNN
- KSVC
- [Normalization vs Standardization in Linear Regression](https://www.baeldung.com/cs/normalization-vs-standardization)

In [38]:
regressor = LinearRegression()
regressor.fit(training[module1], training[target])
print(regressor.score(training[module1], training[target]))
print(regressor.score(testing[module1], testing[target]))

0.025613870700517127
-0.018584499614078265


In [39]:
regressor = LinearRegression()
regressor.fit(training[module2], training[target])
print(regressor.score(training[module2], training[target]))
print(regressor.score(testing[module2], testing[target]))

0.00785544992759768
0.007138808945051478


In [40]:
regressor = LinearRegression()
regressor.fit(training[module3], training[target])
print(regressor.score(training[module3], training[target]))
print(regressor.score(testing[module3], testing[target]))

0.028728866042186718
-0.0363887264410363


In [41]:
regressor = LinearRegression()
regressor.fit(training[module4], training[target])
print(regressor.score(training[module4], training[target]))
print(regressor.score(testing[module4], testing[target]))

0.06747745167530583
-0.093150218678405


In [43]:
# statmodels
training2 = training[module2]
training2 = sm.add_constant(training2, prepend=False)

# Fit and summarize OLS model
mod = sm.OLS(training[target], training2)
res = mod.fit()

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   DIFF   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                    0.6968
Date:                Tue, 19 Apr 2022   Prob (F-statistic):              0.406
Time:                        23:18:18   Log-Likelihood:                -549.20
No. Observations:                  90   AIC:                             1102.
Df Residuals:                      88   BIC:                             1107.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
L3D_WOW      -15.2399     18.258     -0.835      0.4

### MOVEMENT2
- log

In [45]:
# stock_price = pd.read_pickle('./stock_price_by_date.pickle')
# stock_price['DATE'] = stock_price['DATE'].apply(lambda x: x.strftime('%Y-%m-%d'))
# sentiment = pd.read_pickle('./sentiment_by_date.pickle')
# df = pd.merge(left=stock_price, right=sentiment, how='left', on='DATE')
# df

In [46]:
# df = df.loc[df['DATE'].apply(lambda x: '2021-12-01' <= x <= '2022-04-18')]

# df['LIKE'] = np.log(df['LIKE'])
# df['HAHA'] = np.log(df['HAHA'])
# df['LOVE'] = np.log(df['LOVE'])
# df['WOW'] = np.log(df['WOW'])
# df['CARE'] = np.log(df['CARE'])
# df['ANGRY'] = np.log(df['ANGRY']+1)
# df['SAD'] = np.log(df['SAD'])
# df

In [47]:
# df['N1D_CLOSING_INDEX'] = df['CLOSING_INDEX'].shift(-1)

# df['MOVEMENT'] = df['N1D_CLOSING_INDEX'] - df['CLOSING_INDEX'] 

# std = StandardScaler()
# df['CLOSING_INDEX'] = std.fit_transform(df[['CLOSING_INDEX']])


# training = df.loc[df['DATE'].apply(lambda x: '2021-12-01' <= x <= '2022-02-28')]
# testing = df.loc[df['DATE'].apply(lambda x: '2022-03-01' <= x <= '2022-04-17')]
# df

In [48]:
# module1 = ['CLOSING_INDEX']
# module2 = ['CLOSING_INDEX', 'LIKE']
# module3 = ['CLOSING_INDEX', 'CARE']
# module4 = ['CLOSING_INDEX', 'LIKE', 'CARE']
# module5 = ['CLOSING_INDEX', 'LIKE', 'CARE', 'HAHA', 'SAD']
# module6 = ['CLOSING_INDEX', 'LIKE', 'CARE', 'HAHA', 'SAD', 'LOVE', 'WOW', 'ANGRY']
# target = ['MOVEMENT']

In [157]:
# regressor = LinearRegression()
# regressor.fit(training[module1], training[target])
# print(regressor.score(training[module1], training[target]))
# print(regressor.score(testing[module1], testing[target]))

0.06191251813896914
-0.22826208289002614


In [158]:
regressor = LinearRegression()
regressor.fit(training[module2], training[target])
print(regressor.score(training[module2], training[target]))
print(regressor.score(testing[module2], testing[target]))

0.08168283744024618
-0.09099626669944638


In [159]:
regressor = LinearRegression()
regressor.fit(training[module3], training[target])
print(regressor.score(training[module3], training[target]))
print(regressor.score(testing[module3], testing[target]))

0.06730202968278653
-0.16471371009719915


In [160]:
regressor = LinearRegression()
regressor.fit(training[module4], training[target])
print(regressor.score(training[module4], training[target]))
print(regressor.score(testing[module4], testing[target]))

0.08411779753685056
-0.0982771618269993


In [161]:
regressor = LinearRegression()
regressor.fit(training[module5], training[target])
print(regressor.score(training[module5], training[target]))
print(regressor.score(testing[module5], testing[target]))

0.08681091957530562
-0.07651330872749029


In [162]:
regressor = LinearRegression()
regressor.fit(training[module6], training[target])
print(regressor.score(training[module6], training[target]))
print(regressor.score(testing[module6], testing[target]))

0.10317449594670913
-0.1377301947593026


### MOVEMENT3
- percent

In [44]:
# stock_price = pd.read_pickle('./stock_price_by_date.pickle')
# stock_price['DATE'] = stock_price['DATE'].apply(lambda x: x.strftime('%Y-%m-%d'))
# sentiment = pd.read_pickle('./sentiment_by_date.pickle')
# df = pd.merge(left=stock_price, right=sentiment, how='left', on='DATE')
# df

### MOVEMENT_DIRECT

In [172]:
stock_price = pd.read_pickle('./stock_price_by_date.pickle')
stock_price['DATE'] = stock_price['DATE'].apply(lambda x: x.strftime('%Y-%m-%d'))
sentiment = pd.read_pickle('./sentiment_by_date.pickle')
df = pd.merge(left=stock_price, right=sentiment, how='left', on='DATE')
df

Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD
0,2021-07-01,17713.9400,6509,141,91,53,12,40,9
1,2021-07-02,17710.1500,25305,3227,185,386,36,61,33
2,2021-07-03,17814.7400,11057,113,53,48,11,9,4
3,2021-07-04,17867.0350,15266,442,50,60,22,12,6
4,2021-07-05,17919.3300,20674,469,126,992,53,12,833
...,...,...,...,...,...,...,...,...,...
287,2022-04-14,17245.6500,80054,9636,989,1610,1210,619,253
288,2022-04-15,17004.1800,60520,8634,451,1869,222,343,259
289,2022-04-16,16951.5250,68885,8800,3096,2266,284,158,229
290,2022-04-17,16925.1975,61989,9211,1811,845,308,560,222


In [173]:
df['N1D_CLOSING_INDEX'] = df['CLOSING_INDEX'].shift(-1)
df['DIRECTED'] = df['N1D_CLOSING_INDEX'] - df['CLOSING_INDEX']
df['DIRECTED2'] = df['DIRECTED'].apply(lambda x: int(x>0))

df.loc[:,['LIKE']] = np.log(df['LIKE'])
df.loc[:,['HAHA']] = np.log(df['HAHA'])
df.loc[:,['LOVE']] = np.log(df['LOVE'])
df.loc[:,['WOW']] = np.log(df['WOW'])
df.loc[:,['CARE']] = np.log(df['CARE'])
df.loc[:,['ANGRY']] = np.log(df['ANGRY']+1)
df.loc[:,['SAD']] = np.log(df['SAD'])

training = df.loc[df['DATE'].apply(lambda x: '2021-12-01' <= x <= '2022-02-31')]
testing = df.loc[df['DATE'].apply(lambda x: '2022-03-01' <= x <= '2022-04-17')]
df

Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD,N1D_CLOSING_INDEX,DIRECTED,DIRECTED2
0,2021-07-01,17713.9400,8.780941,4.948760,4.510860,3.970292,2.484907,3.713572,2.197225,17710.1500,-3.7900,0
1,2021-07-02,17710.1500,10.138757,8.079308,5.220356,5.955837,3.583519,4.127134,3.496508,17814.7400,104.5900,1
2,2021-07-03,17814.7400,9.310819,4.727388,3.970292,3.871201,2.397895,2.302585,1.386294,17867.0350,52.2950,1
3,2021-07-04,17867.0350,9.633383,6.091310,3.912023,4.094345,3.091042,2.564949,1.791759,17919.3300,52.2950,1
4,2021-07-05,17919.3300,9.936632,6.150603,4.836282,6.899723,3.970292,2.564949,6.725034,17913.0700,-6.2600,0
...,...,...,...,...,...,...,...,...,...,...,...,...
287,2022-04-14,17245.6500,11.290457,9.173261,6.896694,7.383989,7.098376,6.429719,5.533389,17004.1800,-241.4700,0
288,2022-04-15,17004.1800,11.010729,9.063463,6.111467,7.533159,5.402677,5.840642,5.556828,16951.5250,-52.6550,0
289,2022-04-16,16951.5250,11.140194,9.082507,8.037866,7.725771,5.648974,5.068904,5.433722,16925.1975,-26.3275,0
290,2022-04-17,16925.1975,11.034712,9.128154,7.501634,6.739337,5.730100,6.329721,5.402677,16898.8700,-26.3275,0


In [49]:
df

Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD,L3D_LIKE,L3D_WOW,N1D_CLOSING_INDEX,DIFF,DIRECT
0,2021-07-01,17713.9400,6509,4.948760,4.510860,53,2.484907,3.713572,2.197225,,,17710.1500,-3.7900,0
1,2021-07-02,17710.1500,25305,8.079308,5.220356,386,3.583519,4.127134,3.496508,,,17814.7400,104.5900,1
2,2021-07-03,17814.7400,11057,4.727388,3.970292,48,2.397895,2.302585,1.386294,10.665951,6.188264,17867.0350,52.2950,1
3,2021-07-04,17867.0350,15266,6.091310,3.912023,60,3.091042,2.564949,1.791759,10.851819,6.202536,17919.3300,52.2950,1
4,2021-07-05,17919.3300,20674,6.150603,4.836282,992,3.970292,2.564949,6.725034,10.757839,7.003065,17913.0700,-6.2600,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287,2022-04-14,17245.6500,80054,9.173261,6.896694,1610,7.098376,6.429719,5.533389,12.288279,8.320205,17004.1800,-241.4700,0
288,2022-04-15,17004.1800,60520,9.063463,6.111467,1869,5.402677,5.840642,5.556828,12.215468,8.518592,16951.5250,-52.6550,0
289,2022-04-16,16951.5250,68885,9.082507,8.037866,2266,5.648974,5.068904,5.433722,12.252283,8.656085,16925.1975,-26.3275,0
290,2022-04-17,16925.1975,61989,9.128154,7.501634,845,5.730100,6.329721,5.402677,12.162089,8.513185,16898.8700,-26.3275,0


In [50]:
module1 = ['L3D_LIKE']
module2 = ['L3D_WOW']
module3 = ['L3D_LIKE', 'L3D_WOW']
module4 = ['L3D_LIKE', 'L3D_WOW', 'HAHA', 'LOVE', 'CARE', 'ANGRY', 'SAD']
target = ['DIRECT']

In [52]:
clf = LogisticRegression()
clf.fit(training[module1], training[target])
print(clf.score(training[module1], training[target]))
print(clf.score(testing[module1], testing[target]))
testing['PRED'] = clf.predict(testing[module1])
testing

0.6222222222222222
0.2916666666666667


  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['PRED'] = clf.predict(testing[module1])


Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD,L3D_LIKE,L3D_WOW,N1D_CLOSING_INDEX,DIFF,DIRECT,PRED
243,2022-03-01,17898.25,60371,7.631432,7.153834,1288,6.006353,5.777652,7.909489,12.514366,7.978996,17867.6,-30.65,0,1
244,2022-03-02,17867.6,58454,8.33663,5.814131,1626,6.398595,5.429346,5.31812,12.438703,8.264878,17934.4,66.8,1,1
245,2022-03-03,17934.4,56972,9.283219,5.978886,7078,6.200509,6.267201,7.779049,12.077085,9.20954,17736.52,-197.88,0,1
246,2022-03-04,17736.52,74795,8.847647,6.639876,1744,7.303843,5.869297,6.070738,12.155942,9.254166,17457.605,-278.915,0,1
247,2022-03-05,17457.605,36434,9.231123,5.616771,398,4.691348,4.60517,4.787492,12.032915,9.12913,17318.1475,-139.4575,0,1
248,2022-03-06,17318.1475,62376,9.204624,6.942157,1469,5.521461,5.420535,5.710427,12.064538,8.19174,17178.69,-139.4575,0,1
249,2022-03-07,17178.69,86153,9.125218,7.924796,928,7.367709,5.723585,7.021976,12.127911,7.935587,16825.25,-353.44,0,1
250,2022-03-08,16825.25,78121,9.298992,6.883463,828,5.703782,5.043425,5.192957,12.331162,8.078688,17015.36,190.11,1,1
251,2022-03-09,17015.36,42180,8.24722,5.934894,693,6.759255,4.804021,5.537334,12.237833,7.803435,17433.2,417.84,1,1
252,2022-03-10,17433.2,60286,9.208138,6.2106,1199,7.077498,5.068904,5.968708,12.103968,7.908387,17264.74,-168.46,0,1


In [53]:
clf = LogisticRegression()
clf.fit(training[module2], training[target])
print(clf.score(training[module2], training[target]))
print(clf.score(testing[module2], testing[target]))
testing['PRED'] = clf.predict(testing[module2])
testing

0.6222222222222222
0.2916666666666667


  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['PRED'] = clf.predict(testing[module2])


Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD,L3D_LIKE,L3D_WOW,N1D_CLOSING_INDEX,DIFF,DIRECT,PRED
243,2022-03-01,17898.25,60371,7.631432,7.153834,1288,6.006353,5.777652,7.909489,12.514366,7.978996,17867.6,-30.65,0,1
244,2022-03-02,17867.6,58454,8.33663,5.814131,1626,6.398595,5.429346,5.31812,12.438703,8.264878,17934.4,66.8,1,1
245,2022-03-03,17934.4,56972,9.283219,5.978886,7078,6.200509,6.267201,7.779049,12.077085,9.20954,17736.52,-197.88,0,1
246,2022-03-04,17736.52,74795,8.847647,6.639876,1744,7.303843,5.869297,6.070738,12.155942,9.254166,17457.605,-278.915,0,1
247,2022-03-05,17457.605,36434,9.231123,5.616771,398,4.691348,4.60517,4.787492,12.032915,9.12913,17318.1475,-139.4575,0,1
248,2022-03-06,17318.1475,62376,9.204624,6.942157,1469,5.521461,5.420535,5.710427,12.064538,8.19174,17178.69,-139.4575,0,1
249,2022-03-07,17178.69,86153,9.125218,7.924796,928,7.367709,5.723585,7.021976,12.127911,7.935587,16825.25,-353.44,0,1
250,2022-03-08,16825.25,78121,9.298992,6.883463,828,5.703782,5.043425,5.192957,12.331162,8.078688,17015.36,190.11,1,1
251,2022-03-09,17015.36,42180,8.24722,5.934894,693,6.759255,4.804021,5.537334,12.237833,7.803435,17433.2,417.84,1,1
252,2022-03-10,17433.2,60286,9.208138,6.2106,1199,7.077498,5.068904,5.968708,12.103968,7.908387,17264.74,-168.46,0,1


In [56]:
clf = LogisticRegression()
clf.fit(training[module3], training[target])
print(clf.score(training[module3], training[target]))
print(clf.score(testing[module3], testing[target]))
testing['PRED'] = clf.predict(testing[module3])
testing

0.6222222222222222
0.2916666666666667


  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['PRED'] = clf.predict(testing[module3])


Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD,L3D_LIKE,L3D_WOW,N1D_CLOSING_INDEX,DIFF,DIRECT,PRED
243,2022-03-01,17898.25,60371,7.631432,7.153834,1288,6.006353,5.777652,7.909489,12.514366,7.978996,17867.6,-30.65,0,1
244,2022-03-02,17867.6,58454,8.33663,5.814131,1626,6.398595,5.429346,5.31812,12.438703,8.264878,17934.4,66.8,1,1
245,2022-03-03,17934.4,56972,9.283219,5.978886,7078,6.200509,6.267201,7.779049,12.077085,9.20954,17736.52,-197.88,0,1
246,2022-03-04,17736.52,74795,8.847647,6.639876,1744,7.303843,5.869297,6.070738,12.155942,9.254166,17457.605,-278.915,0,1
247,2022-03-05,17457.605,36434,9.231123,5.616771,398,4.691348,4.60517,4.787492,12.032915,9.12913,17318.1475,-139.4575,0,1
248,2022-03-06,17318.1475,62376,9.204624,6.942157,1469,5.521461,5.420535,5.710427,12.064538,8.19174,17178.69,-139.4575,0,1
249,2022-03-07,17178.69,86153,9.125218,7.924796,928,7.367709,5.723585,7.021976,12.127911,7.935587,16825.25,-353.44,0,1
250,2022-03-08,16825.25,78121,9.298992,6.883463,828,5.703782,5.043425,5.192957,12.331162,8.078688,17015.36,190.11,1,1
251,2022-03-09,17015.36,42180,8.24722,5.934894,693,6.759255,4.804021,5.537334,12.237833,7.803435,17433.2,417.84,1,1
252,2022-03-10,17433.2,60286,9.208138,6.2106,1199,7.077498,5.068904,5.968708,12.103968,7.908387,17264.74,-168.46,0,1


In [57]:
clf = LogisticRegression()
clf.fit(training[module4], training[target])
print(clf.score(training[module4], training[target]))
print(clf.score(testing[module4], testing[target]))
testing['PRED'] = clf.predict(testing[module4])
testing

0.6333333333333333
0.5208333333333334


  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['PRED'] = clf.predict(testing[module4])


Unnamed: 0,DATE,CLOSING_INDEX,LIKE,HAHA,LOVE,WOW,CARE,ANGRY,SAD,L3D_LIKE,L3D_WOW,N1D_CLOSING_INDEX,DIFF,DIRECT,PRED
243,2022-03-01,17898.25,60371,7.631432,7.153834,1288,6.006353,5.777652,7.909489,12.514366,7.978996,17867.6,-30.65,0,1
244,2022-03-02,17867.6,58454,8.33663,5.814131,1626,6.398595,5.429346,5.31812,12.438703,8.264878,17934.4,66.8,1,1
245,2022-03-03,17934.4,56972,9.283219,5.978886,7078,6.200509,6.267201,7.779049,12.077085,9.20954,17736.52,-197.88,0,1
246,2022-03-04,17736.52,74795,8.847647,6.639876,1744,7.303843,5.869297,6.070738,12.155942,9.254166,17457.605,-278.915,0,1
247,2022-03-05,17457.605,36434,9.231123,5.616771,398,4.691348,4.60517,4.787492,12.032915,9.12913,17318.1475,-139.4575,0,0
248,2022-03-06,17318.1475,62376,9.204624,6.942157,1469,5.521461,5.420535,5.710427,12.064538,8.19174,17178.69,-139.4575,0,0
249,2022-03-07,17178.69,86153,9.125218,7.924796,928,7.367709,5.723585,7.021976,12.127911,7.935587,16825.25,-353.44,0,1
250,2022-03-08,16825.25,78121,9.298992,6.883463,828,5.703782,5.043425,5.192957,12.331162,8.078688,17015.36,190.11,1,1
251,2022-03-09,17015.36,42180,8.24722,5.934894,693,6.759255,4.804021,5.537334,12.237833,7.803435,17433.2,417.84,1,1
252,2022-03-10,17433.2,60286,9.208138,6.2106,1199,7.077498,5.068904,5.968708,12.103968,7.908387,17264.74,-168.46,0,1


In [59]:
# statmodels
training2 = training[module4]
training2 = sm.add_constant(training2, prepend=False)

# Fit and summarize OLS model
mod = sm.Logit(training[target], training2)
res = mod.fit()

print(res.summary())

Optimization terminated successfully.
         Current function value: 0.621364
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 DIRECT   No. Observations:                   90
Model:                          Logit   Df Residuals:                       82
Method:                           MLE   Df Model:                            7
Date:                Tue, 19 Apr 2022   Pseudo R-squ.:                 0.06275
Time:                        23:24:21   Log-Likelihood:                -55.923
converged:                       True   LL-Null:                       -59.667
Covariance Type:            nonrobust   LLR p-value:                    0.3799
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
L3D_LIKE      -0.3990      1.037     -0.385      0.701      -2.432       1.634
L3D_WOW        0.2121      0.

In [61]:
clf = LogisticRegression()
clf.fit(training[module4], training[target])
print(clf.score(training[module4], training[target]))
print(clf.score(testing[module4], testing[target]))
testing['PRED'] = clf.predict(testing[module4])
# testing

0.6333333333333333
0.5208333333333334


  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testing['PRED'] = clf.predict(testing[module4])


In [63]:
testing.to_pickle('pred.pkl')

## Visualization

In [None]:
regressor = LinearRegression()
regressor.fit(training[module2], training[target])
print(regressor.score(training[module2], training[target]))
print(regressor.score(testing[module2], testing[target]))

In [79]:
stock_pred = training[module2]
stock_pred['PRED'] = regressor.predict(training[module2])
stock_pred['N1D_CLOSING_INDEX'] =  training[target]
stock_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_pred['PRED'] = regressor.predict(training[module2])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_pred['N1D_CLOSING_INDEX'] =  training[target]


Unnamed: 0,CLOSING_INDEX,L4D_LIKE,PRED,N1D_CLOSING_INDEX
153,0.155651,-0.490039,0.278185,0.439337
154,0.439337,-0.574272,0.533599,0.382678
155,0.382678,-0.589458,0.483677,0.373558
156,0.373558,-0.712353,0.479743,0.368998
157,0.368998,-0.721443,0.475994,0.364438
...,...,...,...,...
238,0.173135,3.590294,0.154780,0.290846
239,0.290846,4.221204,0.238082,0.542148
240,0.542148,3.488555,0.486751,0.667799
241,0.667799,2.242522,0.641046,0.730624


In [83]:
stock_pred = testing[['DATE']+module2]
stock_pred['PRED'] = regressor.predict(testing[module2])
stock_pred['N1D_CLOSING_INDEX'] =  testing[target]
stock_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_pred['PRED'] = regressor.predict(testing[module2])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_pred['N1D_CLOSING_INDEX'] =  testing[target]


Unnamed: 0,DATE,CLOSING_INDEX,L4D_LIKE,PRED,N1D_CLOSING_INDEX
243,2022-03-01,0.79345,1.936617,0.763323,0.730847
244,2022-03-02,0.730847,2.185679,0.699109,0.867287
245,2022-03-03,0.867287,1.957675,0.828338,0.463112
246,2022-03-04,0.463112,1.329215,0.489935,-0.106579
247,2022-03-05,-0.106579,1.073213,-0.0085,-0.391424
248,2022-03-06,-0.391424,1.115158,-0.263506,-0.67627
249,2022-03-07,-0.67627,1.427244,-0.527712,-1.39818
250,2022-03-08,-1.39818,1.462815,-1.171587,-1.009875
251,2022-03-09,-1.009875,1.524268,-0.828001,-0.156426
252,2022-03-10,-0.156426,1.501915,-0.067477,-0.50051


In [85]:
stock_pred['DIFF_PRED'] = stock_pred['PRED'] - stock_pred['CLOSING_INDEX']
stock_pred['DIFF_PRED2'] = stock_pred['DIFF_PRED'].apply(lambda x: int(x>0))
stock_pred['DIFF_ACTU'] = stock_pred['N1D_CLOSING_INDEX'] - stock_pred['CLOSING_INDEX']
stock_pred['DIFF_ACTU2'] = stock_pred['DIFF_ACTU'].apply(lambda x: int(x>0))
stock_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_pred['DIFF_PRED'] = stock_pred['PRED'] - stock_pred['CLOSING_INDEX']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_pred['DIFF_PRED2'] = stock_pred['DIFF_PRED'].apply(lambda x: int(x>0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stock_pred['DIFF_ACTU'] = stock_pred['N1D_CLOSING_IND

Unnamed: 0,DATE,CLOSING_INDEX,L4D_LIKE,PRED,N1D_CLOSING_INDEX,DIFF_PRED,DIFF_PRED2,DIFF_ACTU,DIFF_ACTU2
243,2022-03-01,0.79345,1.936617,0.763323,0.730847,-0.030127,0,-0.062603,0
244,2022-03-02,0.730847,2.185679,0.699109,0.867287,-0.031737,0,0.136441,1
245,2022-03-03,0.867287,1.957675,0.828338,0.463112,-0.03895,0,-0.404175,0
246,2022-03-04,0.463112,1.329215,0.489935,-0.106579,0.026822,1,-0.569691,0
247,2022-03-05,-0.106579,1.073213,-0.0085,-0.391424,0.098078,1,-0.284845,0
248,2022-03-06,-0.391424,1.115158,-0.263506,-0.67627,0.127918,1,-0.284845,0
249,2022-03-07,-0.67627,1.427244,-0.527712,-1.39818,0.148557,1,-0.72191,0
250,2022-03-08,-1.39818,1.462815,-1.171587,-1.009875,0.226592,1,0.388304,1
251,2022-03-09,-1.009875,1.524268,-0.828001,-0.156426,0.181874,1,0.853449,1
252,2022-03-10,-0.156426,1.501915,-0.067477,-0.50051,0.088949,1,-0.344084,0


In [87]:
stock_pred.groupby(['DIFF_PRED2', 'DIFF_ACTU2']).size().reset_index()

Unnamed: 0,DIFF_PRED2,DIFF_ACTU2,0
0,0,0,2
1,0,1,1
2,1,0,17
3,1,1,11


In [None]:
stock_pred['']