# Import the necessary packages 

In [81]:
import pandas as pd
import numpy as np
#import pandas_datareader.data as web
from datetime import date, datetime
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols 

# Load data sets & process data

In [82]:
# Load the 30-minute S&P 500 bars.
# Forward slashes are used on purpose: the previous '..\data\$spx...' literal
# contained the invalid escape sequences '\d' and '\$' and only resolved on
# Windows. Forward slashes work on every platform and match the other paths
# used in this notebook.
spx = pd.read_csv('../data/$spx_intraday-30min.csv')

# Split the "Time" column ("<date> <time>") into separate Date and Rounded_Time
# columns. The date text is kept as-is (no reformatting) so it can be compared
# directly with the Date column of the tweet data.
spx['Date'] = spx['Time'].str.split(' ').str[0]
spx['Rounded_Time'] = spx['Time'].str.split(' ').str[1]

# Log return over each 30-minute bar: log(Last) - log(Open)
spx['Return'] = np.log(spx['Last']) - np.log(spx['Open'])

# Discard every row that has a missing value in any column
spx = spx.dropna()

# Load the LSTM balanced results; the first CSV column is used as the row index
lstm_bal = pd.read_csv('../output/results/LSTM_balanced_results.csv', index_col=0)

# Floor a clock time to the start of its half-hour bucket
def round_time(time):
    """Return ``time`` floored to the half hour it falls in.

    ``time`` is a colon-separated clock string such as ``"11:51"``. The hour
    part is kept exactly as written and the minutes are replaced by ``"00"``
    (minutes below 30) or ``"30"`` (minutes 30 and above), e.g.
    ``"11:51" -> "11:30"`` and ``"6:05" -> "6:00"``. Note that this rounds
    *down*; it never moves a time forward to the next half hour.
    """
    pieces = time.split(':')
    minute = int(pieces[1])
    suffix = ':00' if minute < 30 else ':30'
    return pieces[0] + suffix

# Tag every tweet with the half-hour bucket its timestamp falls into
lstm_bal['Rounded_Time'] = lstm_bal['Time'].map(round_time)

# Keep just these four columns, in this order, and preview the first five rows
lstm_bal = lstm_bal.loc[:, ['Date', 'Time', 'Rounded_Time', 'outcome']]
lstm_bal.head()


Unnamed: 0,Date,Time,Rounded_Time,outcome
0,1/20/2017,6:31,6:30,-1
1,1/20/2017,11:51,11:30,1
2,1/20/2017,11:51,11:30,-1
3,1/20/2017,11:52,11:30,1
4,1/20/2017,11:53,11:30,-1


In [83]:
# Split lstm_bal by predicted sentiment:
#   outcome ==  1 -> lstm_pos (positive tweets)
#   outcome == -1 -> lstm_neg (negative tweets)
lstm_pos = lstm_bal.loc[lstm_bal['outcome'].eq(1)]
lstm_neg = lstm_bal.loc[lstm_bal['outcome'].eq(-1)]


In [84]:
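# Commented out because this code takes a long time to run; its result is saved to
# ../output/results/lstm_merged.csv (last line of this cell) and can be reloaded from that file.
# NOTE(review): the match below compares columns 0 and 3. If lstm_neg / lstm_pos have the columns
# [Date, Time, Rounded_Time, outcome], column 3 is 'outcome' (-1 vs 1, never equal), so no row is
# ever dropped; column 2 ('Rounded_Time') was presumably intended. Confirm before re-enabling.
# #Drop 30-minute slots with conflicting sentiment tweets (both + and - in the same 30 min)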
# neg_drop_index = []
# pos_drop_index = []
# for i in range(0,lstm_neg.shape[0]):
#     for j in range(0,lstm_pos.shape[0]):
#         if lstm_neg.iloc[i,0]== lstm_pos.iloc[j,0] and lstm_neg.iloc[i,3]== lstm_pos.iloc[j,3]:
#             neg_drop_index.append(i)
#             pos_drop_index.append(j)

# lstm_neg2 = lstm_neg.drop(lstm_neg.index[neg_drop_index])
# lstm_pos2 = lstm_pos.drop(lstm_pos.index[pos_drop_index])

# #Drop duplicates
# lstm_neg2 = lstm_neg2.drop_duplicates(subset=['Date', 'Rounded_Time'], keep='first')
# lstm_pos2 = lstm_pos2.drop_duplicates(subset=['Date', 'Rounded_Time'], keep='first')

# #Concatenating positive and negative tweets
# lstm_merged = pd.concat([lstm_neg2,lstm_pos2])
# lstm_merged.to_csv('../output/results/lstm_merged.csv')

In [95]:
# Load the cached lstm_merged data set.
# The export that produces this file (DataFrame.to_csv with default arguments)
# writes the row index as the first, unnamed column. index_col=0 reads it back
# as the index instead of leaving a stray 'Unnamed: 0' data column, the same
# way LSTM_balanced_results.csv is loaded.
lstm_merged = pd.read_csv('../output/results/lstm_merged.csv', index_col=0)

# Merging S&P 500 data with the lstm_merged data set

In [92]:
# Join each 30-minute S&P 500 bar with the tweets that share its
# (Date, Rounded_Time) bucket. The outer join keeps unmatched rows from both
# sides: bars without a tweet get a missing outcome, tweets without a bar get a
# missing Return.
# NOTE(review): tweet rows with a missing Return stay in spx_reg; statsmodels'
# formula interface drops rows with missing values by default, but confirm
# that excluding them from the regression is the intent.
spx_reg = spx.merge(lstm_merged, how='outer', on=['Date', 'Rounded_Time'])

# Drop unnecessary columns
spx_reg = spx_reg.drop(columns=['High', 'Low', 'Volume', 'Change'])

# Half hours with no tweet have no outcome after the join; code them as 0
spx_reg['outcome'] = spx_reg['outcome'].fillna(0)

# Keep only the half hours that carry a tweet sentiment (outcome != 0), i.e.
# the no-tweet rows coded 0 above are removed rather than kept as "Neutral".
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the merged frame (avoids SettingWithCopyWarning).
spx_reg = spx_reg.loc[
    spx_reg['outcome'] != 0,
    ['Date', 'Rounded_Time', 'Return', 'outcome'],
].copy()

# 0/1 indicator columns for the regression
spx_reg['Positive'] = (spx_reg['outcome'] == 1).astype('int64')
spx_reg['Negative'] = (spx_reg['outcome'] == -1).astype('int64')

spx_reg = spx_reg[['Date', 'Rounded_Time', 'Return', 'Positive', 'Negative']]
print(spx_reg.head(20))

          Date Rounded_Time    Return  Positive  Negative
1   12/31/2019        16:00  0.000334         1         0
2   12/31/2019        15:30  0.001983         1         0
3   12/31/2019        15:00  0.001363         1         0
5   12/31/2019        14:00  0.000264         0         1
6   12/31/2019        13:30  0.000637         0         1
7   12/31/2019        13:30  0.000637         1         0
11  12/31/2019        11:30  0.000314         0         1
13  12/31/2019        10:30 -0.000998         1         0
15  12/31/2019         9:30  0.002345         0         1
17  12/30/2019        16:00 -0.000161         0         1
33  12/27/2019        15:30 -0.000451         1         0
35  12/27/2019        14:30  0.000422         0         1
43  12/27/2019        10:30  0.000521         0         1
49  12/26/2019        15:00  0.000000         0         1
50  12/26/2019        14:30 -0.000337         0         1
70  12/23/2019        17:00  0.000000         0         1
71  12/23/2019

# Run Regression on merged data set

In [96]:
# Regress the 30-minute log return on each sentiment indicator separately.
# C(...) makes the formula treat the 0/1 column as a categorical factor.
# (A third regression on a 'Neutral' indicator is not run: spx_reg has no
# Neutral column. To study a single time slot, filter spx_reg on
# 'Rounded_Time' first, e.g. '10:00'.)
fit_pos = (
    ols(formula='Return ~ C(Positive)', data=spx_reg)
    .fit()
)
print(fit_pos.summary())

fit_neg = (
    ols(formula='Return ~ C(Negative)', data=spx_reg)
    .fit()
)
print(fit_neg.summary())

                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                   0.07727
Date:                Sun, 13 Dec 2020   Prob (F-statistic):              0.781
Time:                        23:38:11   Log-Likelihood:                 9448.5
No. Observations:                1912   AIC:                        -1.889e+04
Df Residuals:                    1910   BIC:                        -1.888e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept        -8.171e-05   5.83e-05  