In [2]:
from textblob import TextBlob
import csv
import os
import math
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sb
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier 



In [3]:
# read in all covid data
tweets = pd.DataFrame()
for file in os.listdir('data'):
    if 'covid' in file:
        tweets = pd.concat([tweets, pd.read_csv(os.path.join('data',
        file), names = ["Date", "id1", "id2", "popularity", "content"])], axis = 0)
tweets.drop_duplicates(inplace = True)



In [5]:
sent = []
sub = []
for tweet in tweets.content:    
    blob = TextBlob(tweet)    
    sent.append(blob.sentiment[0])
    sub.append(blob.sentiment[1])

In [6]:
# add to tweets df
tweets['sentiment'] = pd.Series(sent)
tweets['subjectivity'] = pd.Series(sub)

# aggregate data by day
tweets['date'] = pd.to_datetime(tweets['Date'])
tweets['date'] = tweets['date'].dt.date


In [7]:
# remove neutral tweets and aggregate by day
tweets1 = tweets[(tweets[['sentiment']] != 0).all(axis=1)]
tweets1 = tweets1.groupby('date').agg('mean')
# drop irrelevant cols
tweets1.drop(['id1', 'id2'], axis = 1, inplace = True)



In [8]:
# sp 500 data
sp = pd.read_csv('S&P 500 Historical Data update.csv', header = 0, thousands=',')
sp['date'] = pd.to_datetime(sp['Date'])
sp['date'] = sp['date'].dt.date
sp.drop('Date', axis = 1, inplace = True)

In [10]:
# merge stock and tweet data
df = tweets1.merge(sp, how = "inner", on = "date")

In [11]:
# categorical dependent var
inc_dec = []
for i in df['Change %']:
    ch = float(i.strip('%'))/100
    if ch >= 0:
        inc_dec.append(1)
    elif ch < 0:
        inc_dec.append(0)

The 'Predicted_change' - into the dataframe with a simple pandas operation:

Previous day's 'Compound_multiplied_scaled' should predict next day's 'Pct_change'.

In [12]:
df['change'] = pd.Series(inc_dec)

Create another 'label' -column - 'Buy/Sell' - which is 1 if 'Predicted_change' is positive (=buy) and -1 if negative (=sell)

In [14]:
forecast_col = 'change'
forecast_out = int(math.ceil(0.125 * len(df)))
df['Predicted_change_stock'] = df[forecast_col].shift(-forecast_out)
buy_or_sell = []
for row in df['change']:
    if row >= 0:
        buy_or_sell.append(1)
    elif row < 0:
        buy_or_sell.append(-1) 

In [15]:
#Adds -1 or +1 to the column based on if 'Predicted_change' is negative or positive
df['Buy/Sell'] = buy_or_sell

# The 'Buy/Sell' values need to be shifted up one row to match the 'Predicted_change' values
df['Buy/Sell'] = df['Buy/Sell'].shift(-1)

In [16]:
df

Unnamed: 0,date,sentiment,subjectivity,Price,Open,High,Low,Vol.,Change %,change,Predicted_change_stock,Buy/Sell
0,2022-02-14,0.082815,0.505213,4401.67,4412.61,4426.22,4364.84,-,-0.38%,0,1.0,1.0
1,2022-02-15,0.082815,0.505216,4471.07,4429.28,4472.77,4429.28,-,1.58%,1,1.0,1.0
2,2022-02-16,0.082824,0.505226,4475.01,4455.75,4489.55,4429.68,-,0.09%,1,0.0,1.0
3,2022-02-17,0.082817,0.50522,4380.26,4456.06,4456.06,4373.81,-,-2.12%,0,0.0,1.0
4,2022-02-18,0.082264,0.505493,4348.87,4384.57,4394.6,4327.22,-,-0.72%,0,0.0,1.0
5,2022-02-22,0.083558,0.506902,4304.74,4332.74,4362.12,4267.11,-,-1.01%,0,0.0,1.0
6,2022-02-23,0.084422,0.503225,4225.5,4324.93,4341.51,4221.51,-,-1.84%,0,1.0,1.0
7,2022-02-24,0.085117,0.503595,4288.7,4155.77,4294.73,4114.65,-,1.50%,1,,


In [17]:
#split train/test data
train, test = train_test_split(df, test_size = 0.25, random_state = 42)

y_train = train.pop('change')
x_train = train[['sentiment', 'subjectivity']]
y_test = test.pop('change')
x_test = test[['sentiment', 'subjectivity']]

In [18]:
# from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=42)
logreg.fit(x_train, y_train)
logreg.score( x_test, y_test)



0.5

Accuracy after cross validation

In [21]:
logreg_cv = cross_val_score(logreg, x_train, y_train, cv=2)
print(logreg_cv.mean())

0.6666666666666666




In [25]:
linear= LinearRegression()
linear.fit(x_train, y_train)
linear.score( x_test, y_test)

-14.088303000339842

In [26]:
linarreg_cv = cross_val_score(linear, x_train, y_train, cv=2)
print(linarreg_cv .mean())

-0.9999999999999999
