In [11]:
import numpy as np
import pandas as pd
import hvplot.pandas
from pathlib import Path
import matplotlib.pyplot as plt

%matplotlib inline

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [13]:
# Load the sentiment scores, using the "date" column as a datetime index.
# `infer_datetime_format` is intentionally not passed: it has been deprecated
# since pandas 2.0, where consistent-format inference is the default behaviour.
sentiment_raw = pd.read_csv(
    'data/pfizer_sentiment_analysis.csv',
    index_col="date",
    parse_dates=True,
)
# Collapse to one row per calendar day by averaging every column; days that
# have no rows come out as NaN.
df = sentiment_raw.resample('D').mean()
df.head()

Unnamed: 0_level_0,compound,positive,negative,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-01,-0.8625,0.0,0.314,0.686
2020-01-02,0.04029,0.0651,0.046,0.8889
2020-01-03,-0.02382,0.0496,0.0564,0.894
2020-01-04,-0.2294,0.0,0.0515,0.9485
2020-01-05,0.148,0.0495,0.028,0.9225


In [14]:
# Load the price data, using the "t" column as a datetime index.
# (`infer_datetime_format` is omitted: deprecated since pandas 2.0.)
prices_raw = pd.read_csv('data/pfizer_prices.csv', index_col="t", parse_dates=True)
# Replace "c" (presumably the close price -- TODO confirm) with its
# period-over-period percentage change, then drop incomplete rows; the first
# row has no previous value, so its change is NaN.
df2 = prices_raw.assign(c=prices_raw["c"].pct_change()).dropna()
# Binarise the change into the target label: 1.0 for a rise, 0.0 otherwise
# (falls and unchanged values). One whole-column assignment is used instead of
# chained `df2["c"][mask] = ...` writes, which raise SettingWithCopyWarning and
# do not modify `df2` at all when pandas Copy-on-Write is active.
df2["c"] = (df2["c"] > 0).astype(float)
df2.head()

Unnamed: 0_level_0,c,h,l,o,v
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-03,0.0,39.24,38.67,38.72,11193319
2020-01-06,0.0,39.0,38.7,38.82,10206988
2020-01-07,0.0,39.13,38.68,39.12,14328671
2020-01-08,1.0,39.22,38.75,38.76,12580358
2020-01-09,0.0,39.27,38.79,39.27,15754368


In [15]:
# Place the daily sentiment columns and the price columns side by side,
# aligned on their datetime indexes.
combined = pd.concat((df, df2), axis="columns")
# Keep only the dates for which every column has a value.
df = combined.dropna(how="any")
df.head(n=5)

Unnamed: 0,compound,positive,negative,neutral,c,h,l,o,v
2020-01-03,-0.02382,0.0496,0.0564,0.894,0.0,39.24,38.67,38.72,11193319.0
2020-01-06,-0.070842,0.035917,0.0455,0.918583,0.0,39.0,38.7,38.82,10206988.0
2020-01-07,0.08678,0.0387,0.0094,0.9519,0.0,39.13,38.68,39.12,14328671.0
2020-01-08,0.135414,0.052714,0.011,0.936286,1.0,39.22,38.75,38.76,12580358.0
2020-01-09,-0.108008,0.025917,0.057417,0.916917,0.0,39.27,38.79,39.27,15754368.0


In [16]:
# The "c" column is the target; every remaining column is used as a feature.
label_col = "c"
y = df[label_col]
X = df.drop(columns=[label_col])

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): train_test_split shuffles rows by default, so test dates are
# interleaved with training dates -- confirm a random (non-chronological)
# split is intended for this date-indexed data.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

In [18]:
# Build a Linear Discriminant Analysis classifier with default settings and
# fit it on the training split; `fit` returns the fitted estimator itself.
lda = LinearDiscriminantAnalysis()
model = lda.fit(x_train, y_train)

In [19]:
# Predict the class label for every row of the held-out feature set.
predictions = model.predict(X=x_test)

In [20]:
# Per-class precision / recall / F1 of the predictions against the true labels.
# The report is a multi-line string, so it is printed rather than displayed.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

         0.0       0.56      0.69      0.62        26
         1.0       0.38      0.26      0.31        19

    accuracy                           0.51        45
   macro avg       0.47      0.48      0.47        45
weighted avg       0.49      0.51      0.49        45



In [22]:
# Display the predicted labels for the test rows (one entry per row of x_test).
predictions

array([1., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.,
       1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])

In [23]:
# Display the held-out feature rows that the predictions above were made for.
# NOTE(review): this renders the whole frame; consider x_test.head() if only a
# preview is needed.
x_test

Unnamed: 0,compound,positive,negative,neutral,h,l,o,v
2020-09-28,0.05432,0.0414,0.0299,0.9287,36.575,36.16,36.16,11631390.0
2020-05-21,0.073233,0.0855,0.065333,0.849167,37.65,37.05,37.57,28639057.0
2020-08-12,0.132878,0.072778,0.036222,0.891,38.55,37.81,37.83,33983446.0
2020-03-02,0.1794,0.055364,0.0,0.944636,34.95,33.445,33.91,31505102.0
2020-01-06,-0.070842,0.035917,0.0455,0.918583,39.0,38.7,38.82,10206988.0
2020-04-14,-0.08931,0.0371,0.0661,0.8968,36.63,35.335,35.93,33641572.0
2020-07-01,0.237445,0.09985,0.0236,0.87655,34.54,33.72,34.54,61063678.0
2020-08-03,0.03738,0.0726,0.0566,0.8708,38.66,38.24,38.61,27225104.0
2020-09-22,0.357878,0.146889,0.027889,0.825222,36.33,35.74,35.81,19031807.0
2020-09-24,0.1564,0.09,0.038818,0.871182,36.04,35.54,35.95,17826964.0
