In [151]:
import numpy as np
import pandas as pd
import json
import datetime
import nltk
from nltk.corpus import stopwords
from operator import itemgetter 
import matplotlib.pyplot as plt
import sklearn

In [152]:
def _parse_concatenated_json(text):
    """Decode a string that holds several JSON values written back to back.

    The posts file is treated as a run of objects (``{...}{...}``) rather than
    one JSON array.  Walking the text with ``raw_decode`` finds the real end of
    each value, so a ``}{`` sequence that happens to sit inside a string field
    is left untouched (a blanket ``str.replace('}{', '},{')`` would rewrite it).

    Whitespace and commas between values are skipped, so input that is already
    comma separated or newline delimited loads as well.

    Returns a list with one decoded value per top-level JSON value in ``text``.
    Raises ``json.JSONDecodeError`` on malformed input.
    """
    decoder = json.JSONDecoder()
    records = []
    pos, end = 0, len(text)
    while pos < end:
        # raw_decode does not tolerate leading separators, so step over them.
        if text[pos].isspace() or text[pos] == ',':
            pos += 1
            continue
        record, pos = decoder.raw_decode(text, pos)
        records.append(record)
    return records


# Be explicit about the encoding so decoding does not depend on the OS locale.
with open('submission/posts.json', encoding='utf-8') as file:
    json_data = _parse_concatenated_json(file.read())

submission = pd.DataFrame.from_records(json_data)

# NOTE: `submission_timed` is the same object as `submission` (no copy), so the
# 'time' column is converted to datetime on both names.
submission_timed = submission
submission_timed['time'] = pd.to_datetime(submission['time'])

In [153]:
stock_list = ['GME', 'AMC', 'NAKD']
close = {}          # ticker -> array of adjusted closing prices
trading_dates = {}  # ticker -> array of dates, last row dropped

for stock in stock_list:
    raw = pd.read_csv('stocks/' + stock + '.csv')
    raw['Date'] = pd.to_datetime(raw['Date'])

    # The final date is dropped, so this array is one element shorter than
    # the matching price array.
    trading_dates[stock] = np.array(raw['Date'].to_list()[:-1])
    close[stock] = np.array(raw['Adj Close'].to_list())

# `date` is later used to index GME price changes, so bind it to GME's own
# dates explicitly instead of whichever ticker the loop happened to end on.
date = trading_dates['GME']

In [154]:
# Posts whose title contains "gme", ignoring case.
# na=False treats a missing title as "no match"; without it a NaN in the mask
# makes the boolean indexing below fail.  regex=False matches the literal text.
mentions_gme = submission_timed['title'].str.lower().str.contains('gme', na=False, regex=False)
submission_gme = submission_timed[mentions_gme]

In [155]:
# Change in GME adjusted close between consecutive rows:
# element i is close[i + 1] - close[i].
gme_close = close['GME']
gme_close_change = gme_close[1:] - gme_close[:-1]

# Split the dates by the sign of the change that follows them; a change of
# exactly zero counts as "up".
gme_up_date = date[gme_close_change >= 0]
gme_down_date = date[gme_close_change < 0]

In [156]:
# All labelled dates in one array: the "up" dates first, then the "down" dates.
gme_date = np.concatenate([gme_up_date, gme_down_date])

In [157]:
# Maps each labelled date to the GME posts published on that day.
date2submission = {}

# Compare at day resolution: normalize() sets the time-of-day to midnight, so a
# post stamped with hours/minutes still matches its calendar day.  It is a
# no-op when the timestamps are already date-only.
post_day = submission_gme['time'].dt.normalize()

# The loop variable is deliberately not called `date`: that name holds the
# array of trading dates built when the stock CSVs were loaded, and rebinding
# it here would break any re-run of the cells that index that array.
for trading_day in gme_date:
    date2submission[trading_day] = submission_gme[post_day == trading_day]
    

In [158]:
# Maps each labelled date to a frequency distribution of the words posted that day.
date2title = {}
stop_words = set(stopwords.words('english'))
tokenizer = nltk.RegexpTokenizer(r"\w+")
# Vocabulary: every distinct word kept on any day.
word_set = set()

# The loop variable is not called `date` so that the array of trading dates
# bound to that name by the stock-loading cell is not overwritten.
for trading_day in gme_date:
    posts = date2submission[trading_day]

    # One string per day from titles, bodies and author names.  With no
    # na_rep given, str.cat leaves missing values out of the result.
    day_text = ' '.join(
        posts[column].str.cat(sep=' ') for column in ('title', 'text', 'author')
    )

    # Keep purely alphabetic tokens, lowercase each once, then drop stop words.
    lowered = (w.lower() for w in tokenizer.tokenize(day_text) if w.isalpha())
    words_filtered = [w for w in lowered if w not in stop_words]

    word_set.update(words_filtered)
    date2title[trading_day] = nltk.FreqDist(words_filtered)

In [159]:
# Column index <-> word.  The vocabulary is sorted first: iteration order of a
# set of strings can change from one interpreter run to the next, which would
# shuffle the feature columns (and the word behind each coefficient) between
# otherwise identical runs.
index2word = dict(enumerate(sorted(word_set)))
word2index = {word: i for i, word in index2word.items()}

# Row index for each labelled date, in `gme_date` order.  The comprehension
# variable is local, so the `date` array from the stock-loading cell survives.
date2index = {trading_day: i for i, trading_day in enumerate(gme_date)}

In [160]:
n_dates, n_words = len(gme_date), len(word2index)

# One row per labelled date, one column per vocabulary word; both start at zero.
wordCount = np.zeros(shape=(n_dates, n_words))
label = np.zeros(n_dates)

In [161]:
# Dates that belong to the "up" class, as a set for direct membership tests.
up_days = set(gme_up_date)

# The loop variable is not called `date` so that the array of trading dates
# bound to that name by the stock-loading cell is not overwritten.
for trading_day in gme_date:
    row = date2index[trading_day]

    # 1 for an "up" date, 0 otherwise.  Writing both values keeps the cell
    # idempotent when it is re-run.
    label[row] = 1 if trading_day in up_days else 0

    # Copy that day's word frequencies into its row of the count matrix.
    for word, count in date2title[trading_day].items():
        wordCount[row, word2index[word]] = count
    

In [162]:
# Feature matrix and target vector under the conventional names (aliases, not copies).
X, y = wordCount, label

In [163]:
# Dimensions of the feature matrix.
np.shape(X)

(124, 55014)

In [164]:
# Dimensions of the target vector.
np.shape(y)

(124,)

In [165]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.naive_bayes import GaussianNB

# Five random splits, each holding out 20% of the rows, with a fixed seed.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=11)
gnb_clf = GaussianNB()

# Mean accuracy of Gaussian naive Bayes over the splits.
scores = cross_val_score(estimator=gnb_clf, X=X, y=y, scoring='accuracy', cv=cv)
print(scores.mean())

0.472


In [166]:
from sklearn.svm import SVC

# Same splitter settings as for the naive Bayes run.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=11)
svm_clf = SVC()

# Mean accuracy of a default-parameter SVC over the splits.
scores = cross_val_score(estimator=svm_clf, X=X, y=y, scoring='accuracy', cv=cv)
print(scores.mean())



0.512


In [168]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

# Logistic regression with built-in cross-validation, reusing the `cv`
# splitter defined in an earlier cell, fitted on the full data set.
lr_cv_clf = LogisticRegressionCV(
    cv=cv,
    solver='liblinear',
    max_iter=1000,
    random_state=11,
)
lr_cv_clf = lr_cv_clf.fit(X, y)



In [182]:
# Cross-validated F1 for the logistic-regression model.
# NOTE(review): the naive Bayes and SVC cells score with 'accuracy', so their
# printed means are not directly comparable with this one.
scores = cross_val_score(estimator=lr_cv_clf, X=X, y=y, scoring='f1', cv=cv)

In [183]:
# Average of the per-split scores.
print(scores.mean())

0.6372039072039072


In [171]:
# Coefficient value -> feature column index.
# NOTE(review): the coefficient itself is the dict key, so features that share
# exactly the same coefficient collapse into one entry (the last index wins).
coef2index = {coef: i for i, coef in enumerate(lr_cv_clf.coef_[0])}

In [172]:
# (coefficient, column index) pairs, largest coefficient first.
# The pairs are built straight from the fitted coefficients rather than from a
# dict keyed by coefficient value: such a dict keeps only one feature per
# distinct value, silently dropping every other feature with an equal weight.
sort_coef2index = sorted(
    ((coef, i) for i, coef in enumerate(lr_cv_clf.coef_[0])),
    key=itemgetter(0),
    reverse=True,
)

In [176]:
# How many entries to take from each end of the sorted coefficient list.
n_predictor = 20

first_n = slice(None, n_predictor)
last_n = slice(-n_predictor, None)

top_up_predictor = sort_coef2index[first_n]
top_down_predictor = sort_coef2index[last_n]

In [179]:
# Word and coefficient for each entry, in list order.
for coef, word_idx in top_up_predictor:
    print(index2word[word_idx], coef)

would 0.19590451740836304
sell 0.19016377605053203
streamable 0.187513574265436
squeeze 0.18567342486440555
short 0.18197344576594301
red 0.15865192401275535
point 0.15016894352503884
wsb 0.1499505997962657
gt 0.14836546877715365
selling 0.14374704274412922
investing 0.14022840017979757
sold 0.13884275503248675
going 0.1326758709227111
today 0.12849663204578707
sne 0.12598808686434826
trade 0.12429786576299906
cohen 0.12360462929703943
yolo 0.1131509535145309
affectionate 0.11196603200546844
ryan 0.11132974438981162


In [181]:
# Word and coefficient for each entry, walking the list back to front.
for coef, word_idx in reversed(top_down_predictor):
    print(index2word[word_idx], coef)

puts -0.26638915672364716
next -0.2663100284488798
keep -0.1841075682522767
words -0.18409267823392597
us -0.1777961119068264
buying -0.17654669224431463
calls -0.16998247709154093
holders -0.16459610706089917
reggie -0.1551772046667154
ontologicala -0.15453406629592767
thoughts -0.15191225561898158
bought -0.14953719500289897
open -0.13976732891140306
im -0.13947473128622537
interest -0.13718162536781023
shit -0.13632714134764096
ragnarok -0.13549781186349646
baitmanz -0.13549502213773099
jan -0.13312002876272286
way -0.1299717078691803
