In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from text_clean import *

import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import pandas as pd

def label_stocks1(df):
    labels = []
    for i in range(1, len(df.index)):
      if df.iloc[i, :]['Close'] - df.iloc[i - 1, :]['Close'] < 0:
        labels.append(-1)
      else:
        labels.append(1)
    
    df.drop(0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    df['value'] = pd.Series(labels)


def label_stocks2(row):
    if row['Close'] >= row['yesterday_close']:
        return 1
    else:
      return 0


def concat_headlines(df):
    df['text'] = df['text'].apply(lambda x:x + ' ')
    df = df.groupby('Date', as_index=False).text.sum()
    df['text'] = df['text'].str.lstrip()
    

apple_stocks = "Data/Stock data/Stocks/aapl.us.txt"
amazon_stocks = "Data/Stock data/Stocks/amzn.us.txt"
facebook_stocks = "Data/Stock data/Stocks/fb.us.txt"

apple_headlines = "Data/News data/apple_deduped.csv"
amazon_headlines = "Data/News data/amazon_deduped.csv"
facebook_headlines = "Data/News data/facebook_deduped.csv"

apple_reddit = "Data/News data/apple_reddit.csv"
amazon_reddit = "Data/News data/amazon_reddit.csv"
facebook_reddit = "Data/News data/facebook_reddit.csv"

apple_tweets = "Data/Tweets/apple_tweets.csv"
amazon_tweets = "Data/Tweets/amazon_tweets.csv"
facebook_tweets = "Data/Tweets/facebook_tweets.csv"

mydateparser = lambda x: pd.datetime.strptime(x[:10], "%Y-%m-%d")
# df = pd.read_csv('Data/Tweets/apple_tweets.csv', parse_dates=[0], date_parser=mydateparser)
df = pd.read_csv(apple_headlines, parse_dates=[1])
stock = pd.read_csv(apple_stocks, parse_dates=[0])

stock['yesterday_close'] = stock['Close'].shift()
stock['yesterday_volume'] = stock['Volume'].shift()
stock['value'] = stock.apply(lambda row: label_stocks2(row), axis=1)
# label_stocks1(df)

# df.rename({'created_at': 'Date'}, inplace=True, axis=1)
df = pd.merge(df, stock, on="Date", how="inner")

In [84]:
"""
df = df[['target', 'value']]

pos = df.loc[df['value'] == 1, 'target'].copy().reset_index(drop=True)
neg = df.loc[df['value'] == -1, 'target'].copy().reset_index(drop=True)

neg = pd.concat([pd.DataFrame(neg), pd.DataFrame(np.zeros(neg.shape), columns=['class'])], 1)
pos = pd.concat([pd.DataFrame(pos), pd.DataFrame(np.ones(pos.shape), columns=['class'])], 1)

np.random.seed(42)
rand = np.random.permutation(pos.shape[0])
pos = pos.iloc[rand[:neg.shape[0]]].reset_index(drop=True)

df = pd.concat([pos, neg]).sample(frac=1).reset_index(drop=True)
"""

for i in ['Open', 'High', 'Low', 'yesterday_volume', 'yesterday_close']:
  scaler = MinMaxScaler()
  scaled = scaler.fit_transform(df[i].astype('float64').values.reshape(-1, 1))
  df[i] = pd.DataFrame(scaled)

get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)
get_numeric_data = FunctionTransformer(lambda x: x[['Open', 'High', 'Low', 'yesterday_volume', 'yesterday_close']], validate=False)

vect = TfidfVectorizer(analyzer='word', strip_accents='unicode', ngram_range=(1, 3), binary=True)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector', get_numeric_data)
            ])),
              ('text_features', Pipeline([
                 ('selector', get_text_data),
                 ('vec',  vect)
             ]))
         ])),
    ('clf', LinearSVC(class_weight='balanced'))
])

In [86]:
X = df[['text', 'Open', 'High', 'Low', 'yesterday_volume', 'yesterday_close']]
y = df['value']
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

TRAIN: [   0    1    2 ... 2530 2531 2532] TEST: [2533 2534 2535 ... 5061 5062 5063]
TRAIN: [   0    1    2 ... 5061 5062 5063] TEST: [5064 5065 5066 ... 7592 7593 7594]
TRAIN: [   0    1    2 ... 7592 7593 7594] TEST: [ 7595  7596  7597 ... 10123 10124 10125]
TRAIN: [    0     1     2 ... 10123 10124 10125] TEST: [10126 10127 10128 ... 12654 12655 12656]
TRAIN: [    0     1     2 ... 12654 12655 12656] TEST: [12657 12658 12659 ... 15185 15186 15187]


In [66]:
model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('numeric_features', Pipeline(memory=None,
     steps=[('selector', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function <lambda> at 0x000001A660B4D9D8>, inv_kw_args=None,
          inverse_func=N...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [67]:
model.score(X_train, y_train)

0.9996707818930041

In [68]:
y_pred = model.predict(X_test)

print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\nF1 Score: {:.2f}".format(f1_score(y_test, y_pred) * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 84.86%

F1 Score: 85.58

Confusion Matrix:
 [[1213  241]
 [ 219 1365]]
