In [19]:
import pandas as pd

import plotly.graph_objs as go
import plotly.offline as py

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv('data/Combined_News_DJIA.csv')
train_data = data[data['Date'] < '2015-01-01']
test_data = data[data['Date'] > '2014-12-31']
frac = len(train_data) / (len(data))
print("Train:Test Data split proportion: {:.0%}:{:.0%}".format(frac, 1-frac))

Train:Test Data split proportion: 81%:19%


In [32]:
train_headlines = []
for i in range(len(train_data)):
    train_headlines.append(' '.join(str(h) for h in train_data.iloc[i,[11,26]]))
test_headlines = []
for i in range(len(test_data)):
    test_headlines.append(' '.join(str(h) for h in test_data.iloc[i,[11,26]]))

In [36]:
def nlp(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text_processed = tokenizer.tokenize(text)
    
    porter_stemmer = PorterStemmer()
    text_processed = [porter_stemmer.stem(word) for word in text_processed]
    
    try:
        text_processed.remove('b')
    except: 
        pass
    
    return text_processed

In [33]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), analyzer=nlp)
Phi = vectorizer.fit_transform(train_headlines)

nb = BernoulliNB(alpha=0.5)
nb.fit(Phi, train_data['Label'])

Y = vectorizer.transform(test_headlines)
predictions = nb.predict(Y)
predictions_prob = nb.predict_proba(Y)[:, 1]
pd.crosstab(test_data["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,76,110
1,44,148


In [34]:
precision = precision_score(test_data["Label"], predictions)
recall = recall_score(test_data["Label"], predictions)
accuracy = accuracy_score(test_data["Label"], predictions)
f1 = f1_score(test_data["Label"], predictions)
auc = roc_auc_score(test_data['Label'], predictions_prob)

pd.DataFrame(data = {'Precision':[precision], 'Recall':[recall], 
                     'F1':[f1], 'Accuracy':[accuracy], 'AUC':[auc]}, index=['Score'])

Unnamed: 0,Precision,Recall,F1,Accuracy,AUC
Score,0.573643,0.770833,0.657778,0.592593,0.582045


In [41]:
porter_stemmer = PorterStemmer()
porter_stemmer.stem('inconvenient')

'inconveni'