In [1]:
%matplotlib inline
# Notebook-wide setup: inline figure rendering (magic above), warning
# suppression, and every import used by the cells below.
import warnings
# Suppresses ALL warning categories for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below - consider narrowing the filter.
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sklearn as sk
import scipy as sp
# NOTE(review): this binds the top-level `matplotlib` package (not
# `matplotlib.pyplot`) to the name `plt`. No visible cell uses `plt`; confirm
# the intended alias before plotting with it.
import matplotlib as plt
# Text vectorisation / re-weighting and the classifier used further down.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

# 1. Read and clean the data

In [2]:
# Load the combined headline / label table and report its dimensions.
data = pd.read_csv('Combined_News_DJIA.csv')
print(data.shape)

# Date-based hold-out split: rows before 2015 train the model, rows after
# 2014-12-31 evaluate it.
# NOTE(review): `Date` is compared as text here, so this matches calendar
# order only if the column holds zero-padded ISO `YYYY-MM-DD` strings - confirm.
train = data.loc[data['Date'].lt('2015-01-01')]
test = data.loc[data['Date'].gt('2014-12-31')]

(1989, 27)


## Word Counts: CountVectorizer

In [3]:
# Bigram-only count vectoriser: tokens are runs of 2+ ASCII letters,
# lower-cased, English stop words removed, and a bigram is kept only if it
# occurs in at least 3 documents.
# Open question from the author: do we need to use a N-gram model?
cv_object = CountVectorizer(
    ngram_range=(2, 2),
    min_df=3,
    stop_words='english',
    lowercase=True,
    token_pattern='[a-zA-Z]{2,}',
)

def _join_headlines(frame, first_col=2, last_col=27):
    """Collapse each row's headline columns into one space-separated string.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to convert. Headline text is read positionally from columns
        ``first_col`` up to (but not including) ``last_col``.
    first_col, last_col : int, optional
        Positional bounds of the headline columns. The defaults reproduce
        the ``2:27`` slice used throughout this notebook.

    Returns
    -------
    list of str
        One document per row of ``frame``, in row order.
    """
    # Slice the headline columns once instead of calling .iloc per row.
    headline_rows = frame.iloc[:, first_col:last_col].itertuples(index=False, name=None)
    # Missing cells are skipped: str(NaN) would otherwise inject the literal
    # token "nan" into the vocabulary.
    return [' '.join(str(cell) for cell in row if pd.notna(cell))
            for row in headline_rows]


# train set
# build a "single list of strings"; the vocabulary is learned here only
trainheadlines = _join_headlines(train)
train_mat = cv_object.fit_transform(trainheadlines)

# test set
# transform (not fit_transform): reuse the vocabulary fitted on the train set
testheadlines = _join_headlines(test)
test_mat = cv_object.transform(testheadlines)

In [4]:
# summary
# dimensions of the document-term matrices for each split
print(train_mat.shape)
print(test_mat.shape)

# first 10 vocabulary entries, in the vectoriser's feature order
# (these are not ranked by frequency)
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out` when it exists and fall back on older versions.
_feature_names = getattr(cv_object, 'get_feature_names_out', None) or cv_object.get_feature_names
# str() keeps the printed list plain regardless of the element type returned.
print([str(name) for name in _feature_names()[:10]])
# Author's note: the vocabulary contains many inflected variants of the same
# word, so stemming still needs to be addressed.

(1611, 13428)
(378, 13428)
['aaa credit', 'aaa rating', 'abbot point', 'abbott government', 'abbott says', 'abc news', 'abducted women', 'abdullah bin', 'abdullah jordan', 'abdullah saleh']


In [5]:
# Fit a default-configured logistic regression on the bigram count matrix,
# then predict a label for every test document.
lm = LogisticRegression().fit(train_mat, train["Label"])
test_yhat = lm.predict(test_mat)

In [6]:
# Cross-tabulate actual labels (rows) against predicted labels (columns).
confusion = pd.crosstab(
    index=test["Label"],
    columns=test_yhat,
    rownames=["Actual"],
    colnames=["Predicted"],
)
print(confusion)

# Misclassification rate: share of test rows where prediction != actual.
mis = (test["Label"] != test_yhat).mean()
print(mis)

Predicted   0    1
Actual            
0          90   96
1          74  118
0.449735449735


## Word Frequency (TF-IDF): TfidfTransformer

In [7]:
# Re-weight the bigram counts with TF-IDF. The IDF weights are fitted on the
# training matrix only and then applied to both splits.
tfid = TfidfTransformer(use_idf=True)
tfid.fit(train_mat)
train_mat2 = tfid.transform(train_mat)
test_mat2 = tfid.transform(test_mat)

In [8]:
# Refit the logistic regression, this time on the TF-IDF weighted features,
# and predict the test labels.
lm = LogisticRegression().fit(train_mat2, train["Label"])
test_yhat = lm.predict(test_mat2)

# Actual (rows) vs. predicted (columns) label counts.
confusion = pd.crosstab(
    index=test["Label"],
    columns=test_yhat,
    rownames=["Actual"],
    colnames=["Predicted"],
)
print(confusion)

# Misclassification rate: share of test rows where prediction != actual.
mis = (test["Label"] != test_yhat).mean()
print(mis)

Predicted   0    1
Actual            
0          27  159
1          28  164
0.494708994709


Available approaches:
1. Logistic regression/GAM ...
2. Bayesian Models
3. PCA/Matrix factorization
4. LDA/Clustering

Also, we still need to handle stemming (e.g. with nltk): the vocabulary currently treats inflected variants of the same word as separate features.

In [13]:
# Matrix product of two standard-normal random matrices (no seed is set, so
# the values differ on every run).
a = np.random.standard_normal(size=(12288, 150))  # a.shape = (12288, 150)
b = np.random.standard_normal(size=(150, 45))     # b.shape = (150, 45)
c = a.dot(b)                                      # c.shape = (12288, 45)