In [1]:
import pandas as pd

In [2]:
combined = pd.read_csv('data/Combined_News_DJIA.csv')
news = pd.read_csv('data/RedditNews.csv')
market = pd.read_csv('data/upload_DJIA_table.csv')

In [3]:
print(combined.shape)
combined.head(1)

(1989, 27)


Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""


In [4]:
# Remove 'b'-prefixes
# Applymap -> Like .apply() but Applied to Entire DF
combined = combined.applymap(lambda cell: cell.strip() if type(cell)==str else cell)
combined = combined.applymap(lambda cell: cell.lstrip('b"') if type(cell)==str else cell)
combined.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,Georgia 'downs two Russian warplanes' as count...,'BREAKING: Musharraf to be impeached.','Russia Today: Columns of troops roll into Sou...,'Russian tanks are moving towards the capital ...,"Afghan children raped with 'impunity,' U.N. of...",'150 Russian tanks have entered South Ossetia ...,"Breaking: Georgia invades South Ossetia, Russi...",The 'enemy combatent' trials are nothing but a...,...,'Georgia Invades South Ossetia - if Russia get...,'Al-Qaeda Faces Islamist Backlash',"'Condoleezza Rice: ""The US would not act to pr...",'This is a busy day: The European Union has a...,"Georgia will withdraw 1,000 soldiers from Iraq...",'Why the Pentagon Thinks Attacking Iran is a B...,'Caucasus in crisis: Georgia invades South Oss...,'Indian shoe manufactory - And again in a ser...,'Visitors Suffering from Mental Illnesses Bann...,"No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,'Why wont America and Nato help us? If they wo...,'Bush puts foot down on Georgian conflict',Jewish Georgian minister: Thanks to Israeli tr...,'Georgian army flees in disarray as Russians a...,"Olympic opening ceremony fireworks 'faked'""",'What were the Mossad with fraudulent New Zeal...,'Russia angered by Israeli military sale to Ge...,'An American citizen living in S.Ossetia blame...,...,'Israel and the US behind the Georgian aggress...,"'""Do not believe TV, neither Russian nor Georg...",'Riots are still going on in Montreal (Canada)...,'China to overtake US as largest manufacturer','War in South Ossetia [PICS]','Israeli Physicians Group Condemns State Torture',' Russia has just beaten the United States ove...,'Perhaps *the* question about the Georgia - Ru...,'Russia is so much better at war',So this is what it's come to: trading sex for ...


In [5]:
# Sample Corpus from Combined DF
# -- Includes Only Top1 Headlines
corpus = combined[['Date', 'Label', 'Top1']]
print(corpus.shape)
corpus.head(3)

(1989, 3)


Unnamed: 0,Date,Label,Top1
0,2008-08-08,0,Georgia 'downs two Russian warplanes' as count...
1,2008-08-11,1,'Why wont America and Nato help us? If they wo...
2,2008-08-12,0,'Remember that adorable 9-year-old who sang at...


In [104]:
corpus.tail(2)

Unnamed: 0,Date,Label,Top1
1987,2016-06-30,1,Jamaica proposes marijuana dispensers for tour...
1988,2016-07-01,1,A 117-year-old woman in Mexico City finally re...


In [128]:
# Separate Train and Validation
mask = pd.to_datetime(corpus['Date']) < pd.to_datetime(2015, format='%Y')
val_train_size_ratio = (sum(~mask)) / (len(corpus['Date']))
print("Val Size Ratio: ", val_train_size_ratio)
corpus_train = corpus[mask]
corpus_val = corpus[~mask]

Val Size Ratio:  0.19004524886877827


In [13]:
# Import Statements
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

## Try on Built-In Dataset

In [14]:
# Dataset
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism',
              'talk.religion.misc']

data = fetch_20newsgroups(subset='train', categories=categories)

In [15]:
# Create Pipeline Components

vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
rfc = RandomForestClassifier()

In [16]:
# Define the Pipeline

pipe = Pipeline([
    # Vectorizer
    ('vect', vect),
    # Classifier
    ('clf', rfc)
])


In [22]:
# Potential Parameters for Our Pipeline Components
parameters = {
    'vect__max_df': (0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500, 1000),
    'clf__max_depth': (15, 20)
}

grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=6, verbose=1)
# grid_search.fit(X, y)
grid_search.fit(data.data, data.target)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    6.6s
[Parallel(n_jobs=6)]: Done  80 out of  80 | elapsed:   12.0s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect',
                                        TfidfVectorizer(ngram_range=(1, 2),
                                                        stop_words='english')),
                                       ('clf', RandomForestClassifier())]),
             n_jobs=6,
             param_grid={'clf__max_depth': (15, 20),
                         'vect__max_df': (0.75, 1.0),
                         'vect__max_features': (500, 1000),
                         'vect__min_df': (0.02, 0.05)},
             verbose=1)

In [35]:
# grid_search.best_params_, grid_search.best_score_
# ({'clf__max_depth': 20,
#   'vect__max_df': 1.0,
#   'vect__max_features': 1000,
#   'vect__min_df': 0.02},
#  0.9089827281381749)

## Latent Semantic Indexing
Combination of Bag of Words + Dimensionality Reduction

## Try with Headline Data

In [27]:
print(corpus.shape)
corpus.head(3)

(1989, 3)


Unnamed: 0,Date,Label,Top1
0,2008-08-08,0,Georgia 'downs two Russian warplanes' as count...
1,2008-08-11,1,'Why wont America and Nato help us? If they wo...
2,2008-08-12,0,'Remember that adorable 9-year-old who sang at...


In [94]:
from sklearn.decomposition import TruncatedSVD
# This is an option for dimensionality reduction that apparantly
# works well with bag of words vectors.

svd = TruncatedSVD(n_components=100,
                   algorithm='randomized',
                   n_iter=10)

In [95]:
params = {
    'lsi__svd__n_components': [10, 100, 250],
    'lsi__vect__max_df': [0.9, 0.95, 1.0],
    'clf__n_estimators': [5, 10, 20]
}

In [96]:
# Same Pipeline Components
# Text to Vector and Classifier
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
rfc = RandomForestClassifier()

In [97]:
# Define the Pipeline

# LSI
lsi = Pipeline([('vect', vect), ('svd', svd)])

# Estimator Pipe
pipe = Pipeline([('lsi', lsi), ('clf', rfc)])

In [98]:
# Implement Grid Search
grid_search = GridSearchCV(pipe, params, cv=5, n_jobs=6, verbose=1)
# grid_search.fit(X, y)
grid_search.fit(corpus['Top1'], corpus['Label'])

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   20.3s
[Parallel(n_jobs=6)]: Done 135 out of 135 | elapsed:  1.2min finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('lsi',
                                        Pipeline(steps=[('vect',
                                                         TfidfVectorizer(ngram_range=(1,
                                                                                      2),
                                                                         stop_words='english')),
                                                        ('svd',
                                                         TruncatedSVD(n_components=100,
                                                                      n_iter=10))])),
                                       ('clf', RandomForestClassifier())]),
             n_jobs=6,
             param_grid={'clf__n_estimators': [5, 10, 20],
                         'lsi__svd__n_components': [10, 100, 250],
                         'lsi__vect__max_df': [0.9, 0.95, 1.0]},
             verbose=1)

In [99]:
# First Pass
# grid_search.best_params_, grid_search.best_score_
# ({'clf__max_depth': 15,
#   'vect__max_df': 1.0,
#   'vect__max_features': 500,
#   'vect__min_df': 0.05},
#  0.5369618875232586)

# Pass with LSI
# ({'clf__n_estimators': 20,
#   'lsi__svd__n_components': 100,
#   'lsi__vect__max_df': 0.9},
#  0.531912712175487)

In [100]:
grid_search.best_params_, grid_search.best_score_

({'clf__n_estimators': 20,
  'lsi__svd__n_components': 10,
  'lsi__vect__max_df': 0.95},
 0.5218814475399669)

### Try with spaCy Vectorizor
I'm not sure how to get a spaCy vectorizer to work within an sklearn pipeline.

In [45]:
import spacy
import numpy as np
nlp = spacy.load('en_core_web_lg')
vectors = [np.array(nlp(word).vector) for word in corpus['Top1']]

In [65]:
# # We Just Need a Classifier as the text has already been classified

# pipe = Pipeline([
#     # Vectorizer
#     ('vect', piplinize),
#     # Classifier
#     ('clf', rfc)
# ])

In [66]:
# # Potential Parameters for Our Pipeline Components
# parameters = {
#     'clf__max_depth': (15, 20)
# }

# grid_search = GridSearchCV(pipe, parameters, cv=5, n_jobs=6, verbose=1)
# # grid_search.fit(X, y)
# grid_search.fit(corpus['Vectors'], corpus['Label'])

In [101]:
# Get spaCy Vectors Function
def get_word_vectors(docs):
    return [nlp(doc).vector for doc in docs]

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [132]:
# Vectorize Train and Validation
X_train = get_word_vectors(corpus_train['Top1'])
X_val = get_word_vectors(corpus_val['Top1'])

In [133]:
# Fit on the Random Foretst Classifier
rfc.fit(X_train, corpus_train['Label'])

RandomForestClassifier()

In [139]:
# Slightly Improved Accuracy
rfc.score(X_val, corpus_val['Label'])

0.544973544973545

In [140]:
predictions = rfc.predict(X_val)

In [141]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

parameters = {
    'max_depth': (15, 20),
    'n_estimators': (10, 100, 150)
}

grid_search = GridSearchCV(RandomForestClassifier(), parameters, 
                           cv=5, n_jobs=4, verbose=1)
grid_search.fit(X_train, corpus_train['Label'])

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    9.2s finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'max_depth': (15, 20), 'n_estimators': (10, 100, 150)},
             verbose=1)

In [145]:
# Get Score
grid_search.score(X_val, corpus_val['Label'])

0.5423280423280423