# Imports 

In [49]:
import pandas as pd
import time
import os
import numpy as np
import scipy as sp
import re
from sklearn.feature_extraction import stop_words as stop
from sklearn.metrics import roc_auc_score as auroc
from sklearn.metrics import confusion_matrix as cm
# Intended to show full cell contents when DataFrames are rendered.
# NOTE(review): newer pandas releases expect None instead of -1 here - confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
# Render at most 10 rows per DataFrame.
pd.set_option('display.max_rows', 10)

# Functions

In [2]:
# Format a fraction as a percentage string with two decimals,
# e.g. 0.5 -> '50.00%'. Used mostly to make the accuracy scores more readable.
def make_percent(num):
    scaled = num * 100
    return '%.2f%%' % (scaled,)

# Strip every match of the regular expression 'regx' from the listed 'columns'
# of 'df', then lowercase what is left. 'df' is edited in place and is also the
# return value. Used here to remove syntax from each headline in the dataframe.
def df_regx_replace(df, columns, regx):
    for name in columns:
        stripped = df[name].replace(to_replace = regx, value = "", regex = True)
        df[name] = stripped.str.lower()
    return df
        
# Run 'analyzer' (e.g. a tokenizer / stop-word remover) over every cell of the
# listed 'columns' and store its return value in place of the cell.
# Used to begin the process of text normalization.
#   df       - DataFrame to process; edited in place and returned
#   columns  - iterable of column labels to process
#   analyzer - callable taking one cell value and returning its replacement
def df_text_analyzer(df, columns, analyzer):
    for col in columns:
        # Series.map calls the analyzer once per cell; it replaces the old
        # Series.iteritems() loop, which no longer exists in pandas 2.0+.
        df[col] = df[col].map(analyzer)
    return df

# Remove every row of 'df' that has a missing value (NaN/None) in any column,
# printing "removed row: <label>" for each one.
# The input frame is not modified: a filtered copy is returned, or 'df' itself
# when there is nothing to remove.
# Used to remove 3 rows from this specific data set.
def df_text_remove_nan(df):
    # One vectorised mask instead of nested iterrows()/iteritems() loops
    # (Series.iteritems() no longer exists in pandas 2.0+).
    has_nan = df.isnull().any(axis = 1)
    nan_labels = df.index[has_nan]
    if len(nan_labels) == 0:
        return df
    for row_index in nan_labels:
        print("removed row: " + str(row_index))
    # Drop all offending rows in a single call rather than once per row.
    return df.drop(axis = 0, labels = nan_labels)

# may not be needed
# Flatten all the tokens that occur in the data set into a single 1-D array.
#   df      - dataframe of all data; each cell in 'columns' must be an
#             iterable of tokens
#   columns - list of column names to read
# Tokens are emitted column by column, row by row within a column.
# Returns a numpy array of dtype "S30" (fixed-width byte strings, so tokens
# longer than 30 bytes are truncated); an empty input gives an empty array.
def df_text_single_array(df, columns):
    # Collect in one pass; the earlier version walked the data twice (count,
    # then fill) through Series.iteritems(), which pandas 2.0 removed.
    tokens = [
        element
        for col in columns
        for row_val in df[col]
        for element in row_val
    ]
    return np.array(tokens, dtype = "S30")

# may not be needed. Replaces every cell of the listed 'columns' with the
# result of tfidf.transform(cell) and returns the (in-place edited) frame.
#   df      - DataFrame to process
#   columns - iterable of column labels to transform
#   tfidf   - object exposing transform(value)
# The original author noted possible memory issues with this approach.
# NOTE(review): the earlier version indexed the 'columns' argument instead of
# the per-column Series, mixed .loc with .iloc, and never stored the result,
# so it could not work as written; confirm that writing the transformed cells
# back into 'df' is the behaviour wanted.
def df_tfidf_transform(df, columns, tfidf):
    for col in columns:
        cells = df[col]
        # Object array so each slot holds whatever transform() returns, as-is.
        transformed = np.empty(len(cells), dtype = object)
        for position, row_val in enumerate(cells):
            transformed[position] = tfidf.transform(row_val)
        df[col] = transformed
    return df

# Join the text of the given 'columns' row by row into one string per row,
# each piece preceded by a single space. Used to combine columns for aggregate
# vectorization. The result is a copy of df['Top1'] (same index and name)
# whose values are replaced by the joined strings; 'df' itself is not changed.
def df_concat_str_columns(df, columns):
    merged = df['Top1'].copy()
    for label in df.index:
        text = ""
        for name in columns:
            text += " " + df[name].loc[label]
        merged[label] = text
    return merged

# Placeholder for printing a given confusion matrix: not implemented yet,
# so it ignores 'cm' and returns None.
def print_confusion(cm):
    return None

# Like df_concat_str_columns, but repeats each column's text so that earlier
# columns count more: the first column in 'columns' is repeated 'start_weight'
# times, the next one start_weight - 1 times, and so on. Columns whose weight
# has dropped to zero or below contribute nothing.
#   df           - DataFrame holding the text columns (must contain 'Top1')
#   columns      - iterable of column labels, most important first
#   start_weight - repetitions for the first column (default 25, the value
#                  that used to be hard-coded)
# Returns a copy of df['Top1'] (same index and name) holding the joined text.
def df_concat_str_columns_weighted(df, columns, start_weight = 25):
    concat_columns = df['Top1'].copy()
    for row_index in df.index:
        pieces = []
        current_weight = start_weight
        for col in columns:
            if current_weight > 0:
                # " text" repeated current_weight times, built in one step
                pieces.append((" " + df[col].loc[row_index]) * current_weight)
            current_weight = current_weight - 1
        concat_columns[row_index] = "".join(pieces)
    return concat_columns

# Parse The Data
Read the data into a dataframe, vectorize the first headline column, and extract the labels column.

## Import Data

In [3]:
# Load the combined news/DJIA CSV.
# NOTE(review): hard-coded absolute Windows path - this cell only runs as-is on the original author's machine.
data_set = pd.read_csv("C:\\Users\\carmi\\OneDrive\\Documents\\Datasets\\stocknews\\Combined_News_DJIA.csv")
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks redundant - confirm before removing.
df_original = pd.DataFrame(data_set)
# Preview the first two rows.
df_original.iloc[0:2]

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as countries move to brink of war""",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)',"b'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire'","b""Afghan children raped with 'impunity,' U.N. official says - this is sick, a three year old was raped and they do nothing""",b'150 Russian tanks have entered South Ossetia whilst Georgia shoots down two Russian jets.',"b""Breaking: Georgia invades South Ossetia, Russia warned it would intervene on SO's side""","b""The 'enemy combatent' trials are nothing but a sham: Salim Haman has been sentenced to 5 1/2 years, but will be kept longer anyway just because they feel like it.""",...,"b'Georgia Invades South Ossetia - if Russia gets involved, will NATO absorb Georgia and unleash a full scale war?'",b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to prevent an Israeli strike on Iran."" Israeli Defense Minister Ehud Barak: ""Israel is prepared for uncompromising victory in the case of military hostilities.""'",b'This is a busy day: The European Union has approved new sanctions against Iran in protest at its nuclear programme.',"b""Georgia will withdraw 1,000 soldiers from Iraq to help fight off Russian forces in Georgia's breakaway region of South Ossetia""",b'Why the Pentagon Thinks Attacking Iran is a Bad Idea - US News &amp; World Report',b'Caucasus in crisis: Georgia invades South Ossetia',"b'Indian shoe manufactory - And again in a series of ""you do not like your work?""'",b'Visitors Suffering from Mental Illnesses Banned from Olympics',"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,"b'Why wont America and Nato help us? If they wont help us now, why did we help them in Iraq?'",b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli training, we're fending off Russia """,b'Georgian army flees in disarray as Russians advance - Gori abandoned to Russia without a shot fired',"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zealand Passports doing in Iraq?',b'Russia angered by Israeli military sale to Georgia',b'An American citizen living in S.Ossetia blames U.S. and Georgian leaders for the genocide of innocent people',...,b'Israel and the US behind the Georgian aggression?',"b'""Do not believe TV, neither Russian nor Georgian. There are much more victims""'",b'Riots are still going on in Montreal (Canada) because police murdered a boy on Saturday.',b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Torture',b' Russia has just beaten the United States over the head with Peak Oil',b'Perhaps *the* question about the Georgia - Russia conflict ',b'Russia is so much better at war',"b""So this is what it's come to: trading sex for food."""


Import the combined dataset and place it in a dataframe. Each row has a label: 0 indicates the DJIA went down, and 1 indicates the DJIA went up or stayed the same. The next 25 columns (Top1 to Top25) are the top 25 headlines for a given date, listed in ascending order (Top1 first).

## Clean Data
There are a few things we need to do in order to make this data easier to work with. Thankfully, only a handful of rows have missing values (they are removed below). Beyond that, we will want to do some basic text processing to get rid of things like stop words and syntax, so that the headlines are easier to vectorize.

In [4]:
# Headline columns: everything from the third column onwards.
columns = df_original.iloc[:, 2:]
# Raw string, so the regex escapes (\-, \+, \., ...) reach the regex engine
# unchanged instead of being parsed as invalid Python string escapes.
# Matches punctuation anywhere, plus a leading b' / b" prefix on a headline.
regx = r"[/\-_\+=\*,\.\"\'()!\?@#$%\^;:\[\]]|^b[\'\"]*"
# regx = "[\s\W*|\W\s*]"
# df_regx_replace edits df_original in place and returns it, so df_no_syntax
# is the same object as df_original.
df_no_syntax = df_regx_replace(df_original, columns, regx)
df_no_syntax.iloc[0:2]

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,georgia downs two russian warplanes as countries move to brink of war,breaking musharraf to be impeached,russia today columns of troops roll into south ossetia footage from fighting youtube,russian tanks are moving towards the capital of south ossetia which has reportedly been completely destroyed by georgian artillery fire,afghan children raped with impunity un official says this is sick a three year old was raped and they do nothing,150 russian tanks have entered south ossetia whilst georgia shoots down two russian jets,breaking georgia invades south ossetia russia warned it would intervene on sos side,the enemy combatent trials are nothing but a sham salim haman has been sentenced to 5 12 years but will be kept longer anyway just because they feel like it,...,georgia invades south ossetia if russia gets involved will nato absorb georgia and unleash a full scale war,alqaeda faces islamist backlash,condoleezza rice the us would not act to prevent an israeli strike on iran israeli defense minister ehud barak israel is prepared for uncompromising victory in the case of military hostilities,this is a busy day the european union has approved new sanctions against iran in protest at its nuclear programme,georgia will withdraw 1000 soldiers from iraq to help fight off russian forces in georgias breakaway region of south ossetia,why the pentagon thinks attacking iran is a bad idea us news &amp world report,caucasus in crisis georgia invades south ossetia,indian shoe manufactory and again in a series of you do not like your work,visitors suffering from mental illnesses banned from olympics,no help for mexicos kidnapping surge
1,2008-08-11,1,why wont america and nato help us if they wont help us now why did we help them in iraq,bush puts foot down on georgian conflict,jewish georgian minister thanks to israeli training were fending off russia,georgian army flees in disarray as russians advance gori abandoned to russia without a shot fired,olympic opening ceremony fireworks faked,what were the mossad with fraudulent new zealand passports doing in iraq,russia angered by israeli military sale to georgia,an american citizen living in sossetia blames us and georgian leaders for the genocide of innocent people,...,israel and the us behind the georgian aggression,do not believe tv neither russian nor georgian there are much more victims,riots are still going on in montreal canada because police murdered a boy on saturday,china to overtake us as largest manufacturer,war in south ossetia pics,israeli physicians group condemns state torture,russia has just beaten the united states over the head with peak oil,perhaps the question about the georgia russia conflict,russia is so much better at war,so this is what its come to trading sex for food


Removed all punctuation/syntax and lowercased each headline for easier parsing.

In [5]:
# Drop rows that contain NaN cells; the helper prints the label of every row it removes.
df_no_nan = df_text_remove_nan(df_no_syntax)
# Preview the first two remaining rows.
df_no_nan.iloc[0:2]

removed row: 277
removed row: 348
removed row: 681


Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,georgia downs two russian warplanes as countries move to brink of war,breaking musharraf to be impeached,russia today columns of troops roll into south ossetia footage from fighting youtube,russian tanks are moving towards the capital of south ossetia which has reportedly been completely destroyed by georgian artillery fire,afghan children raped with impunity un official says this is sick a three year old was raped and they do nothing,150 russian tanks have entered south ossetia whilst georgia shoots down two russian jets,breaking georgia invades south ossetia russia warned it would intervene on sos side,the enemy combatent trials are nothing but a sham salim haman has been sentenced to 5 12 years but will be kept longer anyway just because they feel like it,...,georgia invades south ossetia if russia gets involved will nato absorb georgia and unleash a full scale war,alqaeda faces islamist backlash,condoleezza rice the us would not act to prevent an israeli strike on iran israeli defense minister ehud barak israel is prepared for uncompromising victory in the case of military hostilities,this is a busy day the european union has approved new sanctions against iran in protest at its nuclear programme,georgia will withdraw 1000 soldiers from iraq to help fight off russian forces in georgias breakaway region of south ossetia,why the pentagon thinks attacking iran is a bad idea us news &amp world report,caucasus in crisis georgia invades south ossetia,indian shoe manufactory and again in a series of you do not like your work,visitors suffering from mental illnesses banned from olympics,no help for mexicos kidnapping surge
1,2008-08-11,1,why wont america and nato help us if they wont help us now why did we help them in iraq,bush puts foot down on georgian conflict,jewish georgian minister thanks to israeli training were fending off russia,georgian army flees in disarray as russians advance gori abandoned to russia without a shot fired,olympic opening ceremony fireworks faked,what were the mossad with fraudulent new zealand passports doing in iraq,russia angered by israeli military sale to georgia,an american citizen living in sossetia blames us and georgian leaders for the genocide of innocent people,...,israel and the us behind the georgian aggression,do not believe tv neither russian nor georgian there are much more victims,riots are still going on in montreal canada because police murdered a boy on saturday,china to overtake us as largest manufacturer,war in south ossetia pics,israeli physicians group condemns state torture,russia has just beaten the united states over the head with peak oil,perhaps the question about the georgia russia conflict,russia is so much better at war,so this is what its come to trading sex for food


Remove any rows that contain nan values in the columns.

# Baseline Accuracy 
To quickly get a baseline accuracy, combining the columns seems to be the simplest way to get a vectorized data set for classification input. We will do more nuanced vectorization and classification after getting a baseline for this dataset.

## Combine Columns

In [6]:
df = df_no_nan
# Every column from the third onwards (the headline columns); iterating this
# frame yields its column labels, which is what the helper below loops over.
columns = df.iloc[:, 2:]
# One space-joined string per row, built from all headline columns.
concat_column = df_concat_str_columns(df, columns)
# Wrap the returned Series in a one-column DataFrame so it can be concatenated later.
concat_column = pd.DataFrame(concat_column)
# Show the combined text of the first row.
concat_column.iloc[0]

Top1     georgia downs two russian warplanes as countries move to brink of war breaking musharraf to be impeached russia today columns of troops roll into south ossetia footage from fighting youtube russian tanks are moving towards the capital of south ossetia which has reportedly been completely destroyed by georgian artillery fire afghan children raped with impunity un official says  this is sick a three year old was raped and they do nothing 150 russian tanks have entered south ossetia whilst georgia shoots down two russian jets breaking georgia invades south ossetia russia warned it would intervene on sos side the enemy combatent trials are nothing but a sham salim haman has been sentenced to 5 12 years but will be kept longer anyway just because they feel like it georgian troops retreat from s osettain capital presumably leaving several hundred people killed video did the us prep georgia for war with russia rice gives green light for israel to attack iran says us has no veto over 

Concatenate all the headlines into one corpus to be vectorized by a tfidf/word-count vectorizer.

In [7]:
# Remove the individual headline columns (labels taken from the `columns` frame) ...
df_no_col = df.drop(labels = columns.columns, axis = 1)
# ... and place the single concatenated column (still named 'Top1') next to what is left.
df_concat = pd.concat([df_no_col, concat_column], axis = 1)
df_concat.iloc[0:2]

Unnamed: 0,Date,Label,Top1
0,2008-08-08,0,georgia downs two russian warplanes as countries move to brink of war breaking musharraf to be impeached russia today columns of troops roll into south ossetia footage from fighting youtube russian tanks are moving towards the capital of south ossetia which has reportedly been completely destroyed by georgian artillery fire afghan children raped with impunity un official says this is sick a three year old was raped and they do nothing 150 russian tanks have entered south ossetia whilst georgia shoots down two russian jets breaking georgia invades south ossetia russia warned it would intervene on sos side the enemy combatent trials are nothing but a sham salim haman has been sentenced to 5 12 years but will be kept longer anyway just because they feel like it georgian troops retreat from s osettain capital presumably leaving several hundred people killed video did the us prep georgia for war with russia rice gives green light for israel to attack iran says us has no veto over israeli military ops announcingclass action lawsuit on behalf of american public against the fbi sorussia and georgia are at war and the nyts top story is opening ceremonies of the olympics what a fucking disgrace and yet further proof of the decline of journalism china tells bush to stay out of other countries affairs did world war iii start today georgia invades south ossetia if russia gets involved will nato absorb georgia and unleash a full scale war alqaeda faces islamist backlash condoleezza rice the us would not act to prevent an israeli strike on iran israeli defense minister ehud barak israel is prepared for uncompromising victory in the case of military hostilities this is a busy day the european union has approved new sanctions against iran in protest at its nuclear programme georgia will withdraw 1000 soldiers from iraq to help fight off russian forces in georgias breakaway region of south ossetia why the pentagon thinks attacking iran is a bad idea us news &amp world 
report caucasus in crisis georgia invades south ossetia indian shoe manufactory and again in a series of you do not like your work visitors suffering from mental illnesses banned from olympics no help for mexicos kidnapping surge
1,2008-08-11,1,why wont america and nato help us if they wont help us now why did we help them in iraq bush puts foot down on georgian conflict jewish georgian minister thanks to israeli training were fending off russia georgian army flees in disarray as russians advance gori abandoned to russia without a shot fired olympic opening ceremony fireworks faked what were the mossad with fraudulent new zealand passports doing in iraq russia angered by israeli military sale to georgia an american citizen living in sossetia blames us and georgian leaders for the genocide of innocent people welcome to world war iv now in high definition georgias move a mistake of monumental proportions russia presses deeper into georgia us says regime change is goal abhinav bindra wins first ever individual olympic gold medal for india us ship heads for arctic to define territory drivers in a jerusalem taxi station threaten to quit rather than work for their new boss an arab the french team is stunned by phelps and the 4x100m relay team israel and the us behind the georgian aggression do not believe tv neither russian nor georgian there are much more victims riots are still going on in montreal canada because police murdered a boy on saturday china to overtake us as largest manufacturer war in south ossetia pics israeli physicians group condemns state torture russia has just beaten the united states over the head with peak oil perhaps the question about the georgia russia conflict russia is so much better at war so this is what its come to trading sex for food


Remove redundant, non-combined columns and append the new concatenated column onto the existing dataframe with labels.

## Vectorize Data 

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Select scikit-learn's built-in English stop-word list by name instead of via
# stop.ENGLISH_STOP_WORDS: the timing cells further down rebind `stop` to a
# timestamp, which made this cell fail when re-run after them.
sw = 'english'
tfidf = TfidfVectorizer(stop_words = sw)
# Learn the vocabulary and idf weights from the concatenated headlines.
tfidf.fit(df_concat['Top1'])
# Terms ordered by their feature index, read from the fitted vocabulary_
# mapping (term -> index); this does not depend on get_feature_names(), which
# newer scikit-learn releases no longer provide.
feature_names = sorted(tfidf.vocabulary_, key = tfidf.vocabulary_.get)
print(feature_names[0:1000], len(feature_names))

['000', '0001', '001', '003', '004', '005', '006', '007', '0077', '00s', '01', '011', '014', '014ckwh', '017', '02', '0200', '021hour', '022', '0220', '0221', '025', '03', '030', '035017', '04', '05', '050', '05eurgb', '06', '060', '062', '068as', '07', '07232014', '075', '07baku1268', '07pc', '08', '089m', '08baku671', '09', '0900', '0930', '0935', '10', '100', '1000', '10000', '100000', '1000000', '1000000000', '100000man', '100000person', '100000th', '10000s', '10000strong', '10000yearold', '10001500', '1000km', '1000megawatt', '1000per', '1000s', '1000strong', '1000x', '1000year', '1000yearold', '100200', '1004', '1004am', '1006', '100abarrel', '100apack', '100billion', '100bn', '100day', '100ds', '100fold', '100foot', '100ft', '100k', '100km', '100m', '100mil', '100million', '100mstretch', '100mw', '100page', '100s', '100th', '100x', '100year', '100yearold', '101', '1011', '1012', '1017', '101st', '102', '1020', '102000', '1021', '10262010', '102day', '102yearold', '103', '1030', 

Build the vocabulary (the dictionary of terms) of a tfidf vectorizer from our concatenated dataset.

In [9]:
# Turn every concatenated headline string into one TF-IDF row using the fitted vectorizer.
features_concat = tfidf.transform(df_concat['Top1'])
# Target labels for classification.
data_labels = df_concat['Label']
# Show the matrix summary, then its stored (row, column) value entries.
display(features_concat)
print(features_concat)

<1986x39900 sparse matrix of type '<class 'numpy.float64'>'
	with 489780 stored elements in Compressed Sparse Row format>

  (0, 39697)	0.04792113397841143
  (0, 39586)	0.018635470668102998
  (0, 39576)	0.02309480578615759
  (0, 39317)	0.03510663510162514
  (0, 39295)	0.03222955083293025
  (0, 39198)	0.052627215370409826
  (0, 38962)	0.07472527781990451
  (0, 38644)	0.05458685467845757
  (0, 38632)	0.03848278340226787
  (0, 38591)	0.09580465919607202
  (0, 38346)	0.05874965900421731
  (0, 38220)	0.028760672420553638
  (0, 38214)	0.04589661170480536
  (0, 38159)	0.0641100318305066
  (0, 37476)	0.07190063735605241
  (0, 37430)	0.03528136571540746
  (0, 37203)	0.08424845497195356
  (0, 36718)	0.057575186629517626
  (0, 36629)	0.055135230165267744
  (0, 36160)	0.0639523086984654
  (0, 35872)	0.05955281974015128
  (0, 35600)	0.035923795876247486
  (0, 35374)	0.09332647229079422
  (0, 34955)	0.05458685467845757
  (0, 34709)	0.04748600345927687
  :	:
  (1985, 3536)	0.09229817581704898
  (1985, 3108)	0.0404703132373173
  (1985, 3009)	0.03580320373466395
  (1985, 2796)	0.06189488586264328
  (1985, 2761)	0.035896283

We have converted the *whole* column of concatenated headlines into a single sparse matrix using the tfidf transformer built on the dictionary created earlier. At this point, I'm not sure if I should have created a sparse matrix for each row, or use the above method instead.

## Training/Test Sets

In [10]:
from sklearn.model_selection import train_test_split
# Hold out part of the rows for testing: 80% go to training, and the fixed
# random_state keeps the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    features_concat,
    data_labels,
    train_size = 0.80,
    random_state = 1234
)
# Show the summaries of the two feature matrices.
display(x_train)
display(x_test)



<1588x39900 sparse matrix of type '<class 'numpy.float64'>'
	with 392171 stored elements in Compressed Sparse Row format>

<398x39900 sparse matrix of type '<class 'numpy.float64'>'
	with 97609 stored elements in Compressed Sparse Row format>

Partition the whole concatenated dataset into separate train/test sets.

## Begin training

### Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian naive Bayes on the densified training matrix, then report the
# test accuracy and how long the fit took.
gnbc = GaussianNB()
start = time.time()
gnbc.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(gnbc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

46.98% || 2.33 seconds


### Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier
# Fit a decision tree with default settings on the densified training matrix,
# then report the test accuracy and how long the fit took.
dtc = DecisionTreeClassifier()
start = time.time()
dtc.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(dtc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

50.75% || 27.11 seconds


### Logistic Regression Classifier

In [13]:
from sklearn.linear_model import LogisticRegression
# Fit logistic regression with default settings on the densified training
# matrix, then report the test accuracy and how long the fit took.
lgc = LogisticRegression()
start = time.time()
lgc.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(lgc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

51.01% || 0.63 seconds


These classifiers didn't do very well. They are rather basic though. Let's try some ensemble methods to see if the more complex algorithms can produce better results right off the bat.

## Ensemble Methods

### Random Forest (Gini)

In [14]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 100 trees split on gini impurity; report the test accuracy
# and how long the fit took.
rfc_gini = RandomForestClassifier(n_estimators = 100, criterion = 'gini')
start = time.time()
rfc_gini.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

  from numpy.core.umath_tests import inner1d


53.02% || 15.80 seconds


Not unexpectedly, the ensemble method performed slightly better than the simpler classification algorithms above. Let's tweak some hyperparameters to see if that makes a difference.

In [15]:
# Same gini random forest, but with 1000 trees instead of 100.
rfc_gini = RandomForestClassifier(n_estimators = 1000, criterion = 'gini')
start = time.time()
rfc_gini.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

51.01% || 157.33 seconds


Increasing the estimators doesn't seem to result in a significant increase in accuracy. This doesn't even take into consideration that the algorithm took 2.5 minutes to run.

In [16]:
# Gini random forest of 100 trees whose leaves must each hold at least 200 samples.
rfc_gini = RandomForestClassifier(n_estimators = 100, criterion = 'gini', min_samples_leaf = 200)
start = time.time()
rfc_gini.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

53.77% || 1.49 seconds


Increasing the minimum number of samples for a leaf node also doesn't significantly change the accuracy; however, it does reduce runtime significantly. Let's try a different split criterion from gini.

### Random Forest (Information Gain)

In [17]:
# Random forest of 100 trees split on entropy (information gain) instead of gini.
rfc_info = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')
start = time.time()
rfc_info.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(rfc_info.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

47.49% || 19.20 seconds


Using information gain with default parameters also doesn't significantly change the accuracy of the model generated. Next we'll try Extremely Randomized Trees from sklearn.

### Extremely Randomized Trees

In [18]:
from sklearn.ensemble import ExtraTreesClassifier
# Extremely randomized trees: 100 trees split on gini; report the test
# accuracy and how long the fit took.
etc = ExtraTreesClassifier(n_estimators = 100, criterion = 'gini')
start = time.time()
etc.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(etc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

53.02% || 51.43 seconds


Again, not a significant increase in the accuracy of the model. Let's try one more ensemble method -- boosting.

### AdaBoost

In [19]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with default settings; report the test accuracy and how long the
# fit took.
abc = AdaBoostClassifier()
start = time.time()
abc.fit(x_train.toarray(), y_train)
# Named `end`, not `stop`: `stop` is the stop-words module alias from the
# imports cell and must not be overwritten with a timestamp.
end = time.time()
print(make_percent(abc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%(end - start) + ' seconds')

51.51% || 95.99 seconds


Not surprisingly, using AdaBoost doesn't affect the accuracy significantly. It seems that the way the data was prepared is not sufficient for creating an accurate classifier for the DJIA based on the headlines.

# Improving Accuracy
The next step is not to concatenate the headlines and, instead, weight each of the 25 headlines for each tuple accordingly to provide the classifiers with another source of important information to utilize when classifying the data.

## Data Preparation Mark II
The initial data preparation involving concatenation of the headlines and then vectorization of the words of each tuple using tfidf was not sufficient in creating a more accurate classifier of the movement of the DJIA versus news headlines.

### Change TFIDF Frequency Structures
Changing the min frequency and max frequency hyperparameters, and then running a couple of our best classifiers from above, could potentially remove extraneous information and improve the accuracy of the predictions.

#### Reinitializing Variables

In [20]:
# Point `df` at the concatenated frame; this is a plain assignment, so both
# names refer to the same object.
df = df_concat
# Preview the first two rows.
df.iloc[0:2]

Unnamed: 0,Date,Label,Top1
0,2008-08-08,0,georgia downs two russian warplanes as countries move to brink of war breaking musharraf to be impeached russia today columns of troops roll into south ossetia footage from fighting youtube russian tanks are moving towards the capital of south ossetia which has reportedly been completely destroyed by georgian artillery fire afghan children raped with impunity un official says this is sick a three year old was raped and they do nothing 150 russian tanks have entered south ossetia whilst georgia shoots down two russian jets breaking georgia invades south ossetia russia warned it would intervene on sos side the enemy combatent trials are nothing but a sham salim haman has been sentenced to 5 12 years but will be kept longer anyway just because they feel like it georgian troops retreat from s osettain capital presumably leaving several hundred people killed video did the us prep georgia for war with russia rice gives green light for israel to attack iran says us has no veto over israeli military ops announcingclass action lawsuit on behalf of american public against the fbi sorussia and georgia are at war and the nyts top story is opening ceremonies of the olympics what a fucking disgrace and yet further proof of the decline of journalism china tells bush to stay out of other countries affairs did world war iii start today georgia invades south ossetia if russia gets involved will nato absorb georgia and unleash a full scale war alqaeda faces islamist backlash condoleezza rice the us would not act to prevent an israeli strike on iran israeli defense minister ehud barak israel is prepared for uncompromising victory in the case of military hostilities this is a busy day the european union has approved new sanctions against iran in protest at its nuclear programme georgia will withdraw 1000 soldiers from iraq to help fight off russian forces in georgias breakaway region of south ossetia why the pentagon thinks attacking iran is a bad idea us news &amp world 
report caucasus in crisis georgia invades south ossetia indian shoe manufactory and again in a series of you do not like your work visitors suffering from mental illnesses banned from olympics no help for mexicos kidnapping surge
1,2008-08-11,1,why wont america and nato help us if they wont help us now why did we help them in iraq bush puts foot down on georgian conflict jewish georgian minister thanks to israeli training were fending off russia georgian army flees in disarray as russians advance gori abandoned to russia without a shot fired olympic opening ceremony fireworks faked what were the mossad with fraudulent new zealand passports doing in iraq russia angered by israeli military sale to georgia an american citizen living in sossetia blames us and georgian leaders for the genocide of innocent people welcome to world war iv now in high definition georgias move a mistake of monumental proportions russia presses deeper into georgia us says regime change is goal abhinav bindra wins first ever individual olympic gold medal for india us ship heads for arctic to define territory drivers in a jerusalem taxi station threaten to quit rather than work for their new boss an arab the french team is stunned by phelps and the 4x100m relay team israel and the us behind the georgian aggression do not believe tv neither russian nor georgian there are much more victims riots are still going on in montreal canada because police murdered a boy on saturday china to overtake us as largest manufacturer war in south ossetia pics israeli physicians group condemns state torture russia has just beaten the united states over the head with peak oil perhaps the question about the georgia russia conflict russia is so much better at war so this is what its come to trading sex for food


#### Setting max_df and min_df
By removing words that occur too often (in over 50 percent of the documents) and words that occur too infrequently (in under 20 percent of them), the bias that these words may introduce can be eliminated.

In [22]:
# Build a TF-IDF vocabulary that skips English stop words, terms appearing in
# more than 50% of the documents (max_df) and terms appearing in fewer than
# 20% of the documents (min_df).
# The stop-word list is imported from its public home in
# `sklearn.feature_extraction.text`; the `stop_words` module itself was
# deprecated by scikit-learn, and its short alias `stop` is an easy name to
# overwrite by accident.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
sw = ENGLISH_STOP_WORDS
tfidf = TfidfVectorizer(stop_words = sw, max_df = .50, min_df = .2)
tfidf.fit(df_concat['Top1'])
# Fetch the vocabulary once instead of rebuilding the list for every use.
feature_names = tfidf.get_feature_names()
print(feature_names[0:1000], len(feature_names))

['afghanistan', 'american', 'amp', 'army', 'arrested', 'attack', 'attacks', 'australia', 'australian', 'ban', 'bank', 'bbc', 'billion', 'british', 'calls', 'canada', 'canadian', 'children', 'chinese', 'city', 'countries', 'country', 'court', 'crisis', 'day', 'dead', 'death', 'drug', 'end', 'eu', 'europe', 'european', 'forces', 'france', 'french', 'gaza', 'german', 'germany', 'global', 'group', 'human', 'india', 'international', 'internet', 'iran', 'iraq', 'islamic', 'israeli', 'japan', 'just', 'killed', 'korea', 'law', 'leader', 'like', 'man', 'men', 'mexico', 'military', 'million', 'minister', 'news', 'north', 'nuclear', 'official', 'officials', 'oil', 'pakistan', 'palestinian', 'power', 'president', 'prime', 'protest', 'public', 'report', 'rights', 'russia', 'russian', 'said', 'saudi', 'say', 'security', 'set', 'south', 'state', 'states', 'stop', 'syria', 'syrian', 'thousands', 'time', 'troops', 'uk', 'united', 'use', 'video', 'war', 'west', 'woman', 'women', 'worlds', 'year'] 102


In [23]:
# Project the concatenated headline text onto the fitted TF-IDF vocabulary;
# the result is a sparse document-term matrix.
features_concat = tfidf.transform(df_concat['Top1'])
# Target values for the classifiers, taken from the 'Label' column.
data_labels = df_concat['Label']
# display() shows the matrix summary, print() lists its stored entries.
display(features_concat)
print(features_concat)

<1986x102 sparse matrix of type '<class 'numpy.float64'>'
	with 56897 stored elements in Compressed Sparse Row format>

  (0, 101)	0.07569580540710538
  (0, 96)	0.3140104708716182
  (0, 95)	0.0942663161180859
  (0, 91)	0.18870910471123906
  (0, 83)	0.5412995517653361
  (0, 77)	0.369644380306084
  (0, 76)	0.26922219001937914
  (0, 74)	0.07634022903924191
  (0, 73)	0.09114181294020089
  (0, 72)	0.08971254961103152
  (0, 64)	0.09400286842874288
  (0, 63)	0.07927083753694732
  (0, 61)	0.07511482601478157
  (0, 60)	0.07176127373697831
  (0, 58)	0.1429506644750222
  (0, 54)	0.18458622650111228
  (0, 50)	0.0654456600861287
  (0, 49)	0.08194676420696917
  (0, 47)	0.21701970770540954
  (0, 45)	0.08924819121605444
  (0, 44)	0.29230483382749645
  (0, 32)	0.08679385454179811
  (0, 31)	0.08971254961103152
  (0, 24)	0.09453166140617489
  (0, 23)	0.09382827752584184
  :	:
  (1985, 57)	0.13901629124389447
  (1985, 56)	0.13811013233646277
  (1985, 55)	0.22169796203630235
  (1985, 50)	0.19248604769762137
  (1985, 44)	0.21492869852759777
  (1985, 43)	0.12924807869494
  (1985, 41)	0.11913531532151937
  (1985, 40)	0.2379094

## Training Mark II

In [24]:
# Split features and labels 80% / 20% into train and test sets; the fixed
# random_state makes the split repeatable.
# NOTE(review): the TF-IDF vectorizer was fitted on every row before this
# split, so its vocabulary and IDF weights have already seen the test rows --
# confirm whether that leakage is acceptable for this experiment.
x_train, x_test, y_train, y_test = train_test_split(
    features_concat,
    data_labels,
    train_size = 0.80,
    random_state = 1234
)
# Summaries of the two sparse splits.
display(x_train)
display(x_test)



<1588x102 sparse matrix of type '<class 'numpy.float64'>'
	with 45622 stored elements in Compressed Sparse Row format>

<398x102 sparse matrix of type '<class 'numpy.float64'>'
	with 11275 stored elements in Compressed Sparse Row format>

### Naive Bayes

In [25]:
# Fit Gaussian naive Bayes on the densified training split, then report the
# test accuracy and the fitting time.
from sklearn.naive_bayes import GaussianNB
gnbc = GaussianNB()
# perf_counter() is the clock intended for measuring intervals. The result is
# kept in `elapsed` so that the `stop` alias, which the notebook's imports
# bind to scikit-learn's stop-word module, is not overwritten.
# The timed region includes the sparse-to-dense conversion.
start = time.perf_counter()
gnbc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(gnbc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

49.25% || 0.01 seconds


### Decision Tree

In [26]:
# Fit a single decision tree with default settings on the densified training
# split, then report the test accuracy and the fitting time.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
# perf_counter() is the clock intended for measuring intervals. The result is
# kept in `elapsed` so that the `stop` alias, which the notebook's imports
# bind to scikit-learn's stop-word module, is not overwritten.
start = time.perf_counter()
dtc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(dtc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

53.52% || 0.12 seconds


### Logistic Regression Classifier

In [27]:
# Fit logistic regression with default settings on the densified training
# split, then report the test accuracy and the fitting time.
from sklearn.linear_model import LogisticRegression
lgc = LogisticRegression()
# perf_counter() is the clock intended for measuring intervals. The result is
# kept in `elapsed` so that the `stop` alias, which the notebook's imports
# bind to scikit-learn's stop-word module, is not overwritten.
start = time.perf_counter()
lgc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(lgc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

47.74% || 0.01 seconds


## Ensemble Methods

### Random Forest (Gini)

In [28]:
# Fit a 100-tree random forest (Gini criterion) on the densified training
# split, then report the test accuracy and the fitting time.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
from sklearn.ensemble import RandomForestClassifier
rfc_gini = RandomForestClassifier(n_estimators = 100, criterion = 'gini')
# perf_counter() is the clock intended for measuring intervals. The result is
# kept in `elapsed` so that the `stop` alias, which the notebook's imports
# bind to scikit-learn's stop-word module, is not overwritten.
start = time.perf_counter()
rfc_gini.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

51.26% || 0.81 seconds


In [29]:
# Same forest with ten times as many trees (1000); replaces the previous
# `rfc_gini` model.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
rfc_gini = RandomForestClassifier(n_estimators = 1000, criterion = 'gini')
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
rfc_gini.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

49.50% || 8.55 seconds


In [30]:
# 100-tree forest whose leaves must each hold at least 200 samples; replaces
# the previous `rfc_gini` model.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
rfc_gini = RandomForestClassifier(n_estimators = 100, criterion = 'gini', min_samples_leaf = 200)
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
rfc_gini.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

53.52% || 0.26 seconds


### Random Forest (Information Gain)

In [31]:
# 100-tree random forest that splits on entropy instead of Gini impurity.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
rfc_info = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
rfc_info.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(rfc_info.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

48.99% || 1.48 seconds


### Extremely Randomized Trees

In [32]:
# Fit a 100-tree extra-trees ensemble (Gini criterion) on the densified
# training split, then report the test accuracy and the fitting time.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators = 100, criterion = 'gini')
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
etc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(etc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

51.01% || 0.59 seconds


### AdaBoost

In [33]:
# Fit AdaBoost with default settings on the densified training split, then
# report the test accuracy and the fitting time.
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
abc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(abc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

52.01% || 0.52 seconds


It appears that removing the overly frequent and overly infrequent words from the dictionary and TF-IDF matrix had no effect, or even a detrimental effect, on the accuracy of the classifiers relative to the first training run and the baseline accuracies obtained.

## Data Preparation Mark III
Neither the original use of TFIDF nor the TFIDF with changed hyperparameters from Mark II affected the general accuracies.

### Weight each headline

Weighting each headline in some fashion (for example, the 1st headline gets its frequencies multiplied by 25, the 2nd by 24, the 3rd by 23, etc.) may lead to an increase in accuracy, as it potentially models more closely how humans interpret headline rankings. This closer interpretation may prove useful, as humans buying and selling stocks ultimately decide the movement of the DJIA.

Also, because the weights are being created outside of tfidf, a count vectorizer will be used instead. TFIDF is meant to be a means of weighting word frequencies already -- artificially increasing the word frequencies may not be beneficial for improving accuracies.

#### Reinitialize Data

In [34]:
# Restart from the NaN-free frame. This binds a second name to the same
# object; it does not copy it.
df = df_no_nan
# Everything from the third column onward (positional slice 2:).
# NOTE(review): presumably these are the per-rank headline columns -- confirm.
columns = df.iloc[:, 2:]
# Build the weighted concatenation with a helper defined elsewhere in the
# notebook, then wrap the result in a DataFrame so it can be joined
# column-wise with the remaining columns.
concat_column = df_concat_str_columns_weighted(df, columns)
concat_column = pd.DataFrame(concat_column)

#### Concat Columns Weighted
As described above, if the headline is in column 'Top 1', it will be concatenated 25 times so that the words it contains carry more weight in the count matrix. If the headline is in column 'Top 2', it will be concatenated 24 times, and so on.

In [35]:
# Replace the columns held in `columns` with the single weighted,
# concatenated text column, then show the first row's text.
df_no_col = df.drop(labels = columns.columns, axis = 1)
df_concat = pd.concat([df_no_col, concat_column], axis = 1)
# One positional lookup (row 0, column 2) replaces the chained `.iloc[0][2]`.
# The chained form indexed the row Series by integer position through `[]`,
# a fallback pandas has deprecated for label-indexed Series.
df_concat.iloc[0, 2]

' georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two russian warplanes as countries move to brink of war georgia downs two 

#### Re-Vectorize Data

In [37]:
# Build a plain term-count vocabulary (English stop words removed) from the
# weighted, concatenated headline text.
# The stop-word list is imported from its public home in
# `sklearn.feature_extraction.text`; the `stop_words` module itself was
# deprecated by scikit-learn, and its short alias `stop` is an easy name to
# overwrite by accident.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
sw = ENGLISH_STOP_WORDS
cv = CountVectorizer(stop_words = sw)
cv.fit(df_concat['Top1'])
# Fetch the vocabulary once instead of rebuilding the list for every use.
feature_names = cv.get_feature_names()
print(feature_names[0:1000], len(feature_names))

['000', '0001', '001', '003', '004', '005', '006', '007', '0077', '00s', '01', '011', '014', '014ckwh', '017', '02', '0200', '021hour', '022', '0220', '0221', '025', '03', '030', '035017', '04', '05', '050', '05eurgb', '06', '060', '062', '068as', '07', '07232014', '075', '07baku1268', '07pc', '08', '089m', '08baku671', '09', '0900', '0930', '0935', '10', '100', '1000', '10000', '100000', '1000000', '1000000000', '100000man', '100000person', '100000th', '10000s', '10000strong', '10000yearold', '10001500', '1000km', '1000megawatt', '1000per', '1000s', '1000strong', '1000x', '1000year', '1000yearold', '100200', '1004', '1004am', '1006', '100abarrel', '100apack', '100billion', '100bn', '100day', '100ds', '100fold', '100foot', '100ft', '100k', '100km', '100m', '100mil', '100million', '100mstretch', '100mw', '100page', '100s', '100th', '100x', '100year', '100yearold', '101', '1011', '1012', '1017', '101st', '102', '1020', '102000', '1021', '10262010', '102day', '102yearold', '103', '1030', 

In [38]:
# Count the vocabulary terms in the weighted headline text; the result is a
# sparse document-term matrix of integer counts.
features_concat = cv.transform(df_concat['Top1'])
# Target values for the classifiers, taken from the 'Label' column.
data_labels = df_concat['Label']
# display() shows the matrix summary, print() lists its stored entries.
display(features_concat)
print(features_concat)

<1986x39900 sparse matrix of type '<class 'numpy.int64'>'
	with 489780 stored elements in Compressed Sparse Row format>

  (0, 47)	6
  (0, 243)	18
  (0, 408)	20
  (0, 2572)	10
  (0, 2741)	8
  (0, 2746)	14
  (0, 2952)	12
  (0, 2983)	21
  (0, 3490)	9
  (0, 3604)	14
  (0, 3652)	5
  (0, 3848)	14
  (0, 4323)	7
  (0, 4546)	22
  (0, 4798)	15
  (0, 4803)	5
  (0, 5134)	9
  (0, 5159)	5
  (0, 5340)	2
  (0, 5357)	8
  (0, 5671)	14
  (0, 6745)	6
  (0, 6750)	43
  (0, 6850)	25
  (0, 7165)	12
  :	:
  (1985, 37750)	21
  (1985, 37755)	5
  (1985, 37832)	18
  (1985, 37914)	10
  (1985, 38030)	3
  (1985, 38094)	3
  (1985, 38220)	15
  (1985, 38234)	6
  (1985, 38297)	15
  (1985, 38314)	15
  (1985, 38517)	25
  (1985, 38589)	18
  (1985, 38591)	5
  (1985, 38660)	13
  (1985, 38816)	1
  (1985, 38836)	15
  (1985, 39023)	15
  (1985, 39150)	18
  (1985, 39247)	25
  (1985, 39258)	28
  (1985, 39300)	2
  (1985, 39317)	9
  (1985, 39356)	10
  (1985, 39414)	15
  (1985, 39586)	25


## Training Mark III 

In [39]:
# Split features and labels 80% / 20% into train and test sets; the fixed
# random_state makes the split repeatable.
# NOTE(review): the count vectorizer was fitted on every row before this
# split, so its vocabulary already includes terms from the test rows --
# confirm whether that leakage is acceptable for this experiment.
x_train, x_test, y_train, y_test = train_test_split(
    features_concat,
    data_labels,
    train_size = 0.80,
    random_state = 1234
)
# Summaries of the two sparse splits.
display(x_train)
display(x_test)



<1588x39900 sparse matrix of type '<class 'numpy.int64'>'
	with 392171 stored elements in Compressed Sparse Row format>

<398x39900 sparse matrix of type '<class 'numpy.int64'>'
	with 97609 stored elements in Compressed Sparse Row format>

### Naive Bayes

In [40]:
# Fit Gaussian naive Bayes on the densified weighted-count training split,
# then report the test accuracy and the fitting time.
from sklearn.naive_bayes import GaussianNB
gnbc = GaussianNB()
# perf_counter() is the clock intended for measuring intervals. The result is
# kept in `elapsed` so that the `stop` alias, which the notebook's imports
# bind to scikit-learn's stop-word module, is not overwritten.
# The timed region includes the sparse-to-dense conversion.
start = time.perf_counter()
gnbc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(gnbc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

50.00% || 2.23 seconds


### Decision Tree

In [41]:
# Fit a single decision tree with default settings on the densified
# weighted-count training split, then report accuracy and fitting time.
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
# perf_counter() is the clock intended for measuring intervals. The result is
# kept in `elapsed` so that the `stop` alias, which the notebook's imports
# bind to scikit-learn's stop-word module, is not overwritten.
start = time.perf_counter()
dtc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(dtc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

52.01% || 37.02 seconds


### Logistic Regression Classifier

In [42]:
# Fit logistic regression with default settings on the densified
# weighted-count training split, then report accuracy and fitting time.
from sklearn.linear_model import LogisticRegression
lgc = LogisticRegression()
# perf_counter() is the clock intended for measuring intervals. The result is
# kept in `elapsed` so that the `stop` alias, which the notebook's imports
# bind to scikit-learn's stop-word module, is not overwritten.
start = time.perf_counter()
lgc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(lgc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

48.99% || 1.18 seconds


## Ensemble Methods

### Random Forest (Gini)

In [43]:
# Fit a 100-tree random forest (Gini criterion) on the densified
# weighted-count training split, then report accuracy and fitting time.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
from sklearn.ensemble import RandomForestClassifier
rfc_gini = RandomForestClassifier(n_estimators = 100, criterion = 'gini')
# perf_counter() is the clock intended for measuring intervals. The result is
# kept in `elapsed` so that the `stop` alias, which the notebook's imports
# bind to scikit-learn's stop-word module, is not overwritten.
start = time.perf_counter()
rfc_gini.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

51.76% || 16.64 seconds


In [44]:
# Same forest with ten times as many trees (1000); replaces the previous
# `rfc_gini` model.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
rfc_gini = RandomForestClassifier(n_estimators = 1000, criterion = 'gini')
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
rfc_gini.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

53.02% || 176.53 seconds


In [45]:
# 100-tree forest whose leaves must each hold at least 200 samples; replaces
# the previous `rfc_gini` model.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
rfc_gini = RandomForestClassifier(n_estimators = 100, criterion = 'gini', min_samples_leaf = 200)
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
rfc_gini.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(rfc_gini.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

53.77% || 1.44 seconds


### Random Forest (Information Gain)

In [46]:
# 100-tree random forest that splits on entropy instead of Gini impurity.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
rfc_info = RandomForestClassifier(n_estimators = 100, criterion = 'entropy')
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
rfc_info.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(rfc_info.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

50.00% || 21.65 seconds


### Extremely Randomized Trees

In [47]:
# Fit a 100-tree extra-trees ensemble (Gini criterion) on the densified
# weighted-count training split, then report accuracy and fitting time.
# NOTE(review): no random_state is given, so the score is expected to vary
# from run to run.
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators = 100, criterion = 'gini')
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
etc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(etc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

53.77% || 55.92 seconds


### AdaBoost

In [48]:
# Fit AdaBoost with default settings on the densified weighted-count training
# split, then report the test accuracy and the fitting time.
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
# Interval timing with perf_counter(); the result is kept in `elapsed` so the
# `stop` alias of the stop-word module is not overwritten.
start = time.perf_counter()
abc.fit(x_train.toarray(), y_train)
elapsed = time.perf_counter() - start
print(make_percent(abc.score(x_test.toarray(), y_test)), '|| ' + '%.2f'%elapsed + ' seconds')

50.25% || 95.98 seconds


## Conclusions

For all the different methods attempted here to extract meaningful accuracies out of the data, no really "good" accuracies were achieved. The best accuracies with the random seed of "1234" for the train and test splitting were around 53 - 55 %. Nothing too meaningful, and probably within the standard deviation of the ability of the models.


If I had more time, I would do two major things to attempt to improve accuracies:
1. Take the previous x days of headlines and somehow incorporate them into the vectorization process, maybe weighting them less than the current day's. The idea is that previous days' news also affects the current day's stock movements, and this is an attempt to structure the data for the classification algorithms around that idea.

2. Take the previous x days of DJIA movements, finding y days where it consecutively went up/stayed the same (label 1), or went down (label 0). Depending on the overall analysis of the last x days' movements, then somehow combine that "momentum" into the prediction of our labels with the headlines.

Both of these ideas seem like they may be promising in not only increasing the amount of information that the algorithms have to work with, but also more closely modeling the real world information available to humans -- who ultimately decide how the markets change and what news gets posted.