In [1]:
#This code is a practice coding project from https://www.kaggle.com/ndrewgele/omg-nlp-with-the-djia-and-reddit
# The data is also provided from https://www.kaggle.com/aaron7sun/stocknews


import numpy as np #linear algebra
import pandas as pd #data analysis tool for python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from warnings import simplefilter


# Load the three Kaggle CSV files from the current working directory.
# Only CombinedNews is used by the cells shown below; RedditNews and DJIA are
# loaded under their original names in case other cells rely on them.
RedditNews, DJIA, CombinedNews = (
    pd.read_csv(csv_name)
    for csv_name in ("RedditNews.csv", "DJIA_table.csv", "Combined_News_DJIA.csv")
)
#CombinedNews.head()   # check the header of our data

#the Label variable will be a 1 if the DJIA stayed the same or rose on that date 
#or 0 if the DJIA fell on that date.


In [2]:
# Chronological hold-out split: everything dated up to the end of 2014 is used
# for training, everything from 2015 onwards for testing.
# NOTE(review): the comparison is done on the raw 'Date' strings, which only
# orders correctly if they are stored as 'YYYY-MM-DD' text - confirm.
is_train_period = CombinedNews['Date'].lt('2015-01-01')
is_test_period = CombinedNews['Date'].gt('2014-12-31')

Train = CombinedNews.loc[is_train_period]
Test = CombinedNews.loc[is_test_period]


In [3]:
# Before we process data we need to do some text processing, 
# and we will accomplish that with sklearn functions that we have imported.

#    - Turn sentence into lower case letters 
#    - Split sentence into individual words
#    - remove meaningless words
#    - transforming that into a table of counts

In [4]:
# Collapse every training row into one text document: all cells at column
# positions 2..26 (the iloc slice end, 27, is exclusive) are converted to str
# and joined with single spaces so the vectorizer can split them into words
# again later.
# NOTE(review): positions 2..26 are presumably the headline columns - confirm
# against the CSV layout.
TrainHeadlines = [
    ' '.join(str(headline) for headline in Train.iloc[row_pos, 2:27])
    for row_pos in range(len(Train.index))
]
    
#TrainHeadlines[:10]

In [5]:
# Bag-of-words features: learn the vocabulary from the training documents and
# count how often each vocabulary term occurs in each document.

BasicVectorizer = CountVectorizer()
BasicTrain = BasicVectorizer.fit_transform(TrainHeadlines)

# Shape is (number of documents, number of vocabulary terms).
doc_term_shape = BasicTrain.shape
print(doc_term_shape)

(1611, 31675)


In [17]:
# Fit a logistic regression with the word-count matrix as features (X) and the
# training rows' "Label" column as the target (y).

TrainModel = LogisticRegression()
TrainModel.fit(BasicTrain, Train["Label"])  # fit() trains the estimator in place

NameError: name 'simplefilter' is not defined

In [7]:
# Build one space-joined text document per test row, using the same column
# slice (positions 2..26) and str conversion as for the training rows.

TestHeadlines = [
    ' '.join(str(headline) for headline in Test.iloc[row_pos, 2:27])
    for row_pos in range(len(Test.index))
]

# Encode the test documents with the vectorizer that was fitted on the training
# headlines (transform only, no refit), then predict a label for each test row.
BasicTest = BasicVectorizer.transform(TestHeadlines)
Predictions = TrainModel.predict(BasicTest)

In [8]:
# Confusion matrix: frequency of each (actual label, predicted label) pair.
pd.crosstab(
    index=Test["Label"],
    columns=Predictions,
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,61,125
1,92,100


In [9]:
# Calculate the accuracy of our predictions: the share of test rows whose
# predicted label equals the actual label.
# It is computed from the data rather than typed in from the crosstab output,
# so the number cannot go stale when the data or the model changes.
Accuracy = accuracy_score(Test["Label"], Predictions)
print(Accuracy)
# In the recorded run this is about 0.426 (roughly 43%), i.e. below the 50% a
# coin flip would achieve, so the model is not reliable.

0.42592592592592593


In [14]:
# Now we take a look at the coefficients of the model: pair every vocabulary
# term with the weight the fitted logistic regression assigned to it.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older installs.
if hasattr(BasicVectorizer, "get_feature_names_out"):
    BasicWords = list(BasicVectorizer.get_feature_names_out())
else:
    BasicWords = BasicVectorizer.get_feature_names()

# Take the first row of coef_ (the labels are 0/1, so a single row of weights
# is expected), aligned with the vocabulary order.
BasicCoefficients = TrainModel.coef_[0].tolist()
coeffdf = pd.DataFrame({"Word": BasicWords,
                        "Coeff": BasicCoefficients})

# Largest coefficient first; ties are broken alphabetically by word.
coeffdf = coeffdf.sort_values(['Coeff', 'Word'], ascending=[False, True])
coeffdf.head(10)  # most positive coefficients


Unnamed: 0,Word,Coeff
19419,nigeria,0.497924
25261,self,0.452526
29286,tv,0.428011
15998,korea,0.425863
20135,olympics,0.425716
15843,kills,0.411636
26323,so,0.411267
29256,turn,0.394855
10874,fears,0.388555
28274,territory,0.384031


In [16]:
# Most negative coefficients: coeffdf is sorted by descending coefficient, so
# the last ten rows hold the ten lowest values.
coeffdf.iloc[-10:]

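# Now we can see which words carry the most weight in our model: the more positive a word's coefficient,
# the more its presence in the headlines pushes the prediction towards Label 1 (DJIA rose or stayed the same);
# the more negative the coefficient, the more it pushes the prediction towards Label 0 (DJIA fell).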

# This model only looks at how often each individual word appears in the headlines, which usually isn't
# that reliable, because words are more meaningful when they are read in the context of a sentence.

Unnamed: 0,Word,Coeff
27299,students,-0.424441
8478,did,-0.427079
6683,congo,-0.431925
12818,hacking,-0.444069
7139,country,-0.44857
16949,low,-0.463116
3651,begin,-0.470454
25433,sex,-0.494555
24754,sanctions,-0.549725
24542,run,-0.587794
