In [1]:
#This code is a practice coding project from https://www.kaggle.com/ndrewgele/omg-nlp-with-the-djia-and-reddit
# The data is also provided from https://www.kaggle.com/aaron7sun/stocknews


import numpy as np #linear algebra
import pandas as pd #data analysis tool for python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from warnings import simplefilter


# Load the three CSV files of the Kaggle stock-news dataset from the working directory.
CombinedNews = pd.read_csv("Combined_News_DJIA.csv")
DJIA = pd.read_csv("DJIA_table.csv")
RedditNews = pd.read_csv("RedditNews.csv")

# The `Label` column is 1 when the DJIA rose or stayed the same on that date
# and 0 when the DJIA fell on that date.


In [2]:
# Chronological hold-out split: every row dated up to the end of 2014 is used
# for training, every later row for testing.
# NOTE(review): 'Date' is compared against string literals, which orders dates
# correctly only if the column holds ISO-formatted (YYYY-MM-DD) strings -- confirm.
Train = CombinedNews.loc[CombinedNews['Date'] < '2015-01-01']
Test = CombinedNews.loc[CombinedNews['Date'] > '2014-12-31']


In [3]:
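# Before modelling, the headline text has to be turned into numeric features.
# We do that with scikit-learn's CountVectorizer, imported above:

#    - convert each sentence to lower case
#    - split each sentence into individual words (tokens)
#    - optionally remove uninformative "stop" words -- note that the default
#      CountVectorizer() used below does NOT do this unless `stop_words` is set
#    - turn the result into a table of per-document word counts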

In [4]:
# Collapse each training row into a single document string, giving one list
# entry per row. The slice 2:27 selects the columns at positions 2 through 26,
# which (per the original author's note) hold the headlines. Every value is
# passed through str() and the pieces are joined with a single space so the
# vectorizer can split them into words again later.
TrainHeadlines = [
    ' '.join(str(headline) for headline in Train.iloc[day, 2:27])
    for day in range(len(Train.index))
]

In [5]:
# Bag-of-words features: a default CountVectorizer learns the vocabulary from
# the training documents and returns the matrix of per-document word counts.
# The printed shape is (number of documents, number of distinct words).

BasicVectorizer = CountVectorizer()
BasicTrain = BasicVectorizer.fit_transform(TrainHeadlines)
print(BasicTrain.shape)

(1611, 31675)


In [6]:
# Fit a logistic regression on the unigram counts (X) against the daily
# Label column (y). fit() returns the fitted estimator, so construction and
# fitting are chained into one statement.
TrainModel = LogisticRegression().fit(BasicTrain, Train["Label"])



In [7]:
# Build one document string per test row, exactly as for the training rows:
# str() every value in the columns at positions 2..26 and join with spaces.
TestHeadlines = [
    ' '.join(str(headline) for headline in Test.iloc[day, 2:27])
    for day in range(len(Test.index))
]

# Encode the test documents with the vocabulary already learned from the
# training set (transform only, no refit), then predict with the fitted model.
BasicTest = BasicVectorizer.transform(TestHeadlines)
Predictions = TrainModel.predict(BasicTest)

In [8]:
# Frequency (confusion) table: rows are the actual labels, columns are the
# labels predicted by the unigram model.
pd.crosstab(
    index=Test["Label"],
    columns=Predictions,
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,61,125
1,92,100


In [9]:
# Calculate the accuracy of the unigram model's predictions.
# It is computed directly from the true labels and the predictions instead of
# from counts copied by hand out of the crosstab, so the figure cannot go
# stale when the data or the model changes and the notebook is re-run.
Accuracy = accuracy_score(Test["Label"], Predictions)
print(Accuracy)
# On the recorded run the accuracy is about 0.426 (~43%), which is below the
# 50% of a coin flip, so the model is not reliable.

0.42592592592592593


In [10]:
# Look at the coefficients the unigram model assigned to each word.
# A positive coefficient pushes the prediction toward Label 1, a negative one
# toward Label 0.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(BasicVectorizer, "get_feature_names_out"):
    BasicWords = list(BasicVectorizer.get_feature_names_out())
else:
    BasicWords = BasicVectorizer.get_feature_names()

# coef_ has one row per fitted class boundary; take the single row as a list.
BasicCoefficients = TrainModel.coef_.tolist()[0]
coeffdf = pd.DataFrame({"Word": BasicWords,
                        "Coeff": BasicCoefficients})

# Largest coefficients first; ties are broken alphabetically by word.
coeffdf = coeffdf.sort_values(['Coeff', 'Word'], ascending=[False, True])
coeffdf.head(10)  # most positive coefficients


Unnamed: 0,Word,Coeff
19419,nigeria,0.497924
25261,self,0.452526
29286,tv,0.428011
15998,korea,0.425863
20135,olympics,0.425716
15843,kills,0.411636
26323,so,0.411267
29256,turn,0.394855
10874,fears,0.388555
28274,territory,0.384031


In [11]:
# Most negative coefficients: coeffdf is sorted by Coeff in descending order,
# so these are its last ten rows.
coeffdf.tail(10) 

# The coefficients show which words actually drive the model. A larger positive
# coefficient means the word's presence pushes the prediction toward Label 1; a
# more negative coefficient pushes it toward Label 0. The size of a coefficient
# does not say how often the word appears in the headlines.

# This model scores every word in isolation, which usually isn't that reliable
# because words carry more meaning when they are read in sentences.

Unnamed: 0,Word,Coeff
27299,students,-0.424441
8478,did,-0.427079
6683,congo,-0.431925
12818,hacking,-0.444069
7139,country,-0.44857
16949,low,-0.463116
3651,begin,-0.470454
25433,sex,-0.494555
24754,sanctions,-0.549725
24542,run,-0.587794


In [15]:
# The cells above are my practice of the bag-of-words model from the source.
# Next comes a more advanced variant, the n-gram model, where n is the number
# of consecutive words counted as one feature. The model above is the n = 1
# case; now we try n = 2.

# Define a new CountVectorizer whose ngram_range is (2, 2), i.e. bigrams only.

AdvancedVectorizer = CountVectorizer(ngram_range = (2,2))
AdvancedTrain = AdvancedVectorizer.fit_transform(TrainHeadlines)
print(AdvancedTrain.shape)

# This time we have 366721 variables, each representing a two-word combination.

(1611, 366721)


In [17]:
# Fit a logistic regression on the bigram counts (X) against the daily Label
# column (y). fit() returns the fitted estimator, so the calls are chained.
AdvancedModel = LogisticRegression().fit(AdvancedTrain, Train["Label"])



In [20]:
# Rebuild one document string per test row: str() every value in the columns
# at positions 2..26 and join the pieces with single spaces.
TestHeadlines = [
    ' '.join(str(headline) for headline in Test.iloc[day, 2:27])
    for day in range(len(Test.index))
]

# Encode the test documents with the bigram vocabulary learned from the
# training set (transform only, no refit), then predict with the bigram model.
AdvancedTest = AdvancedVectorizer.transform(TestHeadlines)
AdvancedPredictions = AdvancedModel.predict(AdvancedTest)

In [21]:
# Confusion table for the bigram model: rows are the actual labels, columns
# are the predicted labels.
pd.crosstab(
    index=Test["Label"],
    columns=AdvancedPredictions,
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,66,120
1,45,147


In [22]:
# Accuracy of the bigram model, computed from the true labels and the
# predictions rather than from counts typed in by hand from the crosstab, so
# it stays correct when the notebook is re-run.
AdvancedAccuracy = accuracy_score(Test["Label"], AdvancedPredictions)
print(AdvancedAccuracy)

# The accuracy has improved a bit (about 0.563 on the recorded run), but it is
# still not very reliable.

0.5634920634920635


In [23]:
# Check the coefficients again, this time for the bigram model.
# A positive coefficient pushes the prediction toward Label 1, a negative one
# toward Label 0.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(AdvancedVectorizer, "get_feature_names_out"):
    AdvancedWords = list(AdvancedVectorizer.get_feature_names_out())
else:
    AdvancedWords = AdvancedVectorizer.get_feature_names()

# coef_ has one row per fitted class boundary; take the single row as a list.
AdvancedCoefficients = AdvancedModel.coef_.tolist()[0]
Advcoeffdf = pd.DataFrame({"Word": AdvancedWords,
                           "Coeff": AdvancedCoefficients})

# Largest coefficients first; ties are broken alphabetically by bigram.
Advcoeffdf = Advcoeffdf.sort_values(['Coeff', 'Word'], ascending=[False, True])
Advcoeffdf.head(10)  # most positive coefficients


Unnamed: 0,Word,Coeff
272047,right to,0.286533
24710,and other,0.275274
285392,set to,0.274698
316194,the first,0.262873
157511,in china,0.227943
159522,in south,0.224184
125870,found in,0.21913
124411,forced to,0.216726
173246,it has,0.211137
322590,this is,0.209239


In [24]:
# Bigrams with the most negative coefficients: Advcoeffdf is sorted by Coeff
# in descending order, so they are its last ten rows.
Advcoeffdf.tail(10) #most negative

Unnamed: 0,Word,Coeff
326846,to help,-0.198495
118707,fire on,-0.201654
155038,if he,-0.209702
242528,people are,-0.211303
31669,around the,-0.213362
321333,there is,-0.215699
327113,to kill,-0.221812
340714,up in,-0.226289
358917,with iran,-0.227516
315485,the country,-0.331153


In [50]:
# Now let's practice the entire process again with n = 4 and see what happens.
# Define a new CountVectorizer whose ngram_range is (4, 4), i.e. only
# sequences of four consecutive words are counted as features.

PracticeVectorizer = CountVectorizer(ngram_range = (4,4))
PracticeTrain = PracticeVectorizer.fit_transform(TrainHeadlines)
print(PracticeTrain.shape)

(1611, 673498)


In [51]:
# The 673498 four-word features are nearly twice as many as the 366721 we had when n=2.

# Fit a logistic regression on the 4-gram counts (X) against the daily Label
# column (y). fit() returns the fitted estimator, so the calls are chained.
PracticeModel = LogisticRegression().fit(PracticeTrain, Train["Label"])


In [52]:
# Rebuild one document string per test row: str() every value in the columns
# at positions 2..26 and join the pieces with single spaces.
TestHeadlines = [
    ' '.join(str(headline) for headline in Test.iloc[day, 2:27])
    for day in range(len(Test.index))
]

# Encode the test documents with the 4-gram vocabulary learned from the
# training set (transform only, no refit), then predict with the 4-gram model.
PracticeTest = PracticeVectorizer.transform(TestHeadlines)
PracticePredictions = PracticeModel.predict(PracticeTest)

In [53]:
# Confusion table for the 4-gram model: rows are the actual labels, columns
# are the predicted labels.
pd.crosstab(
    index=Test["Label"],
    columns=PracticePredictions,
    rownames=["Actual"],
    colnames=["Predicted"],
)

Predicted,1
Actual,Unnamed: 1_level_1
0,186
1,192


In [58]:
# Accuracy of the 4-gram model, computed from the true labels and the
# predictions rather than from counts typed in by hand from the crosstab, so
# it stays correct when the notebook is re-run.
PracticeAccuracy = accuracy_score(Test["Label"], PracticePredictions)
print(PracticeAccuracy)

# The accuracy decreases compared to the n=2 model (about 0.508 on the
# recorded run). The crosstab for that run has a single predicted column, 1,
# so the model labelled every test day as 1 and this figure is simply the
# share of test days whose actual label is 1.

0.5079365079365079


In [59]:
# Check the coefficients of the 4-gram model.
# A positive coefficient pushes the prediction toward Label 1, a negative one
# toward Label 0.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older releases.
if hasattr(PracticeVectorizer, "get_feature_names_out"):
    PracticeWords = list(PracticeVectorizer.get_feature_names_out())
else:
    PracticeWords = PracticeVectorizer.get_feature_names()

# coef_ has one row per fitted class boundary; take the single row as a list.
PracticeCoefficients = PracticeModel.coef_.tolist()[0]
Pcoeffdf = pd.DataFrame({"Word": PracticeWords,
                         "Coeff": PracticeCoefficients})

# Largest coefficients first; ties are broken alphabetically by phrase.
Pcoeffdf = Pcoeffdf.sort_values(['Coeff', 'Word'], ascending=[False, True])
Pcoeffdf.head(10)  # most positive coefficients

Unnamed: 0,Word,Coeff
206072,for the first time,0.121389
245112,have the right to,0.099642
556317,the first time in,0.090985
562878,the nobel peace prize,0.090115
243343,have been killed in,0.088778
500344,sentenced to death for,0.082785
266095,in an attempt to,0.080939
570338,the united states to,0.07734
614581,turn out to be,0.075059
280236,in the wake of,0.07085


In [60]:
# Four-word phrases with the most negative coefficients: Pcoeffdf is sorted by
# Coeff in descending order, so they are its last ten rows.
Pcoeffdf.tail(10)

# The results are a bit more interesting than those of the other models: with
# four words per feature we can start to identify the meaning of the short
# phrases.
# NOTE: on the recorded run these coefficients are much smaller in magnitude
# than in the n=1 and n=2 models, and the 4-gram model predicted 1 for every
# test day, so the phrases should be read with caution.

Unnamed: 0,Word,Coeff
279977,in the state of,-0.059821
411085,on one of the,-0.060467
547953,that there was no,-0.061727
196474,first country in the,-0.06183
558081,the head of the,-0.072152
68014,aung san suu kyi,-0.073747
412247,on the brink of,-0.076092
223339,german chancellor angela merkel,-0.07975
669774,years in jail for,-0.086701
138758,country in the world,-0.090888
