In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('./stocknews/Combined_News_DJIA.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1989 entries, 0 to 1988
Data columns (total 27 columns):
Date     1989 non-null object
Label    1989 non-null int64
Top1     1989 non-null object
Top2     1989 non-null object
Top3     1989 non-null object
Top4     1989 non-null object
Top5     1989 non-null object
Top6     1989 non-null object
Top7     1989 non-null object
Top8     1989 non-null object
Top9     1989 non-null object
Top10    1989 non-null object
Top11    1989 non-null object
Top12    1989 non-null object
Top13    1989 non-null object
Top14    1989 non-null object
Top15    1989 non-null object
Top16    1989 non-null object
Top17    1989 non-null object
Top18    1989 non-null object
Top19    1989 non-null object
Top20    1989 non-null object
Top21    1989 non-null object
Top22    1989 non-null object
Top23    1988 non-null object
Top24    1986 non-null object
Top25    1986 non-null object
dtypes: int64(1), object(26)
memory usage: 419.6+ KB


In [6]:
train = data[data['Date'] < '2015-01-01']
test = data[data['Date'] > '2014-12-31']

In [23]:
# Text Preprocessing
example = train.iloc[2,3]
example

'b"Russia \'ends Georgia operation\'"'

In [24]:
example2 = example.lower()
example2

'b"russia \'ends georgia operation\'"'

In [25]:
example3 = CountVectorizer().build_tokenizer()(example2)
example3

['russia', 'ends', 'georgia', 'operation']

In [26]:
pd.DataFrame([[x,example3.count(x)] for x in set(example3)], columns=['Word', 'Count'])

Unnamed: 0,Word,Count
0,ends,1
1,georgia,1
2,operation,1
3,russia,1


In [28]:
# Basic Model Training and Testing
headlines_train = []
for row in range(0, len(train.index)):
    headlines_train.append(''.join(str(x) for x in train.iloc[row,2:27]))

In [31]:
basicvectorizer = CountVectorizer()
basictrain = basicvectorizer.fit_transform(headlines_train)
print(basictrain.shape)

(1611, 45865)


In [32]:
basicmodel = LogisticRegression()
basicmodel = basicmodel.fit(basictrain, train['Label'])

In [37]:
headlines_test = []
for row in range(0, len(test.index)):
    headlines_test.append(''.join(str(x) for x in test.iloc[row,2:27]))
basictest = basicvectorizer.transform(headlines_test)
predictions = basicmodel.predict(basictest)

In [38]:
pd.crosstab(test['Label'], predictions, rownames=['Actual'], colnames=['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,64,122
1,93,99


In [40]:
basicwords = basicvectorizer.get_feature_names()
basiccoeffs = basicmodel.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word':basicwords, 'Coefficient':basiccoeffs})
coeffdf = coeffdf.sort_values(['Coefficient','Word'], ascending=[0, 1])
coeffdf.head(10)

Unnamed: 0,Coefficient,Word
36421,0.459907,self
7064,0.422589,canadian
22868,0.422344,korea
12779,0.417731,doctors
22675,0.39806,kills
45734,0.39283,zealand
37796,0.392142,so
42210,0.388265,tv
29808,0.386322,past
42176,0.386013,turn


In [42]:
coeffdf.tail(10)

Unnamed: 0,Coefficient,Word
38244,-0.401871,speech
37829,-0.431712,society
40097,-0.437845,system
12169,-0.439449,did
10127,-0.460636,country
5032,-0.465286,begin
24327,-0.470275,low
36656,-0.479133,sex
35288,-0.536537,run
35602,-0.540681,sanctions


Advanced Modeling

In [43]:
advanced_vectorizer = CountVectorizer(ngram_range=(2,2))
advanced_train = advanced_vectorizer.fit_transform(headlines_train)

In [45]:
print(advanced_train.shape)

(1611, 371577)


In [46]:
advanced_model = LogisticRegression()
advanced_model = advanced_model.fit(advanced_train, train['Label'])

In [50]:
headlines_test = []
for row in range(0, len(test.index)):
    headlines_test.append(''.join(str(x) for x in test.iloc[row,2:27]))
    advanced_test = advanced_vectorizer.transform(headlines_test)
    advanced_predictions = advanced_model.predict(advanced_test)

In [51]:
pd.crosstab(test['Label'], advanced_predictions, rownames=['Actual'], colnames=['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,72,114
1,47,145


In [55]:
advanced_words = advanced_vectorizer.get_feature_names()
advanced_coeffs = advanced_model.coef_.tolist()[0]
advanced_coeffdf = pd.DataFrame({'Words':advanced_words, 'Coefficient':advanced_coeffs})
advanced_coeffdf = advanced_coeffdf.sort_values(['Coefficient', 'Words'], ascending=[0,1])
advanced_coeffdf.head(10)

Unnamed: 0,Coefficient,Words
25016,0.279436,and other
275433,0.278784,right to
288886,0.276995,set to
320043,0.255453,the first
125464,0.223548,forced to
158846,0.221361,in china
175420,0.217889,it has
361632,0.217877,will be
126933,0.213953,found in
146250,0.207799,have to


In [57]:
advanced_coeffdf.tail(10)

Unnamed: 0,Coefficient,Words
119549,-0.209019,fire on
49189,-0.20934,bin laden
156179,-0.211252,if he
222388,-0.214659,nuclear weapons
667,-0.219754,10 000
32005,-0.221851,around the
345236,-0.223192,up in
325492,-0.224133,there is
331498,-0.224201,to kill
319269,-0.318302,the country
