In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer 
START_TOKEN = '<START>'
END_TOKEN = '<END>'

In [2]:
def process_news(news):
    _news = news.replace('b\"', "")
    _news = _news.replace('b\'', "")
    _news = _news.lower()
    _news = re.sub("[^a-zA-Z]", " ",_news)
    _news = re.sub('[\s]+', ' ', _news)
    
    _news = _news.split(" ")
    if "" in _news:
        _news.remove("")
    
    #lemmatizer = WordNetLemmatizer() 
    
    #_news = [lemmatizer.lemmatize(w) for w in _news]
    
    _news = ' '.join(_news)
     
    return _news

In [3]:
def read_data():

    data = pd.read_csv("../Datasets/djia/Combined_News_DJIA.csv")
    
    print(len(data))
    
    dfs = []
    data["News"] = ""
    for i in range(1,25):
        col = "Top"+str(i)
        data["News"] = data["News"] +" "+ data[col]
    data = data.dropna()
    data['Tokens'] = data['News'].map(process_news)
    
    data = data[['Date', 'News', 'Tokens', 'Label']]

    
    
    
    Xy_train = data[:int(len(data)*0.8)]
    Xy_test = data[int(len(data)*0.8):]
    
    Xy_train = data[data['Date'] < '20150101']
    Xy_test = data[data['Date'] > '20141231']
    
    return data, Xy_train, Xy_test


In [4]:
news, Xy_train, Xy_test = read_data()

1989


In [5]:
print(len(news))
print(len(news[news['Date'] < '20141231']))
print(len(news[news['Date'] > '20141231']))

1986
1608
378


In [6]:
news[news['Date'] < '20150101'].tail(20)

Unnamed: 0,Date,News,Tokens,Label
1843,2015-12-03,UK Parliament Vote in Favor of Airstrikes in ...,uk parliament vote in favor of airstrikes in s...,0
1844,2015-12-04,"World's largest Muslim group, Sunni movement ...",world s largest muslim group sunni movement la...,1
1845,2015-12-07,Beijing has issued its first ever Red Alert o...,beijing has issued its first ever red alert ov...,0
1846,2015-12-08,Resistance to last-resort antibiotic has now ...,resistance to last resort antibiotic has now s...,0
1847,2015-12-09,Saudi Arabia accused of trying to wreck Paris...,saudi arabia accused of trying to wreck paris ...,0
1848,2015-12-10,"""The US State Department has approved a $1.29...",the us state department has approved a billion...,1
1849,2015-12-11,"France will not ban Wi-Fi or Tor, prime minis...",france will not ban wi fi or tor prime ministe...,0
1850,2015-12-14,Saudi Arabia elects up to 17 female councillo...,saudi arabia elects up to female councillors i...,1
1851,2015-12-15,"Just miles from the US border, Lexmark fires ...",just miles from the us border lexmark fires lo...,1
1852,2015-12-16,Donald Trump loses Scottish windfarm appeal N...,donald trump loses scottish windfarm appeal no...,1


In [7]:
news[news['Date'] > '20141231'].head(20)

Unnamed: 0,Date,News,Tokens,Label
1611,2015-01-02,Most cases of cancer are the result of sheer ...,most cases of cancer are the result of sheer b...,1
1612,2015-01-05,Moscow-&gt;Beijing high speed train will redu...,moscow gt beijing high speed train will reduce...,0
1613,2015-01-06,US oil falls below $50 a barrel Toyota gives ...,us oil falls below a barrel toyota gives away ...,0
1614,2015-01-07,'Shots fired' at French magazine HQ 90% of Bi...,shots fired at french magazine hq of bibi neta...,1
1615,2015-01-08,New Charlie Hebdo issue to come out next week...,new charlie hebdo issue to come out next week ...,1
1616,2015-01-09,Muslim politician from India who offered $8M ...,muslim politician from india who offered m to ...,0
1617,2015-01-12,World's largest indoor farm in Japan is 100 t...,world s largest indoor farm in japan is times ...,0
1618,2015-01-13,China has just banned the burqa in its bigges...,china has just banned the burqa in its biggest...,0
1619,2015-01-14,Cameroon Army Kills 143 Boko Haram Fighters A...,cameroon army kills boko haram fighters air fr...,0
1620,2015-01-15,Saudi man sentenced 10 years jail and weekly ...,saudi man sentenced years jail and weekly publ...,0


In [8]:
1989*0.8

1591.2

In [9]:
len(Xy_train)

1860

In [10]:
1860/1989

0.9351432880844646

In [11]:
len(Xy_test)

378

In [12]:
378/1989

0.19004524886877827

In [13]:
Xy_train['Tokens'][1]

'why wont america and nato help us if they wont help us now why did we help them in iraq bush puts foot down on georgian conflict jewish georgian minister thanks to israeli training we re fending off russia georgian army flees in disarray as russians advance gori abandoned to russia without a shot fired olympic opening ceremony fireworks faked what were the mossad with fraudulent new zealand passports doing in iraq russia angered by israeli military sale to georgia an american citizen living in s ossetia blames u s and georgian leaders for the genocide of innocent people welcome to world war iv now in high definition georgia s move a mistake of monumental proportions russia presses deeper into georgia u s says regime change is goal abhinav bindra wins first ever individual olympic gold medal for india u s ship heads for arctic to define territory drivers in a jerusalem taxi station threaten to quit rather than work for their new boss an ara the french team is stunned by phelps and the 

In [14]:
vectorizer = CountVectorizer(ngram_range=(1,2))
X_train = vectorizer.fit_transform(Xy_train['Tokens'].values)

In [15]:
print(X_train.shape)

(1860, 422925)


In [16]:
X_train[0][0][0]

<1x422925 sparse matrix of type '<class 'numpy.int64'>'
	with 562 stored elements in Compressed Sparse Row format>

In [17]:
model = LogisticRegression()
model = model.fit(X_train, Xy_train["Label"])



In [18]:
len(Xy_test)

378

In [19]:
X_test = vectorizer.transform(Xy_test['Tokens'].values)
predictions = model.predict(X_test)

In [20]:
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix

print (classification_report(Xy_test["Label"], predictions))
print (accuracy_score(Xy_test["Label"], predictions))

              precision    recall  f1-score   support

           0       0.83      0.81      0.82       186
           1       0.82      0.83      0.83       192

    accuracy                           0.82       378
   macro avg       0.82      0.82      0.82       378
weighted avg       0.82      0.82      0.82       378

0.8227513227513228
