In [3]:
from urllib import request
import json
import dateutil
import datetime
import re
import pandas as pd
import os
from pymongo import MongoClient
from pymongo.errors import BulkWriteError, DuplicateKeyError
from gensim.models import Word2Vec
import nltk
import numpy as np
import string
import functools
import spacy
from sklearn.preprocessing import scale

### Paper Implementation
[Leverage Financial News to Predict Stock Price Movements Using Word Embeddings and Deep Neural Networks](http://aclweb.org/anthology/N16-1041)

Please make sure that you have already loaded the news data and the price data into MongoDB before running this notebook.

In [None]:
# Connection settings come from the environment so that credentials never have
# to be typed into (and saved with) the notebook. A variable that is not set
# yields None, which is the value these settings had before.
MONGO_URL=os.environ.get('MONGO_URL')
MONGO_USERNAME=os.environ.get('MONGO_USERNAME')
MONGO_PASSWORD=os.environ.get('MONGO_PASSWORD')

In [None]:
# Credentials are handed to MongoClient itself: Database.authenticate() was
# deprecated in PyMongo 3.5 and removed in PyMongo 4. authSource names the
# database that authenticate() was previously called on.
mongo_auth_options={}
if MONGO_USERNAME is not None:
    mongo_auth_options=dict(username=MONGO_USERNAME,password=MONGO_PASSWORD,authSource='stockdb')
client = MongoClient(MONGO_URL,**mongo_auth_options)
db = client.stockdb
# Collections used by the rest of the notebook.
news_coll=db.news_latest
stock_coll=db.stockcoll
company_coll=db.sp500company
# spaCy pipeline used for sentence splitting, entity recognition and parsing.
nlp = spacy.load('en_core_web_lg')

In [564]:
# Materialise at most 50,000 documents of the news collection as a list.
samples=list(news_coll.find().limit(50000))

In [12]:
# Download the S&P 500 constituents table and normalise the header names
# (surrounding whitespace removed, lower-cased).
companies=pd.read_csv('https://datahub.io/core/s-and-p-500-companies-financials/r/constituents-financials.csv')
companies.columns=[column.strip().lower() for column in companies.columns]
# companies=companies[companies.symbol.isin(['GOOGL','IBM','ORCL','AAPL','YHOO','FB'])]
# Index by ticker symbol while keeping 'symbol' available as a column.
companies.index=companies['symbol']
companies=companies[['symbol','name','sector']]
company_symbols=companies['symbol'].values
company_names=companies['name'].values
# The name appears twice on purpose: the third column is later reduced to a
# shortened alias while the second keeps the full name.
company_info=companies[['symbol','name','name']].values

In [13]:
# Tokens that are dropped from a company name when its short alias is built.
stop_company_name=['&','the','company','inc','inc.','plc','corp','corp.','co','co.','worldwide','corporation','group','']
# stop_company_name=[]


def _short_company_name(full_name):
    """Return a shortened alias of ``full_name``.

    The name is split on single spaces, tokens listed in
    ``stop_company_name`` (compared case-insensitively) are removed, the rest
    is re-joined with single spaces and every character that is not a letter,
    digit, whitespace or hyphen is stripped.
    """
    words=full_name.split(' ')
    kept=[word for word in words if word.lower() not in stop_company_name]
    # A name consisting only of stop tokens would otherwise produce an empty
    # alias (and made functools.reduce raise); keep the full name instead.
    if not kept:
        kept=words
    return re.sub(r'[^a-zA-Z0-9\s-]','',' '.join(kept))


# One [symbol, full name, short alias] triple per company.
splitted_companies=[[symbol,name,_short_company_name(alias)] for symbol,name,alias in company_info]

In [649]:
def _price_context(symbol,sample_date):
    """Fetch the price features of ``symbol`` around ``sample_date``.

    Reads from the module-level ``stock_coll`` collection.

    Returns a ``(price_label, past_price_data)`` pair, or ``None`` when fewer
    than 7 quotes exist on/before the date, fewer than 2 quotes exist on/after
    it, or the first quote on/after the date is more than 3 days away.
    ``past_price_data`` holds the 6 scaled day-to-day differences of
    ``adj_close`` (newest first); ``price_label`` is the sign of the change
    between the first two quotes on/after the date.
    """
    past_rows=pd.DataFrame(list(stock_coll.find({'symbol':symbol,'date':{'$lte':sample_date}}).sort('date',-1).limit(7)))
    if len(past_rows)!=7:return None
    # Sort ascending so the two rows really are the two earliest quotes on or
    # after the news date; without a sort the server may return any two.
    future_rows=list(stock_coll.find({'symbol':symbol,'date':{'$gte':sample_date}}).sort('date',1).limit(2))
    if len(future_rows)<2:return None
    if (future_rows[0]['date']-sample_date).days>3:return None
    closes=past_rows['adj_close'].values
    past_price_data=scale(closes[0:-1]-closes[1:])
    price_label=np.sign(future_rows[1]['adj_close']-future_rows[0]['adj_close'])
    return price_label,past_price_data


def tokenize_remove_stopwords_extract_companies_with_spacy(text,sample_date,companies):
    """Split ``text`` into sentences and emit one record per company mention.

    Parameters
    ----------
    text : str
        Body of a news article.
    sample_date : datetime-like
        Publication date; compared against the ``date`` field of the price
        collection.
    companies : iterable of [symbol, full name, short alias]
        Companies to look for in each sentence.

    Returns
    -------
    list of tuple
        ``(sentence text, tokens, spaCy Doc of the sentence, symbol,
        full name, short alias, price_label, past_price_data, sample_date)``
        for every sentence/company pair where the full name or symbol is a
        recognised entity, or the short alias is a substring of the sentence,
        and price data is available (see ``_price_context``).

    Despite the function name, tokens are returned unfiltered; no stop-word
    removal is applied.
    """
    processed_data=[]
    # Price lookups depend only on (symbol, sample_date), so they are done
    # once per symbol instead of once per matching sentence.
    price_cache={}
    doc=nlp(text)
    for sentence in doc.sents:
        tokens=[str(token) for token in sentence]
        complete_sentence=str(sentence)
        # The sentence is re-parsed on its own so that the stored Doc is
        # self-contained.
        sent_doc=nlp(complete_sentence)
        entities=set(str(entity) for entity in sent_doc.ents)
        for company in companies:
            symbol,name,alias=company[0],company[1],company[2]
            if not (name in entities or alias in complete_sentence or symbol in entities):
                continue
            if symbol not in price_cache:
                price_cache[symbol]=_price_context(symbol,sample_date)
            context=price_cache[symbol]
            if context is None:
                continue
            price_label,past_price_data=context
            processed_data.append((complete_sentence,tokens,sent_doc,symbol,name,alias,price_label,past_price_data,sample_date))
    return processed_data

In [650]:
# Collect one 1-D object array per (sentence, company) record.
# Each row is filled element by element: np.array() on the raw tuples has to
# guess a shape from nested lists/arrays/Docs of different lengths, which is
# deprecated and raises on recent NumPy releases.
processed_samples=[]
for sample in samples:
    p_sample=tokenize_remove_stopwords_extract_companies_with_spacy(sample['content'],sample['date'],splitted_companies)
    for record in p_sample:
        row=np.empty(len(record),dtype=object)
        for position,value in enumerate(record):
            row[position]=value
        processed_samples.append(row)

In [651]:
# Number of records collected by the loop above.
len(processed_samples)

36830

In [11]:
# import _thread
# import threading
# processed_samples=[]
# class Processor(threading.Thread):
#     def __init__(self, samples,companies):
#         threading.Thread.__init__(self)
#         self.samples = samples
#         self.companies=companies
#     def run(self):
#         for sample in self.samples:
#             processed_sample=tokenize_remove_stopwords_extract_companies(sample['content'],sample['date'],self.companies)
#             if len(processed_sample)==0:continue
#             processed_sample=np.array(processed_sample)
#             processed_samples.extend(processed_sample)

In [12]:
# processed_samples=[]
# for i in range(8):
#     processer=Processor(samples=samples[i*int(len(samples)/8):(i+1)*int(len(samples)/8)],companies=splitted_companies)
#     processer.start()

In [653]:
# Turn the record list into a 2-D array and slice it by column.
# Columns 0-2: sentence text, token list, spaCy Doc.
# Columns 3+ : symbol, full name, short alias, price label, past price
#              vector, sample date (kept together as `labels`).
processed_samples=np.array(processed_samples)
sentences,sentences_terms,sentence_handler=(processed_samples[:,column] for column in range(3))
labels=processed_samples[:,3:]

In [654]:
# Train word embeddings on the token lists of all collected sentences.
# Only min_count is set explicitly; every other hyper-parameter is left at the
# library default.
model=Word2Vec(sentences=sentences_terms,min_count=2)

In [705]:
# Grow the seed keyword set with embedding neighbours: up to 10 rounds, each
# adding the nearest neighbours of every current keyword, until bok_size
# keywords have been collected.
bag_of_keywords=set(['rise','drop','fall','gain','surge','shrink','jump','slump'])
stop=False
bok_size=1000
# gensim >= 4 exposes the vocabulary as key_to_index, older releases as vocab.
bok_known_words=model.wv.key_to_index if hasattr(model.wv,'key_to_index') else model.wv.vocab
for i in range(10):
    new_words=[]
    if stop:break
    # sorted(): set iteration order differs between interpreter runs, which
    # would change which words make it in before the size limit is hit.
    for k in sorted(bag_of_keywords):
        if k in bok_known_words:
            # Query the KeyedVectors directly; Word2Vec.most_similar is deprecated.
            new_words.extend(model.wv.most_similar(k))
    for n in new_words:
        # Keep only lower-case, purely alphabetic words longer than 3 characters.
        if n[0].islower() and len(n[0])>3 and n[0].isalpha():
            bag_of_keywords.add(n[0])
            if len(bag_of_keywords)>=bok_size:
                stop=True
                break

In [706]:
# Freeze the keyword set into an array so that every keyword has a fixed position.
bag_of_keywords=np.array(list(bag_of_keywords))

In [707]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [708]:
# TF-IDF vectoriser restricted to the keyword vocabulary. lowercase=False means
# the sentence text is not lower-cased before matching.
bok_tfidf=TfidfVectorizer(lowercase=False,min_df=1,use_idf=True,vocabulary=bag_of_keywords)

In [709]:
# Keyword TF-IDF matrix, one row per sentence, converted to a dense array.
X_bok_tfidf=bok_tfidf.fit_transform(sentences)
X_bok_tfidf=X_bok_tfidf.toarray()

In [710]:
# Raw-count vectoriser over the same keyword vocabulary (input not lower-cased).
bok_count=CountVectorizer(lowercase=False,min_df=1,vocabulary=bag_of_keywords)

In [711]:
# Keyword count matrix, one row per sentence, converted to a dense array.
X_bok_count=bok_count.fit_transform(sentences)
X_bok_count=X_bok_count.toarray()

In [712]:
# PMI(w, pos) = log( freq(w, pos) * N / (freq(w) * freq(pos)) )
# with N the number of sentences and labels[:,3] the price label.
bok_pos_mask=labels[:,3]==1.0
# Keyword counts inside positive sentences, already multiplied by N.
bok_freq_w_pos=np.array(np.sum(X_bok_count[bok_pos_mask],axis=0)*X_bok_count.shape[0]).reshape(X_bok_count.shape[1],)
# Zero counts are replaced by 1 so the logarithm stays finite.
bok_freq_w_pos[bok_freq_w_pos==0]=1
bok_freq_w=np.sum(X_bok_count,axis=0)
bok_freq_pos=np.sum(bok_pos_mask)
# The denominator is parenthesised (the previous expression multiplied by the
# class frequency instead of dividing) and N is applied only once. Keywords
# that never occur keep a PMI of 0 instead of dividing by zero.
bok_seen_pos=bok_freq_w>0
bok_PMI_pos=np.zeros(X_bok_count.shape[1])
bok_PMI_pos[bok_seen_pos]=np.log(bok_freq_w_pos[bok_seen_pos]/(bok_freq_w[bok_seen_pos]*bok_freq_pos))

In [713]:
# PMI(w, neg) = log( freq(w, neg) * N / (freq(w) * freq(neg)) )
# with N the number of sentences and labels[:,3] the price label.
bok_neg_mask=labels[:,3]==-1.0
# Keyword counts inside negative sentences, already multiplied by N.
bok_freq_w_neg=np.array(np.sum(X_bok_count[bok_neg_mask],axis=0)*X_bok_count.shape[0]).reshape(X_bok_count.shape[1],)
# Zero counts are replaced by 1 so the logarithm stays finite.
bok_freq_w_neg[bok_freq_w_neg==0]=1
bok_freq_w=np.sum(X_bok_count,axis=0)
bok_freq_neg=np.sum(bok_neg_mask)
# The denominator is parenthesised (the previous expression multiplied by the
# class frequency instead of dividing) and N is applied only once. Keywords
# that never occur keep a PMI of 0 instead of dividing by zero.
bok_seen_neg=bok_freq_w>0
bok_PMI_neg=np.zeros(X_bok_count.shape[1])
bok_PMI_neg[bok_seen_neg]=np.log(bok_freq_w_neg[bok_seen_neg]/(bok_freq_w[bok_seen_neg]*bok_freq_neg))

In [714]:
# Polarity score per keyword: PMI with the positive class minus PMI with the negative class.
bok_PS=bok_PMI_pos-bok_PMI_neg

In [715]:
# Grow the seed category tags with embedding neighbours: up to 10 rounds, each
# adding the nearest neighbours of every current tag, until cate_size tags
# have been collected.
category_tags=set(['published','presented','unveil','investment','bankrupt','government','acquisition','suit'])
stop=False
cate_size=1000
# gensim >= 4 exposes the vocabulary as key_to_index, older releases as vocab.
cate_known_words=model.wv.key_to_index if hasattr(model.wv,'key_to_index') else model.wv.vocab
for i in range(10):
    new_words=[]
    if stop:break
    # sorted(): set iteration order differs between interpreter runs, which
    # would change which words make it in before the size limit is hit.
    for k in sorted(category_tags):
        if k in cate_known_words:
            # Query the KeyedVectors directly; Word2Vec.most_similar is deprecated.
            new_words.extend(model.wv.most_similar(k))
    for n in new_words:
        # Keep only lower-case, purely alphabetic words longer than 3 characters.
        if n[0].islower() and len(n[0])>3 and n[0].isalpha():
            category_tags.add(n[0])
            if len(category_tags)>=cate_size:
                stop=True
                break

In [716]:
# Freeze the tag set into an array so that every tag has a fixed position.
category_tags=np.array(list(category_tags))

In [717]:
# Raw counts of the category tags per sentence (input not lower-cased),
# converted to a dense array.
ct_count=CountVectorizer(lowercase=False,min_df=1,vocabulary=category_tags)
X_ct_count=ct_count.fit_transform(sentences)
X_ct_count=X_ct_count.toarray()

In [718]:
# TF-IDF weights of the category tags per sentence (input not lower-cased),
# converted to a dense array.
ct_tfidf=TfidfVectorizer(lowercase=False,min_df=1,vocabulary=category_tags)
X_ct_tfidf=ct_tfidf.fit_transform(sentences)
X_ct_tfidf=X_ct_tfidf.toarray()

In [719]:
# PMI(w, pos) = log( freq(w, pos) * N / (freq(w) * freq(pos)) )
# with N the number of sentences and labels[:,3] the price label.
ct_pos_mask=labels[:,3]==1.0
# Tag counts inside positive sentences, already multiplied by N.
ct_freq_w_pos=np.array(np.sum(X_ct_count[ct_pos_mask],axis=0)*X_ct_count.shape[0]).reshape(X_ct_count.shape[1],)
# Zero counts are replaced by 1 so the logarithm stays finite.
ct_freq_w_pos[ct_freq_w_pos==0]=1
ct_freq_w=np.sum(X_ct_count,axis=0)
ct_freq_pos=np.sum(ct_pos_mask)
# The denominator is parenthesised (the previous expression multiplied by the
# class frequency instead of dividing) and N is applied only once. Tags that
# never occur keep a PMI of 0 instead of dividing by zero.
ct_seen_pos=ct_freq_w>0
ct_PMI_pos=np.zeros(X_ct_count.shape[1])
ct_PMI_pos[ct_seen_pos]=np.log(ct_freq_w_pos[ct_seen_pos]/(ct_freq_w[ct_seen_pos]*ct_freq_pos))

In [720]:
# PMI(w, neg) = log( freq(w, neg) * N / (freq(w) * freq(neg)) )
# with N the number of sentences and labels[:,3] the price label.
ct_neg_mask=labels[:,3]==-1.0
# Tag counts inside negative sentences, already multiplied by N.
ct_freq_w_neg=np.array(np.sum(X_ct_count[ct_neg_mask],axis=0)*X_ct_count.shape[0]).reshape(X_ct_count.shape[1],)
# Zero counts are replaced by 1 so the logarithm stays finite.
ct_freq_w_neg[ct_freq_w_neg==0]=1
ct_freq_w=np.sum(X_ct_count,axis=0)
ct_freq_neg=np.sum(ct_neg_mask)
# Uses ct_freq_neg (the previous expression referenced an undefined
# `freq_neg`). The denominator is parenthesised and N is applied only once.
# Tags that never occur keep a PMI of 0 instead of dividing by zero.
ct_seen_neg=ct_freq_w>0
ct_PMI_neg=np.zeros(X_ct_count.shape[1])
ct_PMI_neg[ct_seen_neg]=np.log(ct_freq_w_neg[ct_seen_neg]/(ct_freq_w[ct_seen_neg]*ct_freq_neg))

In [721]:
# Polarity score per category tag: PMI with the positive class minus PMI with the negative class.
ct_PS=ct_PMI_pos-ct_PMI_neg

In [722]:
# Combined dictionary: keywords first, then category tags (may contain duplicates).
full_dict=np.concatenate((bag_of_keywords,category_tags))

In [723]:
# Deduplicate the dictionary. full_idx holds, for each unique word, the position
# of its first occurrence in the concatenated array.
# NOTE: full_dict is overwritten here, so re-running this cell on its own would
# compute full_idx against the already deduplicated array.
full_dict,full_idx=np.unique(full_dict,return_index=True)

In [724]:
# Polarity scores re-ordered to match the deduplicated dictionary; the
# concatenation order (keywords, then tags) must match the one full_idx was
# computed from.
all_PS=np.concatenate((bok_PS,ct_PS))[full_idx]

In [725]:
# Vectorise every sentence over the full dictionary with IDF weighting
# disabled (use_idf=False), then convert to a dense array.
full_tfidf=TfidfVectorizer(lowercase=False,min_df=1,vocabulary=full_dict,use_idf=False)
X_full_tfidf=full_tfidf.fit_transform(sentences)
X_full_tfidf=X_full_tfidf.toarray()

In [726]:
# Weight dictionary features by polarity score according to the grammatical
# role of the company: for every noun chunk that mentions the company (symbol,
# full name or short alias), look at the word the chunk attaches to. If that
# word is in the dictionary, its feature is multiplied by +PS when the chunk is
# an active subject and by -PS when it is a passive subject.
for i,h in enumerate(sentence_handler):
    for nc in h.noun_chunks:
        if labels[i,0] in nc.text or labels[i,1] in nc.text or labels[i,2] in nc.text:
            head_word=nc.root.head.text
            if head_word in full_dict:
                kwd_idx=np.where(full_dict==head_word)[0][0]
                # dep_ is the string label; comparing it with the integer IDs
                # in spacy.symbols never matched, so the weighting was skipped.
                dependency=nc.root.dep_
                if dependency=='nsubj':
                    X_full_tfidf[i,kwd_idx]=X_full_tfidf[i,kwd_idx]*all_PS[kwd_idx]
                elif dependency=='nsubjpass':
                    X_full_tfidf[i,kwd_idx]=X_full_tfidf[i,kwd_idx]*(-all_PS[kwd_idx])

In [727]:
# Stack the per-sample past-price vectors (column 4 of `labels`, 6 values
# each) into a dense float matrix with one row per sample.
X_price=np.zeros((labels.shape[0],6))
for row_number in range(labels.shape[0]):
    X_price[row_number]=labels[row_number,4]

In [728]:
# Full feature matrix: dictionary features followed by the 6 price features.
X=np.concatenate((X_full_tfidf,X_price),axis=1)

In [729]:
# (number of samples, dictionary size + 6 price features)
X.shape

(36830, 1734)

In [730]:
import keras
from keras.layers import Dense,Dropout
from keras.models import Sequential
from keras.utils import to_categorical

In [731]:
# Binary target built from the price label (labels[:,-3]):
#   class 1 = price fell (label -1), class 0 = everything else (label 1 or 0).
# The classes are encoded explicitly as 0/1; leaving -1 in y only produced the
# same one-hot matrix because to_categorical indexed column -1, i.e. the last one.
price_direction=np.array(labels[:,-3],dtype='int32')
y=np.where(price_direction==-1,1,0).astype('int32')

In [732]:
# One-hot encode the target into two columns.
y=to_categorical(y,num_classes=2)

In [733]:
# Number of samples whose one-hot target has column 0 set.
np.sum(y[:,0]==1)

19605

In [734]:
# Number of samples whose one-hot target has column 1 set.
np.sum(y[:,1]==1)

17225

In [738]:
# Feed-forward classifier: four hidden blocks of Dense(1024, relu) followed by
# Dropout(0.5), then a 2-way softmax output.
# NOTE(review): the network is trained on X_full_tfidf only; the combined
# matrix X (which also holds the price features) is not used here - confirm
# whether that is intended.
nnmodel=Sequential()
nnmodel.add(Dense(1024,activation='relu',input_dim=X_full_tfidf.shape[1]))
nnmodel.add(Dropout(0.5))
for _ in range(3):
    nnmodel.add(Dense(1024,activation='relu'))
    nnmodel.add(Dropout(0.5))
nnmodel.add(Dense(2,activation='softmax'))
nnmodel.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# The last 20% of the rows are held out for validation (validation_split=0.2).
nnmodel.fit(
    X_full_tfidf,
    y,
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Train on 29464 samples, validate on 7366 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc51a0d4e10>