In [3]:
from urllib import request
import json
import dateutil
import datetime
import re
import pandas as pd
import os
from pymongo import MongoClient
from pymongo.errors import BulkWriteError, DuplicateKeyError
from gensim.models import Word2Vec
import numpy as np
import string
import functools
from spacy.en import English
import textacy
from pycorenlp import StanfordCoreNLP
import time
import tensorflow as tf
from multiprocessing.dummy import Pool
import pickle
from functools import partial

Using TensorFlow backend.


### Paper Implementation
[Deep learning for event-driven stock prediction](http://dl.acm.org/citation.cfm?id=2832415.2832572)

In [None]:
# Connection settings; all placeholders are None until filled in.
MONGO_URL = MONGO_USERNAME = MONGO_PASSWORD = None
# add your servers here if you want
STANFORD_SERVERS = [None]

In [None]:
# `spacy.load` is called below, so the top-level package has to be in scope;
# the import cell only brings in `spacy.en.English`.
import spacy

# Shared MongoDB handles used by the cells below.
client = MongoClient(MONGO_URL)
db = client.stockdb
db.authenticate(name=MONGO_USERNAME, password=MONGO_PASSWORD)
news_coll = db.news_latest
stock_coll = db.stockcoll
# spaCy pipeline used for parsing and tokenising article text.
nlp = spacy.load('en_core_web_lg')

In [4]:
# CoreNLP client for a server expected on localhost port 9000.
stanford=StanfordCoreNLP('http://127.0.0.1:9000')
# corenlp_extractor selects a client from this list by index.
stanford_server_list=[stanford]

In [21]:
%%time
# Raw string: `\s` is meant for the regex engine and is not a valid Python
# escape sequence, so a plain literal triggers an invalid-escape warning.
samples=list(news_coll.find({'content': {'$regex': r'(Apple\s|Google|Amazon|Microsoft|Facebook|IBM|Twitter|Nvidia)'}}))

In [22]:
# Number of articles matched by the company-name query.
len(samples)

23205

In [24]:
# Distinct article dates in ascending order.
sample_dates = sorted({article['date'] for article in samples})

In [247]:
# S&P 500 constituents; column names are normalised to stripped lower case.
companies=pd.read_csv('https://datahub.io/core/s-and-p-500-companies-financials/r/constituents-financials.csv')
companies.columns=[column.strip().lower() for column in companies.columns]
# companies=companies[companies.symbol.isin(['GOOGL','IBM','ORCL','AAPL','YHOO','FB'])]
# An `_id` column is not guaranteed to exist in a CSV source (presumably it
# came from a MongoDB export - TODO confirm); fall back to the ticker symbol
# instead of failing with a KeyError.
index_column='_id' if '_id' in companies.columns else 'symbol'
companies.index=companies[index_column]
companies=companies[['symbol','name','sector']]
company_names=companies['name'].values
company_symbols=companies['symbol'].values
# `name` is selected twice so each row is (symbol, name, name); the third
# entry is the copy that gets tokenised and cleaned afterwards.
company_info=companies[['symbol','name','name']].values

In [248]:
# Lower-cased tokens dropped from company names before matching.
stop_company_name=['&','the','company','inc','inc.','plc','corp','corp.','co','co.','worldwide','corporation','group','']
# stop_company_name=[]
# Each row becomes [symbol, full name, cleaned short name]. `' '.join` copes
# with a name made only of stop tokens (the cleaned name is then ''), where
# reduce() without an initial value would raise TypeError. The pattern is a
# raw string so `\s` is not treated as an invalid Python escape.
splitted_companies=[
    [
        symbol,
        name,
        re.sub(
            pattern=r'[^.a-zA-Z0-9\s-]',
            repl='',
            string=' '.join(
                token for token in raw_name.split(' ')
                if token.lower() not in stop_company_name
            ),
        ),
    ]
    for symbol, name, raw_name in company_info
]

In [5]:
def spacy_extractor(text,server=None):
    """Extract subject-verb-object triples from `text` with spaCy and textacy.

    Args:
        text: Raw document text; parsed with the module-level `nlp` pipeline.
        server: Unused. Accepted (and now optional) so this extractor can be
            called with the same keyword arguments as `corenlp_extractor`.

    Returns:
        A list of the triples yielded by
        `textacy.extract.subject_verb_object_triples` for the parsed document.
    """
    doc=nlp(text)
    # The previous string-converting `map` was never consumed and referenced
    # the interactive `_` variable; it has been dropped.
    return list(textacy.extract.subject_verb_object_triples(doc))

In [6]:
def corenlp_extractor(text,debug=False,server=0):
    """Extract (subject, relation, object) triples with CoreNLP OpenIE.

    Args:
        text: Raw document text to annotate.
        debug: Unused; kept for backward compatibility.
        server: Index into the module-level `stanford_server_list`.

    Returns:
        A list of `(subject, relation, object)` string tuples; empty when the
        server response is not a JSON object or carries no OpenIE output.
    """
    output=stanford_server_list[server].annotate(text, properties={
        'annotators': 'openie',
        'outputFormat': 'json'
        })
    # Anything other than a parsed JSON object (presumably an error payload
    # from the server - TODO confirm) yields no triples.
    if not isinstance(output,dict):
        return []
    # `.get` keeps a sentence without an `openie` entry from aborting the
    # whole document with a KeyError.
    return [
        (triple['subject'],triple['relation'],triple['object'])
        for sentence in output.get('sentences',[])
        for triple in sentence.get('openie',[])
    ]

In [7]:
def to_SVO_embedding(text,word_embedding):
    """Average the embedding vectors of every term in `text`.

    Each element obtained by iterating `text` is converted with `str` and
    looked up directly in `word_embedding`; the element-wise mean of the
    resulting vectors is returned.
    """
    vectors=[]
    for term in text:
        vectors.append(word_embedding[str(term)])
    return np.array(vectors).mean(axis=0)

In [8]:
def core_nlp_to_SVO_embedding(text,word_embedding):
    """Average the embeddings of the tokens of `text` that are in the vocabulary.

    `text` is tokenised with the module-level `nlp`; tokens missing from
    `word_embedding.wv` are skipped. Returns None when no token has a vector,
    otherwise the element-wise mean of the vectors found.
    """
    vectors=[]
    for token in nlp(text):
        key=str(token)
        if key in word_embedding.wv:
            vector=word_embedding.wv[key]
            if vector is not None:
                vectors.append(vector)
    if not vectors:
        return None
    return np.array(vectors).mean(axis=0)

In [313]:
import nltk
# NLTK's English stop words plus a few extra tokens and every single
# lowercase letter.
stopwords = nltk.corpus.stopwords.words('english')
stopwords += ['would','kmh','mph','u']
stopwords += list(string.ascii_lowercase)
# Month/weekday abbreviations and corporate suffixes that are never accepted
# as a company mention.
stop_symbols=['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC','MON','TUE','WED','THU','FRI','SAT','SUN',
              'company','Inc','Company','inc','Inc.','plc','Plc','corp','Corp.','Co','co','Co.','co.','worldwide','corporation','group']
# Matches any character that is neither an ASCII letter nor a hyphen.
regex = re.compile(r'[^A-Za-z-]')
def _load_price_features(stock_coll,symbol,sample_date,max_gap_days):
    """Return `(price_label, past_price_data)` for `symbol`, or None if the price history is insufficient."""
    past=pd.DataFrame(list(stock_coll.find({'symbol':symbol,'date':{'$lte':sample_date}}).sort('date',-1).limit(7)))
    if len(past)!=7:
        return None
    # Explicit ascending sort: without it the server may return matching
    # documents in any order, so "the next two trading days" is not guaranteed.
    future=list(stock_coll.find({'symbol':symbol,'date':{'$gte':sample_date}}).sort('date',1).limit(2))
    if len(future)<2:
        return None
    if (future[0]['date']-sample_date).days>max_gap_days:
        return None
    closes=past['adj_close'].values
    # Six day-over-day differences (newest first), standardised.
    past_price_data=scale(closes[0:-1]-closes[1:])
    price_label=np.sign(future[1]['adj_close']-future[0]['adj_close'])
    return price_label,past_price_data


def _match_company_prices(stock_coll,svo,companies,sample_date):
    """Return `(company, price_label, past_price_data)` for the first company mentioned by `svo`, or None."""
    subject,predicate,obj=str(svo[0]),str(svo[1]),str(svo[2])
    # (text, maximum gap in days between the article and the first price row).
    # NOTE(review): the object slot has always tolerated 3 days while the other
    # two tolerate 2 - kept as is; confirm whether that difference is intended.
    candidates=((subject,2),(predicate,2),(obj,3))
    for company in companies:
        for alias in company:
            max_gap_days=next(
                (gap for text,gap in candidates
                 if len(text)>2 and text not in stop_symbols and text in alias),
                None)
            if max_gap_days is None:
                continue
            features=_load_price_features(stock_coll,company[0],sample_date,max_gap_days)
            if features is None:
                # Not enough price data for this company: move on to the next one.
                break
            return (company,)+features
    return None


def extract_events_and_companies_mp(sample_server,companies,word_embedding,events_extractor=spacy_extractor,**kwargs):
    """Extract SVO events from one article and attach price data for the first company each event mentions.

    Args:
        sample_server: `(sample, server)` pair. `sample` is a mapping with
            `'date'` and `'content'` keys; `server` is forwarded to
            `events_extractor`.
        companies: Iterable of company records. Each record is a sequence of
            strings whose first element is the ticker symbol used for price
            queries; every element is searched for the event's subject,
            predicate and object text (substring match).
        word_embedding: Embedding model handed to the SVO embedding helpers.
        events_extractor: Callable `(text, server=..., **kwargs)` returning a
            sequence of triples.
        **kwargs: Extra keyword arguments forwarded to `events_extractor`.

    Returns:
        None when no event could be linked to a company with sufficient price
        data, otherwise `{'result': [...]}` whose items are
        `(svo, svo_embedding, company, price_label, past_price_data, sample_date)`.
    """
    sample,server=sample_server[0],sample_server[1]
    sample_date=sample['date']
    svo_triples=events_extractor(sample['content'],server=server,**kwargs)
    if len(svo_triples)==0:
        return None
    if events_extractor==spacy_extractor:
        embed=to_SVO_embedding
    else:
        embed=core_nlp_to_SVO_embedding
    svo_result=[]
    # A dedicated client per call, closed when the call finishes so
    # connections do not pile up across thousands of samples.
    client=MongoClient(MONGO_URL)
    try:
        db=client.stockdb
        db.authenticate(name=MONGO_USERNAME, password=MONGO_PASSWORD)
        stock_coll=db.stockcoll
        for svo in svo_triples:
            # Skip unusable triples instead of returning: one bad triple used
            # to discard every event already collected for the article.
            if len(svo)==0:
                continue
            svo_embedding=(embed(svo[0],word_embedding),embed(svo[1],word_embedding),embed(svo[2],word_embedding))
            if any(part is None for part in svo_embedding):
                continue
            match=_match_company_prices(stock_coll,svo,companies,sample_date)
            if match is not None:
                company,price_label,past_price_data=match
                svo_result.append((svo,svo_embedding,company,price_label,past_price_data,sample_date))
    finally:
        client.close()
    if len(svo_result)>0:
        return {'result':svo_result}
    return None

In [38]:
from sklearn.preprocessing import scale
def extract_all_events_for_company_mp(sample_server,company,word_embedding,events_extractor=spacy_extractor,**kwargs):
    """Extract every SVO event of one article and pair them with one company's price data.

    Args:
        sample_server: `(sample, server)` pair. `sample` is a mapping with
            `'date'` and `'content'` keys; `server` is forwarded to
            `events_extractor`.
        company: Ticker symbol used to query the module-level `stock_coll`.
        word_embedding: Embedding model handed to the SVO embedding helper.
        events_extractor: Callable `(text, server=..., **kwargs)` returning a
            sequence of triples.
        **kwargs: Extra keyword arguments forwarded to `events_extractor`.

    Returns:
        None when the price history is insufficient or no triple could be
        embedded, otherwise
        `[event_embeddings, price_label, past_price_data, sample_date]`, where
        `event_embeddings` stacks one (subject, predicate, object) embedding
        array per event.
    """
    sample,server=sample_server[0],sample_server[1]
    sample_date=sample['date']
    # Price checks run first so articles without usable prices skip extraction.
    past=pd.DataFrame(list(stock_coll.find({'symbol':company,'date':{'$lte':sample_date}}).sort('date',-1).limit(7)))
    if len(past)!=7:
        return None
    # Explicit ascending sort: without it the server may return matching
    # documents in any order, so "the next two trading days" is not guaranteed.
    future=list(stock_coll.find({'symbol':company,'date':{'$gte':sample_date}}).sort('date',1).limit(2))
    if len(future)<2:
        return None
    if (future[0]['date']-sample_date).days>2:
        return None
    closes=past['adj_close'].values
    # Six day-over-day differences (newest first), standardised.
    past_price_data=scale(closes[0:-1]-closes[1:])
    price_label=np.sign(future[1]['adj_close']-future[0]['adj_close'])
    svo_triples=events_extractor(sample['content'],server=server,**kwargs)
    # Same helper selection as extract_events_and_companies_mp: spaCy triples
    # are token sequences, CoreNLP triples are plain strings.
    if events_extractor==spacy_extractor:
        embed=to_SVO_embedding
    else:
        embed=core_nlp_to_SVO_embedding
    svo_result=[]
    for svo in svo_triples:
        if len(svo)==0:
            continue
        svo_embedding=[embed(part,word_embedding) for part in (svo[0],svo[1],svo[2])]
        if any(part is None for part in svo_embedding):
            continue
        svo_result.append(np.array(svo_embedding))
    if len(svo_result)==0:
        return None
    return [np.array(svo_result),price_label,past_price_data,sample_date]

In [34]:
# Round-robin server assignment, one index per sample. `range()` needs the
# list's length (a list argument raises TypeError), and the modulo form has
# no fixed cap that could leave later samples without a server.
server_index=[position % len(stanford_server_list) for position in range(len(samples))]

In [35]:
# Pair each sample with the index of the server assigned to it.
sample_server=list(zip(samples,server_index))

In [33]:
def process_sample_to_sentences(sample):
    """Tokenise `sample['content']` with `nlp` and return the tokens as strings."""
    return [str(token) for token in nlp(sample['content'])]

In [28]:
# Parse the first article on its own (looks like an ad-hoc check - TODO confirm it is still needed).
test=nlp(samples[0]['content'])

In [36]:
%%time
# Thread pool (multiprocessing.dummy). The `with` block shuts the workers
# down once the blocking map has returned instead of leaving them running.
with Pool(8) as pool:
    sentences=pool.map(process_sample_to_sentences,samples)

CPU times: user 10min 15s, sys: 58.4 s, total: 11min 13s
Wall time: 2min 58s


In [13]:
# One token list per article.
len(sentences)

23205

In [37]:
%%time
# Train word vectors on the tokenised articles (150 dimensions, sg=1, and
# min_count=1 so no token is dropped for being rare).
# NOTE(review): no seed is set and 8 workers are used, so vectors will differ between runs.
word_embedding=Word2Vec(sentences,min_count=1,workers=8,size=150,sg=1)



CPU times: user 9min 47s, sys: 696 ms, total: 9min 48s
Wall time: 1min 20s


In [332]:
# Smoke test: run the multi-company extractor on the first sample only.
test_result=[]
test_result=extract_events_and_companies_mp(sample_server[0],[['AAPL', 'Apple Inc.', 'Apple']],word_embedding,events_extractor=corenlp_extractor)

In [31]:
# Smoke test: run the single-company extractor on the first sample only.
test_result=[]
test_result=extract_all_events_for_company_mp(sample_server[0],'AAPL',word_embedding,events_extractor=corenlp_extractor)

For multiple companies

In [339]:
%%time
# Bind the fixed arguments once, then map over (sample, server) pairs.
partial_work = partial(extract_events_and_companies_mp,companies=[['GOOG', 'Alphabet Inc', 'Google']],word_embedding=word_embedding,events_extractor=corenlp_extractor)
# The `with` block shuts the worker threads down once the map has returned.
with Pool(8) as pool:
    test_result = pool.map(partial_work, sample_server)

CPU times: user 7min 8s, sys: 24.9 s, total: 7min 33s
Wall time: 13min 40s


For a single company

In [39]:
%%time
# Bind the fixed arguments once, then map over (sample, server) pairs.
partial_work = partial(extract_all_events_for_company_mp,company='AAPL',word_embedding=word_embedding,events_extractor=corenlp_extractor)
# The `with` block shuts the worker threads down once the map has returned.
with Pool(8) as pool:
    test_result = pool.map(partial_work, sample_server)

CPU times: user 9min 12s, sys: 34.6 s, total: 9min 47s
Wall time: 15min 30s


In [40]:
# Keep only the samples for which extraction produced a result.
final_result=[result for result in test_result if result is not None]

In [41]:
# Number of samples that produced a result.
len(final_result)

7669

###  Neural Tensor Network

Use the script in `models` to train the NTN and obtain the event embeddings.

In [14]:
# First element of every result: the per-sample stack of event embeddings.
svo_embeddings=tuple(result[0] for result in final_result)

In [15]:
# Stack the per-sample event arrays along the first axis.
events=np.vstack(svo_embeddings)

In [65]:
# Persist the filtered extraction results to the file `extracted_data`.
with open('extracted_data','wb') as f:
    pickle.dump(final_result,f)

In [56]:
# np.save appends the `.npy` suffix, so this writes `data.npy`.
np.save('data',np.array(events))

In [7]:
data = np.load('data.npy')
# Axis 1 selects the slot of the triple: 0 -> O1, 1 -> P, 2 -> O2.
O1_ndarray, P_ndarray, O2_ndarray = (data[:, slot, :] for slot in range(3))

##  Classification

In [23]:
# Load embeddings from tmp_result.npy (presumably written by the NTN training script - TODO confirm).
embedding_result=np.load('tmp_result.npy')

In [24]:
# Dimensions of the loaded embedding array.
embedding_result.shape

(717423, 100)

In [30]:
# Use the loaded embeddings as the feature matrix.
X=embedding_result

In [355]:
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load `result_dict` if it was produced by a trusted run.
with open('result_dict','rb') as f:
    embedding_dict=pickle.load(f)

In [360]:
# Sorted keys of the embedding dictionary as an array.
embedding_index=np.array(sorted(embedding_dict))

In [457]:
# One price label (index 3 of each event tuple) per event.
# NOTE(review): indexes result['result'], i.e. expects the dict-shaped output
# of extract_events_and_companies_mp, not the list-shaped single-company output.
y_label=[ label[3] for result in final_result for label in result['result']]

In [458]:
# Convert the label list to an array.
y_label=np.array(y_label)

In [364]:
# Lower-case alias of the embedding matrix (distinct from the `X` used elsewhere).
x=embedding_result

In [75]:
# Axis 1 selects the slot of the triple: 0 -> O1, 1 -> P, 2 -> O2.
O1_ndarray, P_ndarray, O2_ndarray = (events[:, slot, :] for slot in range(3))

In [11]:
# Concatenate the three slot embeddings side by side: one row per event.
X=np.hstack((O1_ndarray,P_ndarray,O2_ndarray))

In [27]:
# Repeat each sample's label (index 1) and price window (index 2) once per
# event in that sample (index 0), giving one entry per event.
y=np.array([ result[1] for result in final_result for r in result[0]])
price_sequence=np.array([ result[2] for result in final_result for _ in result[0]])

In [13]:
# Append the price window to every event row.
# NOTE(review): not idempotent - re-running this cell appends the price columns again.
X=np.hstack((X,price_sequence))

In [36]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers, optimizers
from keras.utils import to_categorical

In [15]:
# Date (index 3) of every result.
event_dates=[result[3] for result in final_result]

In [383]:
# Label and price window of the first event of every document.
event_label=[result['result'][0][3] for result in final_result]
event_price=[result['result'][0][4] for result in final_result]

In [18]:
# Distinct event dates in ascending order.
event_dates=sorted(set(event_dates))

In [527]:
# Build one row per document by averaging the feature vectors of its events.
all_data=[]
# Running row index into embedding_result; advances once per event.
tmp_index=0
for result in final_result:
    data_in_one_doc=[]
    for event in result['result']:
#         date=event[5]
        label=event[3]
        price=event[4]
        embedding=embedding_result[tmp_index]
        # NOTE(review): the label is concatenated into the feature vector
        # itself - confirm it is split off again before training.
        feature_vector=np.concatenate((embedding,price,np.array([label])))
        data_in_one_doc.append(feature_vector)
        tmp_index+=1
    all_data.append(np.mean(data_in_one_doc,axis=0))
x=np.array(all_data)

In [484]:
# Distinct event dates, newest first.
all_dates=sorted(set(event_dates),reverse=True)

In [518]:
# Build one row per date by averaging the feature vectors of that day's events.
# Events are grouped by date in a single pass instead of rescanning every
# event for every date; per-day ordering, and therefore the mean, is unchanged.
vectors_by_date={}
# Running row index into embedding_result; advances once per event.
tmp_index=0
for result in final_result:
    for event in result['result']:
        label=event[3]
        price=event[4]
        embedding=embedding_result[tmp_index]
        # NOTE(review): the label is concatenated into the feature vector
        # itself - confirm it is split off again before training.
        feature_vector=np.concatenate((embedding,price,np.array([label])))
        vectors_by_date.setdefault(event[5],[]).append(feature_vector)
        tmp_index+=1
# A date without events averages an empty list (nan), exactly as before.
all_data=[np.mean(vectors_by_date.get(date,[]),axis=0) for date in all_dates]
x=np.array(all_data)

In [31]:
# Two-column one-hot labels: [0,1] for a positive label, [1,0] otherwise.
y_label=np.array([[0,1] if value>0 else [1,0] for value in y])

In [510]:
# Use the loaded embeddings as the feature matrix for training.
X=embedding_result

In [511]:
# One label per event, taken from index 3 of each event tuple.
y=np.array([event[3] for result in final_result for event in result['result']])
# Map -1 to 0; only strictly positive labels end up in the second class.
y[y==-1]=0
y_label=np.array([[0,1] if value>0 else [1,0] for value in y])

In [47]:
# Small fully connected classifier over the event features.
model = Sequential()
# model.add(Flatten(input_shape=(x.shape, input_dim)))
# The input width follows the feature matrix rather than a hard-coded 100,
# so the cell also works when X carries extra columns.
model.add(Dense(32, input_dim=X.shape[1],activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# process Training
model.fit(X, y_label, batch_size=128, verbose=1,validation_split=0.3, epochs=10,shuffle=True)

Train on 502196 samples, validate on 215227 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f42193f5278>