In [199]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from itertools import cycle

from gensim.models import Word2Vec
import gensim.downloader as api

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical

In [2]:
# Read the merged tweets/prices table; the first CSV column is parsed as dates and used as index.
data = pd.read_csv(
    "data/tweets_prices.csv",
    index_col=0,
    parse_dates=[0],
)
# Keep only the rows that actually carry tweet text.
tweets = data.loc[data["text"].notna()]
len(tweets)

298325

In [3]:
tweets.head()

Unnamed: 0,open,high,low,close,marketVolume,profit,id,text,favorite_count,retweet_count,user,screen_name,cleaned_text
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,@Tesla Design it to be the perfect looking car...,0.0,0.0,katherine,katherine828,tesla design perfect looking car hey thief get...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,@Seanmmvi @Erdayastronaut @Tesla @SpaceX This ...,0.0,0.0,Joseph Lustig,boadickia,seanmmvi erdayastronaut tesla spacex also comb...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,Amazing! I wish $AAPL would do the same in the...,0.0,0.0,Titus Pacis,TitusPacis,amazing wish aapl would software updates oppos...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,@Tesla Wow. The S3XY family will be complete. ...,0.0,0.0,Lukas,Lukas94597338,tesla wow family complete looking forward
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,It's time to call it a night. I'm tired after ...,0.0,0.0,",`,",anna88058347,time call night tired reading post gon na sleep


## Preprocessing

In [4]:
# Discard the engagement counters and the tweet id columns.
tweets = tweets.drop(columns=["favorite_count", "retweet_count", "id"])

In [5]:
# Coerce every cleaned_text entry to a Python string so that .split() is safe later on.
# NOTE(review): a missing value would become the literal token "nan" here — confirm whether
# the column contains NaNs and whether that is intended.
tweets["cleaned_text"] = tweets["cleaned_text"].map(str)

In [6]:
tweets.head()

Unnamed: 0,open,high,low,close,marketVolume,profit,text,user,screen_name,cleaned_text
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,@Tesla Design it to be the perfect looking car...,katherine,katherine828,tesla design perfect looking car hey thief get...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,@Seanmmvi @Erdayastronaut @Tesla @SpaceX This ...,Joseph Lustig,boadickia,seanmmvi erdayastronaut tesla spacex also comb...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,Amazing! I wish $AAPL would do the same in the...,Titus Pacis,TitusPacis,amazing wish aapl would software updates oppos...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,@Tesla Wow. The S3XY family will be complete. ...,Lukas,Lukas94597338,tesla wow family complete looking forward
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,It's time to call it a night. I'm tired after ...,",`,",anna88058347,time call night tired reading post gon na sleep


In [7]:
# Word -> number of occurrences over all cleaned tweets (populated by the counting loop that follows).
vocab = Counter()

In [8]:
# Tally every space-separated token of every cleaned tweet.
vocab.update(
    token
    for cleaned in tweets.cleaned_text
    for token in cleaned.split(" ")
)

In [9]:
len(vocab)

88019

In [10]:
# Cap the vocabulary at the most frequent words, then keep only those seen more than 15 times.
max_vocab_size = 50_000
vocab_most_common = vocab.most_common(max_vocab_size)
vocab_final = list(filter(lambda entry: entry[1] > 15, vocab_most_common))

In [11]:
# Number of words that survived the frequency filter.
vocab_size = len(vocab_final)
vocab_size

10030

Ponieważ nie mamy zbyt dużej ilości danych, do modelu użyjemy wcześniej wytrenowanych (pretrenowanych) embeddingów word2vec. Ponieważ słów w języku angielskim jest dużo, a każde słowo będziemy reprezentować przez 300-elementowy wektor, to niestety zajmiemy dużo pamięci (3_000_000 słów * 300 liczb/słowo * 32 bity = 3.6 GB). http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/

In [13]:
# Binary target: 1 when the row's profit is strictly positive, otherwise 0.
tweets["up"] = (tweets["profit"] > 0).to_numpy().astype(np.int64)

In [14]:
# Small fixed sample for quick experiments; the seed keeps the cells that display it reproducible.
sample = tweets.sample(10, random_state=42)

In [15]:
# Build (list of word vectors, label) pairs for the sampled tweets.
# NOTE(review): get_vector is only defined in a later cell, so this cell fails on a fresh
# top-to-bottom run; it also rebinds `data`, which previously held the loaded CSV frame.
data = [([get_vector(word) for word in tweet["cleaned_text"].split(" ")], tweet["up"]) for _idx, tweet in sample.iterrows()]

In [16]:
len(data)

10

In [17]:
# Alias of the sample; not referenced again in the cells shown here.
train = sample

In [18]:
def ohe(data, label_name="up", num_classes=2):
    """One-hot encode the integer class labels stored in ``data[label_name]``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the label column.
    label_name : str
        Name of the column with integer class ids.
    num_classes : int
        Width of the encoding; defaults to 2 (the binary ``up`` label).

    Returns
    -------
    numpy.ndarray
        ``int8`` array of shape ``(n_rows, num_classes)`` with a single 1 per row.

    Raises
    ------
    IndexError
        If a label is not an integer or is ``>= num_classes``.
    """
    labels = np.asarray(data[label_name])
    label_array = np.zeros((len(labels), num_classes), dtype=np.int8)
    # Guard the empty case: an empty label column may not have an integer dtype,
    # which fancy indexing would reject even though there is nothing to set.
    if len(labels):
        # One vectorised assignment instead of a Python-level loop over every row.
        label_array[np.arange(len(labels)), labels] = 1
    return label_array

ohe(sample)

array([[0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [0, 1]], dtype=int8)

In [19]:
to_categorical(tweets.up)

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [20]:
def train_generator(tweets):
    """Endlessly yield the whole of ``tweets`` as one zero-padded batch.

    NOTE: a later cell defines ``train_generator(tweets, batch_size)``; once that cell
    has run, this single-batch version is shadowed.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide a ``cleaned_text`` column of space-separated tokens plus the
        label column read by ``ohe``.

    Yields
    ------
    (x_train, y_train)
        ``x_train`` has shape ``(n_tweets, longest_tweet, vector_size)``; tweets shorter
        than the longest one are right-padded with zero vectors. ``y_train`` is
        ``ohe(tweets)``.
    """
    while True:
        # Embed on every pass so that each yielded batch is a fresh array.
        sequences = [
            [get_vector(word) for word in text.split(" ")]
            for text in tweets["cleaned_text"]
        ]
        longest = max((len(seq) for seq in sequences), default=0)
        # str.split(" ") always returns at least one token, so sequences[0][0] exists.
        vector_size = len(sequences[0][0]) if sequences else 0

        # Tweets differ in length, so pad explicitly instead of handing NumPy a ragged list.
        x_train = np.zeros((len(sequences), longest, vector_size))
        for row, seq in enumerate(sequences):
            x_train[row, :len(seq)] = seq

        y_train = ohe(tweets)
        yield x_train, y_train

In [166]:
# Raw array of the sampled tweets' cleaned text.
sentences = sample["cleaned_text"].values

In [167]:
sentences

array(['elonmusk wanting tesla broke afford one firstworldproblems',
       'tesla model new prius everywhere',
       'mortchad tesla mmm rims though classy https',
       'tesla horses courses guess cup tea though',
       'tesla fintech smart homes disruptors radar banyan hill publishing https https',
       'first quarter tesla recorded drop solar installations mw compared https',
       'iliketeslas danahull think dana risks career would surely mgmt pressure https',
       'posted https',
       'elonmusk make lawnmowers sound systems basically anything possibly wake every fucking day https',
       'norbertelekes case thought one idea another stats post amount iphone https'],
      dtype=object)

In [37]:
# Tokenise every cleaned tweet on single spaces.
words = list(map(lambda cleaned: cleaned.split(" "), tweets["cleaned_text"]))

In [38]:
# Length, in tokens, of the longest tweet.
max_len = max(map(len, words))

In [39]:
max_len

30

In [40]:
# Load the "glove-twitter-50" pre-trained vectors through gensim's downloader API.
glove_vectors = api.load("glove-twitter-50")  # may take a while

In [80]:
# Dimensionality of the embedding vectors fed to the model.
# NOTE(review): must match the embeddings loaded via api.load("glove-twitter-50") — confirm.
GLOVE_SIZE = 50

In [61]:
_OOV_VECTORS = {}  # unknown word -> the random stand-in vector drawn for it


def get_vector(word):
    """Return the embedding vector used to represent ``word`` in the model input.

    Words known to ``glove_vectors`` return their pre-trained vector. Unknown
    words get a small random vector (normal, std 1e-4) whose length follows the
    loaded embeddings. That vector is drawn once per word and reused, so a given
    out-of-vocabulary word is represented the same way for the whole kernel session.

    The returned array may be shared between calls; callers must copy it rather
    than modify it in place.
    """
    try:
        return glove_vectors.get_vector(word)
    except KeyError:
        if word not in _OOV_VECTORS:
            _OOV_VECTORS[word] = np.random.normal(0, 0.0001, glove_vectors.vector_size)
        return _OOV_VECTORS[word]

In [62]:
get_vector(" ")

array([ 8.12179539e-05,  2.24099400e-05, -2.26701089e-04,  3.57957283e-05,
        1.85892230e-04, -2.89448765e-05,  2.25337932e-05, -5.43789226e-05,
       -3.55197046e-06,  6.97891724e-07, -1.58316943e-05,  2.51386827e-05,
        1.14340114e-04,  1.90829048e-04, -1.72009951e-04, -1.95580379e-04,
       -1.15109928e-05,  1.57885760e-04,  1.68249919e-04, -1.16418951e-04,
       -1.56109615e-04,  1.20772182e-04,  1.24681265e-04,  2.33244286e-05,
        9.63188233e-05,  6.18320459e-05, -1.73639648e-05,  5.65897951e-05,
        2.26300010e-05, -5.41309129e-05, -2.99663831e-05,  6.15746170e-05,
        1.12563459e-04, -9.22889307e-05,  6.17750413e-05, -1.31972729e-04,
       -1.46133583e-04,  6.78083342e-05, -2.67585628e-05, -8.41189597e-05,
       -2.05879688e-05, -5.45523164e-05, -1.56278976e-04, -2.08213262e-05,
       -4.86686610e-05,  7.31208930e-05,  2.07883830e-06, -1.40768900e-04,
        3.30593730e-05,  1.82175944e-04])

In [212]:
def train_generator(tweets, batch_size):
    """Endlessly generate ``(x, y)`` batches of ``batch_size`` tweets.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must provide a ``cleaned_text`` column of space-separated tokens plus the
        label column read by ``ohe``.
    batch_size : int
        Number of tweets per yielded batch.

    Yields
    ------
    (x_train, y_train)
        ``x_train`` has shape ``(batch_size, max_len, GLOVE_SIZE)`` where ``max_len`` is
        the longest tweet in ``tweets``; shorter tweets are right-padded with zeros.
        ``y_train`` holds the matching one-hot labels. Tweets are visited in order and
        the iteration wraps around at the end, so a batch may span the wrap-around.

    Raises
    ------
    ValueError
        If ``tweets`` is empty.
    """
    sentences = [sentence.split(" ") for sentence in tweets["cleaned_text"]]
    if not sentences:
        raise ValueError("train_generator needs at least one tweet")
    labels = ohe(tweets)

    max_len = max(len(sentence) for sentence in sentences)

    def _new_batch():
        # Fresh zero-filled buffers for every batch: reusing one buffer would leave
        # stale word vectors in the padding positions of shorter tweets and would
        # mutate batches the consumer may still be holding.
        return (
            np.zeros((batch_size, max_len, GLOVE_SIZE)),
            np.zeros((batch_size, labels.shape[1])),
        )

    x_train, y_train = _new_batch()
    i = 0
    for sentence, label in cycle(zip(sentences, labels)):
        for j, word in enumerate(sentence):
            x_train[i, j, :] = get_vector(word)
        y_train[i, :] = label
        i += 1
        if i == batch_size:
            yield x_train, y_train
            x_train, y_train = _new_batch()
            i = 0

In [217]:
# Hold out 10% of the tweets; the fixed seed keeps the split repeatable.
# NOTE(review): train_test_split shuffles by default, so tweets that share a timestamp
# (and therefore the same price row and label) can land on both sides — confirm that
# this is acceptable for the evaluation.
tweets_train, tweets_test, y_train, y_test = train_test_split(
    tweets,
    tweets["up"],
    test_size=0.10,
    random_state=42,
)

# Model

In [219]:
# Batch sizes for the two generators. Validation uses full-size batches: with
# single-tweet batches val_loss would be computed from only a handful of samples,
# which makes EarlyStopping / ModelCheckpoint unreliable.
TRAIN_BATCH_SIZE = 1000
VAL_BATCH_SIZE = 1000

model = Sequential()

# Skip all-zero timesteps, i.e. the zero padding produced by train_generator.
# A single Masking layer is enough; the mask propagates to the layers after it.
model.add(Masking(mask_value=0.0, input_shape=(None, GLOVE_SIZE)))

# Recurrent layer
model.add(LSTM(64, return_sequences=False,
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer: one probability per class, matching the two-column one-hot labels
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Create callbacks: stop when val_loss stalls for 5 epochs and keep the best full model on disk
callbacks = [EarlyStopping(monitor='val_loss', patience=5), ModelCheckpoint('model.h5', save_best_only=True, save_weights_only=False)]

# NOTE(review): fit_generator is deprecated in newer Keras releases in favour of
# model.fit; it is kept for the Keras version this notebook was run with — confirm
# before upgrading.
history = model.fit_generator(
    train_generator(tweets_train, TRAIN_BATCH_SIZE),
    validation_data=train_generator(tweets_test, VAL_BATCH_SIZE),
    steps_per_epoch=30,
    # One validation pass covers (almost) the whole held-out set.
    validation_steps=max(1, len(tweets_test) // VAL_BATCH_SIZE),
    epochs=10,
    verbose=1,
    callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.callbacks.History at 0x7fae852bcba8>