In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from itertools import cycle

from gensim.models import Word2Vec
import gensim.downloader as api

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# Load the merged tweets/prices table; the first CSV column is parsed as dates and used as the index.
data = pd.read_csv("../data/tweets_prices.csv", parse_dates=[0], index_col=0)
# Keep only the rows that actually carry a tweet text.
tweets = data.dropna(subset=["text"])
len(tweets)

298325

In [3]:
# Preview the first rows to check which columns were loaded.
tweets.head()

Unnamed: 0,open,high,low,close,marketVolume,profit,id,text,favorite_count,retweet_count,user,screen_name,cleaned_text
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,@Tesla Design it to be the perfect looking car...,0.0,0.0,katherine,katherine828,tesla design perfect looking car hey thief get...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,@Seanmmvi @Erdayastronaut @Tesla @SpaceX This ...,0.0,0.0,Joseph Lustig,boadickia,seanmmvi erdayastronaut tesla spacex also comb...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,Amazing! I wish $AAPL would do the same in the...,0.0,0.0,Titus Pacis,TitusPacis,amazing wish aapl would software updates oppos...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,@Tesla Wow. The S3XY family will be complete. ...,0.0,0.0,Lukas,Lukas94597338,tesla wow family complete looking forward
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,1.106282e+18,It's time to call it a night. I'm tired after ...,0.0,0.0,",`,",anna88058347,time call night tired reading post gon na sleep


## Preprocessing

In [4]:
# Remove the tweet metadata columns that are not used further in this notebook.
tweets = tweets.drop(columns=["favorite_count", "retweet_count", "id"])

In [5]:
# Make every cleaned text a str so that .split() works below.
# Missing values are filled with "" first: calling str() on NaN would turn them
# into the literal token "nan", which would then be counted in the vocabulary
# and looked up as if it were a real word.
tweets["cleaned_text"] = tweets["cleaned_text"].fillna("").apply(str)

In [6]:
# Preview the frame again after dropping columns and normalising cleaned_text.
tweets.head()

Unnamed: 0,open,high,low,close,marketVolume,profit,text,user,screen_name,cleaned_text
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,@Tesla Design it to be the perfect looking car...,katherine,katherine828,tesla design perfect looking car hey thief get...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,@Seanmmvi @Erdayastronaut @Tesla @SpaceX This ...,Joseph Lustig,boadickia,seanmmvi erdayastronaut tesla spacex also comb...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,Amazing! I wish $AAPL would do the same in the...,Titus Pacis,TitusPacis,amazing wish aapl would software updates oppos...
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,@Tesla Wow. The S3XY family will be complete. ...,Lukas,Lukas94597338,tesla wow family complete looking forward
2019-03-14 19:55:00,290.905,290.95,289.81,289.83,50356.2,1.075,It's time to call it a night. I'm tired after ...,",`,",anna88058347,time call night tired reading post gon na sleep


In [7]:
# Token -> number of occurrences; filled by the counting loop in the next cell.
vocab = Counter()

In [8]:
# Count every space-separated token across all cleaned tweets.
for cleaned in tweets.cleaned_text:
    vocab.update(cleaned.split(" "))

In [9]:
# Number of distinct tokens seen in the cleaned tweets.
len(vocab)

88019

In [10]:
# Keep at most the 50,000 most frequent tokens, then discard every token
# that occurs 15 times or fewer.
max_vocab_size = 50_000
vocab_most_common = vocab.most_common(max_vocab_size)
vocab_final = [entry for entry in vocab_most_common if entry[1] > 15]

In [11]:
# Size of the vocabulary that survives the frequency filter.
vocab_size = len(vocab_final)
vocab_size

10030

Ponieważ nie mamy zbyt dużej ilości danych, w modelu użyjemy wstępnie wytrenowanych embeddingów word2vec. Słów w języku angielskim jest dużo, a każde słowo będziemy reprezentować przez 300-elementowy wektor, więc niestety zajmiemy dużo pamięci (3_000_000 słów * 300 liczb/słowo * 32 bity (4 bajty) na liczbę = 3.6 GB). http://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/

In [12]:
# Binary target: 1 when the profit is strictly positive, 0 otherwise
# (a missing profit compares as False and therefore maps to 0).
tweets["up"] = (tweets["profit"] > 0).astype("int64")

In [15]:
def ohe(data, label_name="up"):
    """Return a two-column one-hot encoding of ``data[label_name]``.

    The result is an int8 array with one row per entry of ``data``; in each
    row, the column whose index equals the label value is set to 1.
    """
    encoded = np.zeros((len(data), 2), dtype=np.int8)
    row = 0
    for class_index in data[label_name]:
        encoded[row, class_index] = 1
        row += 1
    return encoded

In [16]:
# Keras' built-in one-hot encoding of the labels, displayed for comparison with ohe();
# the result is not stored.
to_categorical(tweets.up)

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

Poniżej załadujemy 50-wymiarowe wektory reprezentujące każde słowo. Możemy też użyć wektorów o większej liczbie wymiarów (np. 100 zamiast 50), ale zajmą one wtedy 2 razy więcej miejsca (te 50-wymiarowe zajmują 200 MB).

In [17]:
# Load the "glove-twitter-50" vectors through gensim's downloader API.
glove_vectors = api.load("glove-twitter-50")  # may take a while

In [18]:
# Length of one word vector; must match the embedding loaded above ("glove-twitter-50").
GLOVE_SIZE = 50

In [21]:
def get_vector(word):
    """Return the vector representation of ``word`` to be fed into the model.

    Words known to ``glove_vectors`` get their pre-trained vector. Any other
    word gets a freshly drawn vector of length ``GLOVE_SIZE`` with very small
    random values, so repeated calls for the same unknown word return
    different vectors.
    """
    try:
        return glove_vectors.get_vector(word)
    except KeyError:
        # Not all zeros on purpose: all-zero vectors are treated as padding and
        # masked (ignored) by the model. The length comes from GLOVE_SIZE so it
        # stays in sync with the batch arrays built in train_generator.
        return np.random.normal(0, 0.0001, GLOVE_SIZE)

In [22]:
# Sanity check of the fallback: a lone space is presumably not in the GloVe vocabulary,
# so this should show the small random vector.
get_vector(" ")

array([ 2.14686696e-05,  5.65701298e-05,  1.82052083e-04,  1.08248206e-04,
        2.82508985e-05,  1.74204412e-05,  1.55807298e-06,  3.68303046e-05,
        7.61260135e-05, -1.69741957e-04, -1.03244280e-04, -6.27697645e-05,
        1.44851437e-04, -9.02763941e-05, -2.62296049e-05,  2.58652942e-05,
       -1.91685304e-05,  5.02680212e-05, -1.56463837e-04,  8.45588081e-05,
        2.27471609e-07, -7.16238505e-05, -1.08695950e-04,  7.20380195e-05,
        1.34209816e-05, -2.12639268e-04, -5.00663427e-05, -9.76742954e-05,
        8.97838605e-05, -4.83875823e-05, -1.91569245e-04,  5.45593536e-05,
        5.67270170e-06,  1.04160722e-04, -1.54585461e-04, -7.11700763e-05,
        4.29185321e-05, -1.50988706e-05, -1.90328661e-05, -1.01883222e-04,
       -4.40567698e-05,  9.68468917e-06, -7.99816601e-05, -1.04211174e-04,
       -6.37025267e-05, -1.86169539e-05,  1.27310058e-04,  2.14417463e-05,
        9.15777838e-05,  6.52406783e-07])

In [23]:
def train_generator(tweets, batch_size):
    """Endlessly generate data from ``tweets`` in batches of size ``batch_size``.

    Each yielded item is a tuple ``(x, y)``:

    * ``x`` has shape ``(batch_size, max_len, GLOVE_SIZE)``, where ``max_len``
      is the token count of the longest tweet in ``tweets``. Row ``i`` holds
      the word vectors of one tweet; positions past the end of the tweet stay
      zero (padding).
    * ``y`` has shape ``(batch_size, 2)`` and holds the one-hot labels
      produced by ``ohe``.

    The tweets are cycled forever, so a batch may wrap around from the last
    tweet to the first one.

    Raises ValueError (on first iteration) if ``tweets`` has no rows.
    """
    sentences = [sentence.split(" ") for sentence in tweets["cleaned_text"]]
    labels = ohe(tweets)

    if not sentences:
        raise ValueError("tweets must contain at least one row")

    max_len = max(len(sentence) for sentence in sentences)

    def empty_batch():
        # Fresh zero-filled buffers for one batch.
        return (np.zeros((batch_size, max_len, GLOVE_SIZE)),
                np.zeros((batch_size, 2)))

    x_batch, y_batch = empty_batch()
    filled = 0
    for sentence, label in cycle(zip(sentences, labels)):
        for position, word in enumerate(sentence):
            x_batch[filled, position, :] = get_vector(word)
        y_batch[filled, :] = label
        filled += 1
        if filled == batch_size:
            yield x_batch, y_batch
            # Start the next batch in NEW arrays. Reusing the old buffers would
            # (a) leave stale word vectors behind wherever the next tweet in a
            # row is shorter than the previous one, and (b) overwrite a batch
            # the consumer may still be holding (e.g. in a prefetch queue).
            x_batch, y_batch = empty_batch()
            filled = 0

In [24]:
# Hold out 10% of the tweets; the fixed seed keeps the split reproducible.
# NOTE(review): tweets with the same timestamp appear to share the same profit/label,
# so a random row-level split may leak information - consider a time-based split.
tweets_train, tweets_test, y_train, y_test = train_test_split(
    tweets,
    tweets.up,
    test_size=0.10,
    random_state=43,
)

# Model

In [26]:
# Training configuration.
TRAIN_BATCH_SIZE = 1000
VAL_BATCH_SIZE = 1000
EPOCHS = 10
# Must be smaller than EPOCHS, otherwise early stopping can never trigger.
EARLY_STOPPING_PATIENCE = 3

model = Sequential()

# Skip timesteps whose features are all zero, i.e. the padding that
# train_generator leaves after the last word of a tweet. One Masking layer is
# enough; the mask is propagated to the LSTM.
model.add(Masking(mask_value=0., input_shape=(None, GLOVE_SIZE)))

# Recurrent layer
model.add(LSTM(64, return_sequences=False,
               dropout=0.1, recurrent_dropout=0.1))

# Fully connected layer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer: two classes (one-hot encoded labels)
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(
    optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Create callbacks: stop when val_loss stops improving and keep the best model on disk
callbacks = [
    EarlyStopping(monitor='val_loss', patience=EARLY_STOPPING_PATIENCE),
    ModelCheckpoint('model.h5', save_best_only=True, save_weights_only=False),
]

# Validate on (about) one full pass over the held-out tweets instead of only a
# handful of samples, so that val_loss is a stable signal for the callbacks.
# The generator cycles, so the last batch may wrap around to the first tweets.
validation_steps = int(np.ceil(len(tweets_test) / VAL_BATCH_SIZE))

history = model.fit_generator(
    train_generator(tweets_train, TRAIN_BATCH_SIZE),
    validation_data=train_generator(tweets_test, VAL_BATCH_SIZE),
    steps_per_epoch=50,
    validation_steps=validation_steps,
    epochs=EPOCHS,
    verbose=1,
    callbacks=callbacks)
history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8bbd24ef98>

In [222]:
# Fraction of training rows with up == 1; useful as a baseline to compare model accuracy against.
y_train.values.mean()

0.7507039315882782