In [None]:
# Load Packages

In [1]:
import json
import nltk
import os
import random
import re
import torch

from torch import nn, optim
import torch.nn.functional as F

In [2]:
with open('twits.json') as f:
    twits = json.load(f)
    
print(twits['data'][:10])

[{'sentiment': 2, 'message_body': '$FITB great buy at 26.00...ill wait', 'timestamp': '2018-07-01T00:00:09Z'}, {'sentiment': 1, 'message_body': '@StockTwits $MSFT', 'timestamp': '2018-07-01T00:00:42Z'}, {'sentiment': 2, 'message_body': '#STAAnalystAlert for $TDG : Jefferies Maintains with a rating of Hold setting target price at USD 350.00. Our own verdict is Buy  http://www.stocktargetadvisor.com/toprating', 'timestamp': '2018-07-01T00:01:24Z'}, {'sentiment': 1, 'message_body': '$AMD I heard there’s a guy who knows someone who thinks somebody knows something - on StockTwits.', 'timestamp': '2018-07-01T00:01:47Z'}, {'sentiment': 0, 'message_body': '$AMD reveal yourself!', 'timestamp': '2018-07-01T00:02:13Z'}, {'sentiment': 1, 'message_body': '$AAPL Why the drop? I warren Buffet taking out his position?', 'timestamp': '2018-07-01T00:03:10Z'}, {'sentiment': -2, 'message_body': '$BA bears have 1 reason on 06-29 to pay more attention https://dividendbot.com?s=BA', 'timestamp': '2018-07-01T

In [None]:
# Get Message Body and Sentiment Score

In [3]:
messages = [twit['message_body'] for twit in twits['data']]
sentiments = [twit['sentiment']+2 for twit in twits['data']]

In [4]:
print(messages[:10])
print(sentiments[:10])

['$FITB great buy at 26.00...ill wait', '@StockTwits $MSFT', '#STAAnalystAlert for $TDG : Jefferies Maintains with a rating of Hold setting target price at USD 350.00. Our own verdict is Buy  http://www.stocktargetadvisor.com/toprating', '$AMD I heard there’s a guy who knows someone who thinks somebody knows something - on StockTwits.', '$AMD reveal yourself!', '$AAPL Why the drop? I warren Buffet taking out his position?', '$BA bears have 1 reason on 06-29 to pay more attention https://dividendbot.com?s=BA', '$BAC ok good we&#39;re not dropping in price over the weekend, lol', '$AMAT - Daily Chart, we need to get back to above 50.', '$GME 3% drop per week after spike... if no news in 3 months, back to 12s... if BO, then bingo... what is the odds?']
[4, 3, 4, 3, 2, 3, 0, 3, 4, 0]


In [None]:

# Preprocessing the Data

In [5]:
# remove useless info
# convert the message into token

nltk.download('wordnet')

def preprocess(message):
    """
    This function takes a string as input, then performs these operations: 
        - lowercase
        - remove URLs
        - remove ticker symbols 
        - removes punctuation
        - tokenize by splitting the string on whitespace 
        - removes any single character tokens
    
    Parameters
    ----------
        message : The text message to be preprocessed.
        
    Returns
    -------
        tokens: The preprocessed text into tokens.
    """ 
    text = message.lower()
    text = re.sub(r"http?://[^\s]+"," ", text)
    text = re.sub(r"\$[a-zA-Z0-9]*"," ", text)
    text = re.sub(r"@[a-zA-Z0-9]*"," ", text)
    text = re.sub(r"[^a-z]"," ", text)
    tokens = text.split()
    wnl = nltk.stem.WordNetLemmatizer()
    tokens = [wnl.lemmatize(w) for w in tokens if len(w) > 1]
    return tokens



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zhuangsheng/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
tokenized = [preprocess(message) for message in messages]
print(tokenized[:3])

[['great', 'buy', 'at', 'ill', 'wait'], [], ['staanalystalert', 'for', 'jefferies', 'maintains', 'with', 'rating', 'of', 'hold', 'setting', 'target', 'price', 'at', 'usd', 'our', 'own', 'verdict', 'is', 'buy']]


In [7]:
# remove empty ones

print("Number of tokens before removing empty ones:{}".format(len(tokenized)))
good_tokens_idx = [idx for idx, token in enumerate(tokenized) if len(token) > 0]
tokenized = [tokenized[idx] for idx in good_tokens_idx]
sentiments = [sentiments[idx] for idx in good_tokens_idx]

print("Number of tokens after removing empty ones:{}".format(len(tokenized)))

Number of tokens before removing empty ones:1548010
Number of tokens after removing empty ones:1510516


In [8]:
# Bag of words

from collections import Counter

## flatten the nest list
stacked_tokens = [word for twit in tokenized for word in twit]
## genenrate bow
bow = Counter(stacked_tokens)


"""
Set the following variables:
    freqs
    low_cutoff
    high_cutoff
    K_most_common
"""
### Dictionart that contains the Frequency of words appearing in messages.
### The key is the token and the value is the frequency of that word in the corpus.

total_num_words = len(stacked_tokens)
freqs = {key:value/total_num_words for key, value in bow.items()}

### Float that is the frequency cutoff. Drop words with a frequency that is lower or equal to this number.
low_cutoff = 5e-6

### Integer that is the cut off for most common words. Drop words that are the `high_cutoff` most common words.
high_cutoff = 15

### The k most common words in the corpus. Use `high_cutoff` as the k.
K_most_common = [word[0] for word in bow.most_common(high_cutoff)]

### remove high qand low freq words
filtered_words = [word for word in freqs if (freqs[word] > low_cutoff and word not in K_most_common)]
print(K_most_common)
print(len(filtered_words))

['the', 'to', 'is', 'for', 'on', 'http', 'of', 'and', 'in', 'this', 'com', 'it', 'at', 'will', 'amp']
7034


In [None]:
# Preprare model used Data

In [9]:
from tqdm import tqdm

vocab = {word:ii+1 for ii, word in enumerate(filtered_words)}
id2vocab = {ii:word for ii, word in enumerate(filtered_words)}

filtered = []
for twit in tqdm(tokenized):
    filtered.append([word for word in twit if word in filtered_words])

100%|██████████| 1510516/1510516 [21:37<00:00, 1164.55it/s]


In [None]:
# Balancing the classes

In [10]:
## balancing the classes

balanced = {'messages': [], 'sentiments':[]}

n_neutral = sum(1 for each in sentiments if each == 2)
N_examples = len(sentiments)
keep_prob = (N_examples - n_neutral)/4/n_neutral

for idx, sentiment in enumerate(sentiments):
    message = filtered[idx]
    if len(message) == 0:
        # skip this message because it has length zero
        continue
    elif sentiment != 2 or random.random() < keep_prob:
        balanced['messages'].append(message)
        balanced['sentiments'].append(sentiment)
        
## check the result

n_neutral = sum(1 for each in balanced['sentiments'] if each == 2)
N_examples = len(balanced['sentiments'])
n_neutral/N_examples

0.19937528069962393

In [11]:
## convert tokens into integer ids used in network
token_ids = [[vocab[word] for word in message] for message in balanced['messages']]
sentiments = balanced['sentiments']

In [12]:
## Prepare the training data ans test data
## Data Loader

def dataloader(messages, labels, sequence_length, batch_size, shuffle=False):
    # shulffe the data
    if shuffle:
        indices = list(range(len(messages)))
        random.shuffle(indices)
        messages = [messages[idx] for idx in indices]
        sentiments = [messages[idx] for idx in indices]
    # get total numbers of messages
    total_sequences = len(messages)
    
    for ii in range(0,total_sequences, batch_size):
        batch_messages = messages[ii:ii+batch_size] # output:[[idx1,idx2,idx3],[idx4],[idx5,idx6],...]
        # all 0 
        batch = torch.zeros((sequence_length, len(batch_messages)), dtype=torch.int64)
        for batch_num, tokens in enumerate(batch_messages):
            token_tensor = torch.tensor(tokens)
            # LEFT PAD!
            start_idx = max(sequence_length - len(token_tensor),0)
            batch[start_idx:,batch_num] = token_tensor[:sequence_length] # replace with column
        label_tensor = torch.tensor(labels[ii:ii+len(batch_messages)])
        yield batch, label_tensor

In [13]:
## training data and validation set
valid_split = int(0.9 * len(token_ids))

train_features = token_ids[:valid_split]
valid_features = token_ids[valid_split:]
train_labels = sentiments[:valid_split]
valid_labels = sentiments[valid_split:]

In [14]:
## view text_batch and labels
text_batch, labels = next(iter(dataloader(train_features, 
                                          train_labels, 
                                          sequence_length=40, 
                                          batch_size = 512)))
print(text_batch)
print(labels)

tensor([[   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        ...,
        [2365, 3082,  488,  ..., 6160, 6509, 1555],
        [2558, 5952, 3137,  ..., 3232, 5848, 5324],
        [5767, 2365,  748,  ..., 3005, 4889, 5777]])
tensor([4, 4, 3, 3, 0, 3, 4, 0, 4, 0, 4, 4, 4, 3, 4, 4, 3, 3, 3, 0, 1, 3, 3, 0,
        4, 4, 0, 3, 2, 1, 3, 3, 4, 3, 4, 3, 3, 2, 3, 3, 3, 3, 1, 4, 3, 4, 0, 3,
        2, 0, 1, 4, 1, 3, 2, 4, 1, 0, 4, 4, 2, 4, 2, 3, 4, 0, 2, 0, 0, 3, 1, 1,
        2, 3, 0, 2, 0, 4, 4, 2, 2, 3, 3, 4, 0, 4, 3, 3, 2, 4, 2, 4, 1, 1, 4, 3,
        2, 2, 4, 1, 2, 0, 3, 0, 4, 2, 4, 3, 2, 3, 1, 4, 1, 2, 2, 3, 2, 0, 0, 2,
        2, 0, 1, 3, 2, 3, 0, 4, 2, 4, 4, 1, 0, 2, 1, 3, 4, 1, 4, 3, 0, 3, 1, 2,
        3, 2, 2, 3, 2, 2, 1, 4, 1, 1, 3, 0, 3, 3, 1, 4, 3, 0, 3, 1, 3, 3, 3, 4,
        2, 2, 0, 1, 2, 0, 0, 0, 0, 3, 0, 1, 3, 4, 3, 2, 3, 1, 1, 4, 2, 3, 3, 3,
        1, 1, 2, 1, 0, 3, 4, 2, 4,

In [47]:
# LSTM 

In [48]:
# Implement the classifier

In [15]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_size, 
                 lstm_size, output_size, lstm_layers = 1, dropout = 0.1):
        """
        Initialize the model by setting up the layers.
        
        Parameters
        ----------
            vocab_size : The vocabulary size.
            embed_size : The embedding layer size.
            lstm_size : The LSTM layer size.
            output_size : The output size.
            lstm_layers : The number of LSTM layers.
            dropout : The dropout probability.

        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.lstm_size = lstm_size
        self.output_size = output_size
        self.lstm_layers = lstm_layers
        self.dropout = dropout
        
        # Embedding layer
        self.embedding = nn.Embedding(num_embeddings = self.vocab_size,
                                     embedding_dim = self.embed_size
                                     )
        # LSTM 
        self.lstm = nn.LSTM(input_size = self.embed_size,
                            hidden_size = self.lstm_size,
                            num_layers = self.lstm_layers,
                            batch_first = False,
                            dropout = self.dropout
                           )
        # Dropout
        self.dropout = nn.Dropout(p=0.3)
        
        # Fully Connected Layer
        self.fc = nn.Linear(in_features = self.lstm_size,
                            out_features = self.output_size)
        # 
        self.log_smax = nn.LogSoftmax(dim = 1)
    
    def init_hidden(self, batch_size):
        """ 
        Initializes hidden state
        
        Parameters
        ----------
            batch_size : The size of batches.
        
        Returns
        -------
            hidden_state
            
        """
        
        # TODO Implement 
        
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        #train_on_gpu = False
        
        if False: #torch.cuda.isavailable():
            hidden = (weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_().cuda(),
                      weight.new(self.lstm_layers, batch_size, self.lstm_size).zero_().cuda())
        else:
            hidden = (weight.new(self.lstm_layers,batch_size, self.lstm_size).zero_(),
                      weight.new(self.lstm_layers,batch_size, self.lstm_size).zero_())
        
        return hidden
    
    def forward(self, nn_input, hidden_state):
        """
        Perform a forward pass of our model on nn_input.
        
        Parameters
        ----------
            nn_input : The batch of input to the NN.
            hidden_state : The LSTM hidden state.

        Returns
        -------
            logps: log softmax output
            hidden_state: The new hidden state.
        """
        embed = self.embedding(nn_input)
        lstm_out, hidden_state = self.lstm(embed, hidden_state)
        lstm_out = lstm_out[-1]
        
        logps = self.log_smax(self.dropout(self.fc(lstm_out)))
        return logps, hidden_state
    

In [16]:
## Model View Test
model = TextClassifier(len(vocab), 10,6,5, dropout = 0.1, lstm_layers = 2)
model.embedding.weight.data.uniform_(-1,1)
input = torch.randint(0,1000,(5,4), dtype = torch.int64)

hidden = model.init_hidden(4)

logps,_ = model.forward(input, hidden)
print(logps)

tensor([[-1.4411, -1.7025, -2.1722, -2.0416, -1.0866],
        [-1.7919, -1.5801, -2.0999, -1.9775, -1.0037],
        [-1.9319, -1.6593, -1.4829, -2.1431, -1.1376],
        [-1.8996, -1.6231, -2.2218, -1.4598, -1.1635]],
       grad_fn=<LogSoftmaxBackward>)


In [51]:
# Training

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextClassifier(len(vocab)+1, 1024, 512, 5, lstm_layers = 2, dropout = 0.2)
model.embedding.weight.data.uniform_(-1,1)
model.to(device)

TextClassifier(
  (embedding): Embedding(7035, 1024)
  (lstm): LSTM(1024, 512, num_layers=2, dropout=0.2)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=5, bias=True)
  (log_smax): LogSoftmax()
)

In [18]:
"""
Train your model with dropout. Make sure to clip your gradients.
Print the training loss, validation loss, and validation accuracy for every 100 steps.
"""

epochs = 3
batch_size = 512
sequence_length = 40
learning_rate = 0.003
clip = 5
best_val_acc = 0

print_every = 100
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
model.train()

for epoch in range(epochs):
    print('Starting epoch {}'.format(epoch + 1))
    
    steps = 0
    hidden = model.init_hidden(batch_size)
    
    for text_batch, labels in dataloader(
            train_features, train_labels, batch_size=batch_size, sequence_length=sequence_length, shuffle=True):
        if text_batch.size()!=torch.Size([sequence_length,batch_size]):
            continue
        steps += 1
        hidden = tuple([each.data for each in hidden])
        
        # Set Device
        text_batch, labels = text_batch.to(device), labels.to(device)
        for each in hidden:
            each.to(device)
        
        # TODO Implement: Train Model----------------------------------------------------------------*
        model.zero_grad()
        log_ps, hidden = model.forward(text_batch, hidden)
        loss = criterion(log_ps, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if steps % print_every == 0:
            model.eval()
            val_losses = []
            val_accuracy = []
            val_hidden = model.init_hidden(batch_size)
            
            for val_text_batch, val_labels in dataloader(
            valid_features, valid_labels, batch_size = batch_size, sequence_length = sequence_length):
                if val_text_batch.size() != torch.Size([sequence_length, batch_size]):
                    continue
                val_text_batch, val_labels = val_text_batch.to(device), val_labels.to(device)
                val_hidden = tuple([each.data for each in val_hidden])
                for each in val_hidden:
                    each.to(device)
                val_log_ps, hidden = model.forward(val_text_batch, val_hidden)
                
                val_loss = criterion(val_log_ps.squeeze(), val_labels)
                val_losses.append(val_loss.item())
                
                val_ps = torch.exp(val_log_ps)
                top_p, top_class = val_ps.topk(1, dim=1)
                equals = top_class == val_labels.view(*top_class.shape)
                val_accuracy.append(torch.mean(equals.type(torch.FloatTensor)).item())
                
            # TODO Implement: Print metrics---------------------------------------------------------------*
            model.train()
            this_val_acc = sum(val_accuracy)/len(val_accuracy)
            
            print("Epoch:{}/{}...".format(epoch+1, epochs),
                  "Step:{}...".format(steps),
                  "Loss:{:.6f}...".format(loss.item()),
                  "Val Loss:{:.6f}...".format(sum(val_losses)/len(val_losses)),
                  "Val Accuracy:{:.6f}...".format(this_val_acc))
            if this_val_acc > best_val_acc:
                torch.save({
                'epoch': epoch,
                'step':steps,
                #'model_state_dict': model.state_dic(),
                #'optimizer_state_dict': optimizer.stat_dic(),
                'loss': loss,
                }, 'best_model')
                best_val_acc = this_val_acc
                print("New best accuracy - model saved")

Starting epoch 1
Epoch:1/3... Step:100... Loss:1.570675... Val Loss:1.594005... Val Accuracy:0.272268...
New best accuracy - model saved
Epoch:1/3... Step:200... Loss:1.593549... Val Loss:1.592026... Val Accuracy:0.272403...
New best accuracy - model saved
Epoch:1/3... Step:300... Loss:1.573412... Val Loss:1.594591... Val Accuracy:0.272403...
Epoch:1/3... Step:400... Loss:1.553703... Val Loss:1.592939... Val Accuracy:0.272403...
Epoch:1/3... Step:500... Loss:1.572526... Val Loss:1.592968... Val Accuracy:0.272403...
Epoch:1/3... Step:600... Loss:1.582663... Val Loss:1.597873... Val Accuracy:0.272403...
Epoch:1/3... Step:700... Loss:1.593295... Val Loss:1.593351... Val Accuracy:0.272403...
Epoch:1/3... Step:800... Loss:1.563199... Val Loss:1.594091... Val Accuracy:0.272403...
Epoch:1/3... Step:900... Loss:1.516518... Val Loss:1.599866... Val Accuracy:0.272403...
Epoch:1/3... Step:1000... Loss:1.570485... Val Loss:1.594632... Val Accuracy:0.272403...
Epoch:1/3... Step:1100... Loss:1.58950

In [31]:
torch.save(model, 'model.pkl')

  "type " + obj.__name__ + ". It won't be checked "


In [None]:
# Predictions

In [25]:
def predict(text, model, vocab):
    """ 
    Make a prediction on a single sentence.

    Parameters
    ----------
        text : The string to make a prediction on.
        model : The model to use for making the prediction.
        vocab : Dictionary for word to word ids. The key is the word and the value is the word id.

    Returns
    -------
        pred : Prediction vector
    """    
    
    # TODO Implement
    
    tokens = preprocess(text)
    
    # Filter non-vocab words
    tokens = [word for word in tokens if word in filtered_words]
    
    # Convert words to ids
    tokens = [vocab[word] for word in tokens]
        
    # Adding a batch dimension
    text_input = torch.tensor(tokens).unsqueeze(1)

    # Get the NN output
    hidden = model.init_hidden(text_input.size(1))
    logps, _ = model.forward(text_input,hidden)
    
    # Take the exponent of the NN output to get a range of 0 to 1 for each label.
    pred = torch.exp(logps)
    
    return pred.detach().numpy()

In [26]:
## Use one message as test

text = "Google is working on self driving cars, I'm bullish on $goog"
model.eval()
model.to("cpu")
predict(text, model, vocab)

array([[0.1770924 , 0.18436691, 0.20150499, 0.24226455, 0.19477111]],
      dtype=float32)

In [None]:
# Testing

In [27]:
with open('test_twits.json') as f:
    test_data = json.load(f)
    


In [28]:
## Twit Stream
def twit_stream():
    for twit in test_data['data']:
        yield twit

next(twit_stream())

{'message_body': '$JWN has moved -1.69% on 10-31. Check out the movement and peers at  https://dividendbot.com?s=JWN',
 'timestamp': '2018-11-01T00:00:05Z'}

In [29]:
def score_twits(stream, model, vocab, universe):
    """ 
    Given a stream of twits and a universe of tickers, return sentiment scores for tickers in the universe.
    """
    for twit in stream:
        text = twit['message_body']
        symbols = re.findall(r"\$[A-Z]{2,4}", text)
        score = predict(text, model, vocab)
        
        for symbol in symbols:
            if symbol in universe:
                yield {'symbol': symbol, 'score': score, 'timestamp': twit['timestamp']}

In [30]:
universe = {'$BBRY', '$AAPL', '$AMZN', '$BABA', '$YHOO', '$LQMT', '$FB', '$GOOG', '$BBBY', '$JNUG', '$SBUX', '$MU'}
score_stream = score_twits(twit_stream(), model, vocab, universe)

next(score_stream)

{'score': array([[0.17843668, 0.18518604, 0.20136043, 0.24037914, 0.19463772]],
       dtype=float32), 'symbol': '$AAPL', 'timestamp': '2018-11-01T00:00:18Z'}