In [None]:
!pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 6.5 MB/s 
Collecting boto3
  Downloading boto3-1.26.26-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 56.5 MB/s 
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.30.0,>=1.29.26
  Downloading botocore-1.29.26-py3-none-any.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 71.3 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 1.4 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 5.3 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.wh

In [74]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.autograd import Variable
from torch import optim
import time  
from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig, BertAdam
from tqdm import tqdm 

In [75]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Get the Data

In [76]:
data = pd.read_csv('/content/drive/MyDrive/CS410Project/410_dataset.csv', sep=',')
data = data[data.datetime > '2019-01-01']
data = data.dropna()
data = data[(data.label == 1) | (data.label == -1)]
data.label = data.label.apply(lambda x:0 if x==-1 else 1)

In [77]:
print(data.shape)

(611764, 8)


In [78]:
data.head()

Unnamed: 0,symbol,message,datetime,user,message_id,Date,Time,label
0,AAPL,peak profit last 6 expired option alerts aapl ...,2020-07-19 09:49:35,1442893,229008387,2020-07-19,09:49:35,1
1,AAPL,aapl jul 17 382 50 calls option volume 144 44 ...,2020-07-19 09:47:26,1442893,229008357,2020-07-19,09:47:26,1
2,AAPL,tsla market true bubble territory profitable c...,2020-07-19 09:01:25,1115913,229007569,2020-07-19,09:01:25,1
3,AAPL,aapl analyzed 26 analysts buy consensus 86 ana...,2020-07-19 08:13:00,47688,229006733,2020-07-19,08:13:00,1
4,AAPL,aapl new article dogs dow august 4 adopt ignore,2020-07-19 07:54:05,1555408,229006403,2020-07-19,07:54:05,1


In [79]:
data.groupby("symbol").message.count()

symbol
AAPL     262204
AMZN     127202
FB        45864
GOOGL     23205
NFLX     153289
Name: message, dtype: int64

# Vectorize + Machine Learning Model

In [80]:
# Prepare the data

X = data.message
y = data.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=2022)

In [81]:
# Evaluate the performance

def evaluate_performance(y_pred,y_test):
  accuracy = np.mean(y_pred==y_test)
  print(f"The accuarcy is {accuracy}")
  print(classification_report(y_test,y_pred))

In [82]:
# model part

def vectorize_machine_learning(X_train, X_test, y_train, y_test,vectorize,ml):

  '''
  For vectorize model, 1 represents CountVectorizer, 2 represents TfidfVectorizer()
  For machine learning model, 1 represents logistic regression, 2 represents Naive Bayes, 3 represents Random Forest
  '''

  vectorize_model_set = {1:CountVectorizer(),2:TfidfVectorizer()}
  ml_model_set = {1:LogisticRegression(penalty="l1",solver="liblinear",C=10000),2:MultinomialNB(),
                  3:RandomForestClassifier(max_depth=5,n_estimators=200,class_weight='balanced')}
  vectorize = vectorize_model_set[vectorize]
  model = ml_model_set[ml]
  train_features = vectorize.fit_transform(X_train)
  test_features = vectorize.transform(X_test)
  model.fit(train_features,y_train)
  y_pred = model.predict(test_features)

  evaluate_performance(y_pred,y_test)

In [44]:
# CountVectorizer + LogisticRegression

vectorize_machine_learning(X_train, X_test, y_train, y_test,1,1)

The accuarcy is 0.5912352681563333
              precision    recall  f1-score   support

           0       0.54      0.34      0.42     26358
           1       0.61      0.78      0.69     34819

    accuracy                           0.59     61177
   macro avg       0.58      0.56      0.55     61177
weighted avg       0.58      0.59      0.57     61177



In [None]:
# TfidfVectorizer + Naive Bayes

vectorize_machine_learning(X_train, X_test, y_train, y_test,2,2)

The accuarcy is 0.5885545221243278
              precision    recall  f1-score   support

           0       0.58      0.16      0.25     26358
           1       0.59      0.91      0.72     34819

    accuracy                           0.59     61177
   macro avg       0.59      0.54      0.48     61177
weighted avg       0.59      0.59      0.51     61177



In [None]:
# CountVectorizer + RandomForest

vectorize_machine_learning(X_train, X_test, y_train, y_test,1,3)

The accuarcy is 0.5712277489906338
              precision    recall  f1-score   support

           0       0.50      0.40      0.44     26358
           1       0.61      0.70      0.65     34819

    accuracy                           0.57     61177
   macro avg       0.55      0.55      0.55     61177
weighted avg       0.56      0.57      0.56     61177



# Deep Learning Model - LSTM

In [83]:
# Prepare the data

train_data_lstm = pd.DataFrame(X_train)
train_data_lstm["label"] = y_train
test_data_lstm = pd.DataFrame(X_test)
test_data_lstm["label"] = y_test

In [84]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [85]:
mydict = Dictionary()
mydict.add_word("UNK")
for item in train_data_lstm.iterrows():
    line = item[1]["message"] 
    words = line.split(" ") 
    for word in words:
        word = word.strip()
        if word:
            mydict.add_word(word)

In [86]:
class Corpus(object):
    def __init__(self,length,data,corpus_dict):
        self.dictionary = corpus_dict
        self.data = data
        self.texts,self.labels = self.tokenize(length)

    def tokenize(self,length):
        token_text = []
        labels = []
        for item in self.data.iterrows():
            line = item[1]["message"] 
            labels.append(int(item[1]["label"])) 
            words = line.split(" ") 
            text = torch.LongTensor(np.zeros(length, dtype=np.int64))
            for index,word in enumerate(words[:length]):
                word = word.strip()
                if word:
                  if word in self.dictionary.word2idx:
                    text[index] = self.dictionary.word2idx[word]
                  else:
                    text[index] = self.dictionary.word2idx["UNK"]
            token_text.append(text)
        return token_text,labels

In [87]:
class MyDataset(Dataset):
    def __init__(self,length, corpus):
        corpus = corpus
        self.token_text = corpus.texts
        self.labels = corpus.labels
        self.length = length

    def __getitem__(self, index):
        text = self.token_text[index]
        label = torch.LongTensor([self.labels[index]])
        return text, label

    def __len__(self):
        return len(self.labels)

In [93]:
# construct dataset

train_corpus = Corpus(100,train_data_lstm,mydict)
train_set = MyDataset(100, train_corpus)
train_loader = DataLoader(train_set,
                          batch_size=64,
                          shuffle=True)

test_corpus = Corpus(100,test_data_lstm,mydict)
test_set = MyDataset(100, test_corpus)
test_loader = DataLoader(test_set,
                          batch_size=64,
                          shuffle=True)

In [88]:
# model part

class MyClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size,device):
        super(MyClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.device = device
        self.embedding = nn.Embedding(vocab_size, embedding_dim,device=self.device)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,num_layers=1,device=self.device)
        self.fc = nn.Linear(hidden_dim, label_size,device=self.device)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        h0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim).to(self.device))
        c0 = Variable(torch.zeros(1, self.batch_size, self.hidden_dim).to(self.device))
        return (h0, c0)

    def forward(self, sentence):
        embedding = self.embedding(sentence.to(self.device))
        embedding = embedding.to(self.device)
        x = embedding.view(len(sentence), self.batch_size, -1)
        output, _ = self.lstm(x.to(self.device), self.hidden)
        result = self.fc(output[-1])
        return result


In [89]:
embedding_dim = 100
hidden_dim = 64
vocab_size = len(train_corpus.dictionary)
label_size = 2
batch_size = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
LSTM_model = MyClassifier(embedding_dim, hidden_dim, vocab_size, label_size, batch_size, device)

In [90]:
optimizer = optim.Adam(LSTM_model.parameters(), lr=0.01)
loss_function = nn.CrossEntropyLoss()

In [91]:
print(LSTM_model)

MyClassifier(
  (embedding): Embedding(79574, 100)
  (lstm): LSTM(100, 64)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)


In [None]:
# train and test - need to run again to train model with a different dataset

train_loss = []
train_acc = []
test_loss = []
test_acc = []
for epoch in range(10):
    total_acc = 0
    total_loss = 0
    total = 0
    for iter, traindata in enumerate(train_loader):
        train_inputs, train_labels = traindata
        train_labels = torch.squeeze(train_labels)
        if train_labels.shape[0] !=64:
          continue
        train_inputs = train_inputs.to(device)
        train_labels = train_labels.to(device)

        optimizer.zero_grad()
        output = LSTM_model(train_inputs.t())
        loss = loss_function(output, Variable(train_labels))
        loss.backward()
        optimizer.step()
        _, predicted = torch.max(output.data, 1)
        total_acc = total_acc + (predicted == train_labels).sum()
        total_loss = total_loss+loss.item()
        total = total + len(train_labels)
    train_loss.append(total_loss / total)
    train_acc.append(total_acc / total)

    total_acc = 0
    total_loss = 0
    total = 0
    for iter, testdata in enumerate(test_loader):
        test_inputs, test_labels = testdata
        test_labels = torch.squeeze(test_labels)
        if test_labels.shape[0] !=64:
          continue
        test_inputs = test_inputs.to(device)
        test_labels = test_labels.to(device)

        output = LSTM_model(test_inputs.t())
        loss = loss_function(output, Variable(test_labels))
        _, predicted = torch.max(output.data, 1)
        total_acc = total_acc + (predicted == test_labels).sum()
        total_loss = total_loss+loss.item()
        total = total + len(test_labels)
    test_loss.append(total_loss / total)
    test_acc.append(total_acc / total)

    print(f"Epoch {epoch+1}:train loss:{train_loss[epoch]},train acc:{train_acc[epoch]},test loss:{test_loss[epoch]},test acc:{test_acc[epoch]}")


Epoch 1:train loss:0.010682805093454395,train acc:0.5712025761604309,test loss:0.010705553671998503,test acc:0.569126307964325
Epoch 2:train loss:0.010689118815123883,train acc:0.5712298154830933,test loss:0.010710821121306944,test acc:0.5688972473144531
Epoch 3:train loss:0.010687578676307001,train acc:0.5712952017784119,test loss:0.010697662590451889,test acc:0.5688154697418213
Epoch 4:train loss:0.010686696929390716,train acc:0.5712624788284302,test loss:0.010689954758080513,test acc:0.5690935850143433
Epoch 5:train loss:0.010687253583340424,train acc:0.5710354447364807,test loss:0.010693868696299524,test acc:0.5691426992416382
Epoch 6:train loss:0.010680538808424143,train acc:0.5708792209625244,test loss:0.010692382478082055,test acc:0.5688809156417847
Epoch 7:train loss:0.010671421227977979,train acc:0.5703688263893127,test loss:0.01069180381664743,test acc:0.5691754221916199
Epoch 8:train loss:0.01066876040105538,train acc:0.5695078372955322,test loss:0.010699975919661098,test ac

In [None]:
# save model - need to run again if model is trained with a different dataset

PATH = "lstm_model.pth"
torch.save(LSTM_model.state_dict(), PATH)

In [94]:
# load model - run with current dataset

LSTM_model = MyClassifier(embedding_dim, hidden_dim, vocab_size, label_size, batch_size, device)
LSTM_model.load_state_dict(torch.load("/content/drive/MyDrive/CS410Project/lstm_model.pth"))

total_acc = 0
total_loss = 0
total = 0
for iter, testdata in enumerate(test_loader):
    test_inputs, test_labels = testdata
    test_labels = torch.squeeze(test_labels)
    if test_labels.shape[0] !=64:
      continue
    test_inputs = test_inputs.to(device)
    test_labels = test_labels.to(device)

    output = LSTM_model(test_inputs.t())
    loss = loss_function(output, Variable(test_labels))
    _, predicted = torch.max(output.data, 1)
    total_acc = total_acc + (predicted == test_labels).sum()
    total_loss = total_loss+loss.item()
    total = total + len(test_labels)
print(f"Total Loss {total_loss / total}. Total Acc {total_acc / total}")

Total Loss 0.01070122863141654. Total Acc 0.5690281391143799


# BERT Model

In [57]:
# Prepare for the data

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
input_ids = []     
input_types = []   
input_masks = []   
label = []         
pad_size = 32      
 
for i in range(data.shape[0]): 
    x = data.iloc[i,1]
    y = data.iloc[i,7]
    x = tokenizer.tokenize(x)
    x = x[:510]
    tokens = ["[CLS]"] + x + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    types = [0] *(len(ids))
    masks = [1] * len(ids)
    if len(ids) < pad_size:
        types = types + [1] * (pad_size - len(ids))
        masks = masks + [0] * (pad_size - len(ids))
        ids = ids + [0] * (pad_size - len(ids))
    else:
        types = types[:pad_size]
        masks = masks[:pad_size]
        ids = ids[:pad_size]
    input_ids.append(ids)
    input_types.append(types)
    input_masks.append(masks)
    label.append([int(y)])


In [58]:
split_index = list(range(len(input_ids)))
np.random.seed(2022)  
np.random.shuffle(split_index)
 
input_ids_train = np.array([input_ids[i] for i in split_index[:int(len(input_ids)*0.8)]])
input_types_train = np.array([input_types[i] for i in split_index[:int(len(input_ids)*0.8)]])
input_masks_train = np.array([input_masks[i] for i in split_index[:int(len(input_ids)*0.8)]])
y_train = np.array([label[i] for i in split_index[:int(len(input_ids) * 0.8)]])
 
input_ids_test = np.array([input_ids[i] for i in split_index[int(len(input_ids)*0.8):]])
input_types_test = np.array([input_types[i] for i in split_index[int(len(input_ids)*0.8):]])
input_masks_test = np.array([input_masks[i] for i in split_index[int(len(input_ids)*0.8):]])
y_test = np.array([label[i] for i in split_index[int(len(input_ids) * 0.8):]])


In [59]:
input_ids_train = input_ids_train[:489408]
input_types_train = input_types_train[:489408]
input_masks_train = input_masks_train[:489408]
y_train = y_train[:489408]

input_ids_test = input_ids_test[:122304]
input_types_test = input_types_test[:122304]
input_masks_test = input_masks_test[:122304]
y_test = y_test[:122304]

In [60]:
batch_size = 64
train_data = TensorDataset(torch.LongTensor(input_ids_train), 
                           torch.LongTensor(input_types_train), 
                           torch.LongTensor(input_masks_train), 
                           torch.LongTensor(y_train))
train_sampler = RandomSampler(train_data)  
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
 
test_data = TensorDataset(torch.LongTensor(input_ids_test), 
                          torch.LongTensor(input_types_test), 
                         torch.LongTensor(input_masks_test),
                          torch.LongTensor(y_test))
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [61]:
# model part

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased") 
        for param in self.bert.parameters():
            param.requires_grad = True 
        self.fc = nn.Linear(768, 2)
 
    def forward(self, x):
        context = x[0]
        types = x[1]
        mask = x[2]
        _, output = self.bert(context, token_type_ids=types, 
                              attention_mask=mask, 
                              output_all_encoded_layers=False)
        result = self.fc(output)
        return result


In [62]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Model().to(DEVICE)
print(model) 

Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
 

In [63]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) 

In [64]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (x1,x2,x3, y) in enumerate(train_loader):
        x1,x2,x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)
        y_pred = model([x1, x2, x3]) 
        model.zero_grad()       
        loss = F.cross_entropy(y_pred, y.squeeze()) 
        loss.backward()
        optimizer.step()
        if(batch_idx + 1) % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.2f}%)] Loss: {:.6f}'.format(epoch, (batch_idx+1) * len(x1), 
                                                                           len(train_loader.dataset),
                                                                           100. * batch_idx / len(train_loader), 
                                                                           loss.item())) 
 
def test(model, device, test_loader): 
    model.eval()
    test_loss = 0 
    acc = 0
    for batch_idx, (x1,x2,x3, y) in enumerate(test_loader):
        x1,x2,x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)
        with torch.no_grad():
            y_ = model([x1,x2,x3])
        test_loss += F.cross_entropy(y_, y.squeeze())
        pred = y_.max(-1, keepdim=True)[1]  
        acc += pred.eq(y.view_as(pred)).sum().item()   
    test_loss /= len(test_loader)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
          test_loss, acc, len(test_loader.dataset),
          100. * acc / len(test_loader.dataset)))


In [None]:
# train and test then same the model - need to run again with a different dataset

PATH = 'bert_model.pth'
epoch = 1
train(model, DEVICE, train_loader, optimizer, epoch)
test(model, DEVICE, test_loader)
torch.save(model.state_dict(), PATH)

Test set: Average loss: 0.6671, Accuracy: 72524/122304 (59.30%)


In [65]:
# load model - run with current dataset

model = Model().to(DEVICE)
model.load_state_dict(torch.load("/content/drive/MyDrive/CS410Project/bert_model.pth"))
test(model, DEVICE, test_loader)

Test set: Average loss: 0.6671, Accuracy: 72524/122304 (59.30%)
