In [1]:
import pandas as pd
import numpy as np
import os
from glob import glob
import torch
from torch.utils.data import Dataset, DataLoader
from ast import literal_eval

from transformers import DistilBertTokenizerFast

In [2]:
# Message CSVs live under "<cwd>/data"; abspath() resolves "explore.ipynb"
# against the current working directory (the file itself is never opened).
ticker_dir = os.path.dirname(os.path.abspath("explore.ipynb")) + "/data"
PATH = ticker_dir
EXT = "*.csv"

# Walk the whole tree and collect every file matching EXT, directory by directory.
all_csv_files = [
    csv_path
    for directory, _dirnames, _filenames in os.walk(PATH)
    for csv_path in glob(os.path.join(directory, EXT))
]


def parse_csv(file):
    """Read one CSV into a DataFrame, parsing the 'created_at' column as dates."""
    return pd.read_csv(file, parse_dates=['created_at'])


# Stack all files into one frame and key it by message id.
messages = pd.concat(
    [parse_csv(csv_path) for csv_path in all_csv_files],
    ignore_index=True,
    sort=False,
).set_index('id')

# The same id can occur in more than one file; only its first occurrence is kept.
messages = messages.loc[~messages.index.duplicated(keep='first')]

In [13]:
# Recode sentiment -1 as 2 so that no remaining class label is negative
# (presumably so labels can be used directly as class indices -- confirm).
# Other values, including the -69 placeholder, are left untouched.
messages["sentiment"] = messages["sentiment"].replace(to_replace=-1, value=2)

# Show the class balance after recoding.
messages["sentiment"].value_counts()

-69    32598
 1      8136
 2      1967
 0      1509
Name: sentiment, dtype: int64

In [6]:
# Seed one generator for both random steps below so that the downsampling and
# the train/test split are reproducible after a kernel restart.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# Rows with sentiment == -69 are excluded (looks like the "no label"
# placeholder -- confirm against the data source).
labeled = messages[messages['sentiment'] != -69]

# Drop a random half of the sentiment == 1 rows to reduce class imbalance.
labeled = labeled.drop(
    labeled.query('sentiment == 1').sample(frac=.5, random_state=split_rng).index
)

# Random ~80/20 row-wise split into training and test frames.
msk = split_rng.rand(len(labeled)) < 0.8
labeled_training = labeled[msk]
labeled_test = labeled[~msk]

In [6]:
class StockTwitsDataset(Dataset):
    """Dataset pairing message texts with sentiment labels.

    Each item is tokenized on access and padded/truncated to
    ``MAX_TOKEN_LENGTH`` tokens.

    Args:
        messages: positional-indexable collection of message texts
            (accessed through ``.iloc``, e.g. a pandas Series).
        sentiments: labels aligned positionally with ``messages``
            (also accessed through ``.iloc``).
        tokenizer: tokenizer exposing ``encode_plus``; it is stored on the
            instance and used for every item.
    """

    # Every encoded sequence is padded or truncated to this many tokens.
    MAX_TOKEN_LENGTH = 160

    def __init__(self, messages, sentiments, tokenizer):
        self.messages = messages
        self.sentiments = sentiments
        # Keep the tokenizer that was passed in instead of relying on a
        # module-level global of the same name.
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of messages."""
        return len(self.messages)

    def tokenize(self, input_string):
        """Encode ``input_string`` to fixed-length PyTorch tensors.

        Returns the tokenizer's encoding; with ``return_tensors='pt'`` the
        tensors carry a leading batch dimension of size 1.
        """
        return self.tokenizer.encode_plus(
            input_string,
            max_length=self.MAX_TOKEN_LENGTH,
            # 'max_length' padding replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            return_tensors='pt',
            truncation=True)

    def __getitem__(self, idx):
        """Return the encoded message and its label at position ``idx``.

        Returns:
            dict with keys ``'message_text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (flattened 1-D tensors) and ``'sentiments'``
            (the raw label value).
        """
        message = self.messages.iloc[idx]
        sentiment = self.sentiments.iloc[idx]

        encoding = self.tokenize(message)

        res = {
            'message_text': str(message),
            # flatten() drops the batch dimension added by return_tensors='pt'
            # so the DataLoader can stack items into (batch, seq) tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentiments': sentiment
        }

        return res

In [7]:
from transformers import DistilBertTokenizerFast
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Register the entries of the 'symbols' column as extra tokens. The column
# holds stringified Python lists, so each cell is parsed with literal_eval
# (safe for literals only; it raises on malformed cells).
# Symbols are de-duplicated in first-seen order and added in ONE call instead
# of calling add_tokens once per row of the training frame.
ticker_tokens = {}
for symbols in labeled_training['symbols']:
    # Skip missing values and anything that does not look like a list literal.
    if isinstance(symbols, str) and "[" in symbols and "]" in symbols:
        ticker_tokens.update(dict.fromkeys(literal_eval(symbols)))

if ticker_tokens:
    tokenizer.add_tokens(list(ticker_tokens))

In [8]:
# Wrap each split's message bodies and sentiment labels in a dataset.
train_set, test_set = (
    StockTwitsDataset(split['body'], split['sentiment'], tokenizer)
    for split in (labeled_training, labeled_test)
)

# Both loaders serve shuffled mini-batches of 16 items.
train_loader, test_loader = (
    DataLoader(dataset, batch_size=16, shuffle=True)
    for dataset in (train_set, test_set)
)

In [9]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, DistilBertConfig

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Three-class classification head. vocab_size is intentionally NOT set here:
# the embedding matrix is grown after loading via resize_token_embeddings
# (presumably so the checkpoint's original embeddings load unchanged -- confirm).
config = DistilBertConfig(num_labels=3, return_dict=True)

model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
# Make room for the tokens that were added to the tokenizer.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(31485, 768)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

In [40]:
# The trailing comma after AdamW was removed: without parentheses it is a
# SyntaxError and the cell cannot run.
# NOTE(review): newer transformers releases deprecate this AdamW in favour of
# torch.optim.AdamW (whose defaults differ) -- confirm before upgrading.
from transformers import AdamW

# Enable training-mode behaviour (e.g. dropout).
model.train()

optim = AdamW(model.parameters(), lr=3e-5)

for epoch in range(2):
    for batch in train_loader:
        optim.zero_grad()
        # Move the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['sentiments'].to(device)
        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()

KeyboardInterrupt: 

In [31]:
# Switch to evaluation-mode behaviour (disables dropout).
model.eval()
import torch.nn.functional as F

# Smoke test: run a single hand-written sentence through the model.
# TODO: iterate over test_loader for a real evaluation (move each batch's
# 'input_ids' and 'attention_mask' to `device` in the same way).
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# The model lives on `device`; its inputs must be moved there as well.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# No gradients are needed for inference.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

    

(tensor([[ 0.1358,  0.0398, -0.0044]], grad_fn=<AddmmBackward>),)
