In [1]:
import pandas as pd
import numpy as np
import os
import re
from glob import glob
from ast import literal_eval
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertForSequenceClassification, DistilBertConfig, DistilBertTokenizerFast, AdamW, get_linear_schedule_with_warmup

In [2]:
# CONFIG VARIABLES
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
LOADING = True # True if loading from storage, False if generating variables from scratch
BATCH_SIZE = 56
EPOCHS = 2
LEARNING_RATE = 3e-5
WORKING_DIR = './' #

In [3]:
if LOADING:
    messages = pd.read_parquet(WORKING_DIR + "all_messages.parquet")
else:
    ticker_dir = WORKING_DIR + 'short-financial-messages/data/stocktwits'
    PATH = ticker_dir
    EXT = "*.csv"

    all_csv_files = [file
                     for path, subdir, files in os.walk(PATH)
                     for file in glob(os.path.join(path, EXT))]

    parse_csv = lambda file: pd.read_csv(file, parse_dates=['created_at'])

    messages = pd.concat((parse_csv(f) for f in all_csv_files), ignore_index=True, sort=False)

    messages.set_index('id', inplace=True)
    messages.index = messages.index.map(str)
    messages = messages[~messages.index.duplicated(keep='first')]

    filter_urls = lambda text: re.sub(r"http\S+", "", str(text))
    messages['body'] = messages['body'].apply(filter_urls)

    messages["sentiment"] = messages["sentiment"].replace({-1: 0})
    messages.to_parquet("all_messages")

In [4]:
labeled = messages[messages['sentiment'] != -69]
SAMPLE_SIZE = int(len(messages[messages['sentiment'] == 0]) * 0.8)

bull_indices = labeled[labeled['sentiment'] == 1].index
random_bull_indices = np.random.choice(bull_indices, SAMPLE_SIZE, replace=False)
bull_sample = labeled.loc[random_bull_indices]

bear_indices = labeled[labeled['sentiment'] == 0].index
random_bear_indices = np.random.choice(bear_indices, SAMPLE_SIZE, replace=True)
bear_sample = labeled.loc[random_bear_indices]

labeled_training = pd.concat([bull_sample, bear_sample])
labeled_test = labeled.drop(random_bull_indices).drop(random_bear_indices)
labeled_test, labeled_val = train_test_split(labeled_test, test_size=0.2)

1    418621
0    158565
Name: sentiment, dtype: int64

In [6]:
tokenizer = DistilBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)
vocab_set = set()
for symbols in labeled['symbols']:
    if symbols is not None and len(symbols) > 0:
        for w in symbols:
            vocab_set.add(w)
tokenizer.add_tokens(list(vocab_set))
    
def tokenize(input_strings):
    return tokenizer(
        input_strings, 
        max_length=MAX_TOKEN_LENGTH, 
        padding="max_length",
        return_tensors='pt', 
        truncation=True)

In [7]:
class StockTwitsDataset(Dataset):
    MAX_TOKEN_LENGTH = 160

    def __init__(self, messages, sentiments):
        self.encodings = tokenize(messages.tolist())
        self.sentiments = sentiments
    
    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        sentiment = self.sentiments.iloc[idx]
        encoding = self.encodings[idx]

        res = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'sentiments': sentiment
        }

        return res

In [9]:
train_set = StockTwitsDataset(labeled_training['body'], labeled_training['sentiment'])
test_set = StockTwitsDataset(labeled_test['body'], labeled_test['sentiment'])
val_set = StockTwitsDataset(labeled_val['body'], labeled_val['sentiment'])

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

TypeError: __init__() takes 3 positional arguments but 4 were given

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

config = DistilBertConfig(num_labels=2, return_dict=True)
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
model.resize_token_embeddings(len(tokenizer))
model.to(device)

optim = AdamW(model.parameters(), lr=3e-5)
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optim,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

In [None]:
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    total_train_loss = 0
    batches_trained = 0
    
    model = model.train()
    for batch in train_loader:        
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['sentiments'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        train_loss = outputs.loss
        total_train_loss += train_loss
        batches_trained += 1
        train_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optim.step()
        scheduler.step()
    print(f'Train loss: {total_train_loss / batches_trained}')

    total_val_loss = 0
    batches_valed = 0
        
    model = model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['sentiments'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss = outputs.loss
            total_val_loss += val_loss
            batches_valed += 1
    print(f'Validation loss: {total_val_loss / batches_valed}')
            
    print("")

In [None]:
model.save_pretrained("sentiment_model")

In [None]:
model.evaluate(test_dataset, verbose=1)