In [1]:
import pandas as pd
import numpy as np
import os
import re
from glob import glob
from ast import literal_eval
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertForSequenceClassification, DistilBertConfig, DistilBertTokenizerFast, AdamW, get_linear_schedule_with_warmup

In [14]:
# CONFIG VARIABLES
# Everything a reader is expected to tune lives in this cell.

# Hugging Face checkpoint name used for both the tokenizer and the classifier.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
# True: read the prepared message table from parquet.
# False: rebuild it from the raw CSV files under WORKING_DIR.
LOADING = True
# Number of examples per batch for the train / test / validation DataLoaders.
BATCH_SIZE = 56
# Every message is padded or truncated to exactly this many tokens.
MAX_TOKEN_LENGTH = 160
# Number of full passes over the training set.
EPOCHS = 2
# Learning rate intended for the optimizer.
LEARNING_RATE = 3e-5
# Directory prefix (note the trailing slash) for the parquet file and the raw 'stocktwits' CSV tree.
WORKING_DIR = '../data/'

In [4]:
# Single on-disk location of the prepared message table. The same path is used
# for reading and writing so that a from-scratch run (LOADING = False) produces
# exactly the file that a later LOADING = True run expects.
MESSAGES_PARQUET = WORKING_DIR + "all_messages.parquet"

if LOADING:
    messages = pd.read_parquet(MESSAGES_PARQUET)
else:
    ticker_dir = WORKING_DIR + 'stocktwits'
    PATH = ticker_dir
    EXT = "*.csv"

    # Collect every CSV file found anywhere below PATH.
    all_csv_files = [file
                     for path, subdir, files in os.walk(PATH)
                     for file in glob(os.path.join(path, EXT))]

    def parse_csv(file):
        """Read one raw CSV file, parsing 'created_at' as a datetime column."""
        return pd.read_csv(file, parse_dates=['created_at'])

    def filter_urls(text):
        """Return `text` as a string with every 'http...' token removed."""
        return re.sub(r"http\S+", "", str(text))

    messages = pd.concat((parse_csv(f) for f in all_csv_files), ignore_index=True, sort=False)

    # Index by message id (as a string) and keep only the first occurrence of each id.
    messages = messages.set_index('id')
    messages.index = messages.index.map(str)
    messages = messages[~messages.index.duplicated(keep='first')]

    messages['body'] = messages['body'].apply(filter_urls)

    # Map sentiment -1 to 0 so the labelled values become 0/1.
    messages["sentiment"] = messages["sentiment"].replace({-1: 0})
    # NOTE(review): if the CSV 'symbols' column holds stringified lists they are
    # stored as plain strings here — confirm against the raw data.
    messages.to_parquet(MESSAGES_PARQUET)

In [5]:
# Every message starts with the sentinel -69 in "is_spam", meaning "no spam label yet".
# The builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
messages["is_spam"] = np.full(len(messages), -69, dtype=int)
# Keep only rows whose sentiment is not the -69 sentinel (presumably "no sentiment
# label" — confirm). The copy makes `labeled` an independent frame, so later
# `.loc` assignments to it are well-defined and do not warn.
labeled = messages[messages['sentiment'] != -69].copy()
labeled_alt = labeled.copy()

In [6]:
# Lower-case phrases whose presence in a message body marks the message as spam.
# Each entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python, which previously merged "options alert" and
# "alerts triggered" into a single phrase that matches nothing.
spam_words = [
    "smartoptions®",
    "technical alerts",
    ": available to subscribers",
    "evolution trading",
    "trade alerts",
    "trading community",
    "trading alerts",
    "sweepcast.com",
    "optionpros",
    "freedomstocks.ca",
    "thetradexchange",
    "capotrades",
    "pineapplestocks.com",
    "alert triggered",
    "xtradesb",
    "option-alerts.com",
    "options alert",
    "alerts triggered",
    "assetdash.com",
    "beststocksnowapp.com",
    "drstoxx.com",
    "echelon-1.com",
    "wallstjesus.com",
    "trendspider.com",
    "gainers watchlist",
    "freedom stocks",
    "#optionstradingpulse",
    "vwapindicator",
    "on notifications",
    "trade ideas",
    "(delayed)",
    'follow for',
    "📈🚀 symbol:",
    "delayed]",
    "today&#39;s biggest market cap",
]

# Message ids (as strings) that are labelled spam explicitly, in addition to the
# messages caught by the spam phrases. Presumably hand-picked — confirm.
spam_indices = [
    "189934349", "142590793", "185792536", "182362237",
    "226578494", "174519289", "240723002", "242183678",
    "248681269", "245656196", "243413941", "239273922",
    "230980738", "255520798", "158019671", "252711617",
    "252527668", "247522334", "251021498", "207262771",
]

# Message ids (as strings) that are explicitly labelled as non-spam.
# Each id appears exactly once: "86743375" and "210916827" used to be listed
# twice, and duplicated labels in an index passed to `.loc[...]` select the same
# row more than once, which would duplicate examples in the dataset.
false_negatives = [
    "210916827",
    "86743375",
    "216738976",
    "236216134",
    "203164333",
    "180138622",
    "206200249",
    "127735161",
    "218513852",
    "211814549",
    "215246245",
    "251010890",
    "207338547",
    "233435151",
    "240829277",
    "220170011",
    "136139256",
    "219269972",
    "231359105",
    "166400184",
    "246096363",
    "136017785",
    "222582653",
    "247547045",
    "210906734",
    "247247993",
    "201056424",
    "256665740",
    "114878188",
    "241643844",
    "192309512",
    "173490639",
    "173353164",
]

In [7]:
tokenizer = DistilBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Gather every distinct entry of the per-message 'symbols' collections
# (presumably ticker symbols — confirm) so each can be added as a whole token.
vocab_set = {
    symbol
    for symbols in labeled['symbols']
    if symbols is not None and len(symbols) > 0
    for symbol in symbols
}
# Sort before adding: set iteration order over strings differs between
# interpreter runs, and the order of insertion decides which id each new token
# gets. A stable order keeps token ids (and therefore the resized embedding
# rows) consistent across kernel restarts.
tokenizer.add_tokens(sorted(vocab_set))
    
def tokenize(input_strings):
    """Tokenize a batch of strings with the module-level `tokenizer`.

    Args:
        input_strings: the text(s) to encode, passed straight to the tokenizer.

    Returns:
        The tokenizer's encoding with PyTorch tensors, where every sequence is
        padded or truncated to exactly MAX_TOKEN_LENGTH tokens.
    """
    return tokenizer(
        input_strings,
        max_length=MAX_TOKEN_LENGTH,
        padding="max_length",
        return_tensors="pt",  # a stray quote after "pt" used to make this a SyntaxError
        truncation=True)

In [9]:
## It seems that FIQA and FPB data serves as good enough for nonspams, not going to pick out 1000 examples by hand
# Non-spam examples: the first 4242 rows of `labeled` (presumably the FIQA/FPB
# rows mentioned above — confirm) plus the explicitly listed non-spam ids.
good_indices = labeled.iloc[:4242].index.union(pd.Index(false_negatives))
labeled.loc[good_indices, "is_spam"] = 0

# Spam examples: any body containing one of the spam phrases, plus the listed ids.
# Each phrase is escaped so it is matched literally; unescaped, "(delayed)" is a
# capture group matching any "delayed" and "." matches any character.
# NOTE(review): matching is case-sensitive while every phrase is lower-case —
# confirm whether case=False is wanted.
spam_pattern = '|'.join(re.escape(word) for word in spam_words)
# na=False treats a missing body as "not spam" rather than propagating NaN.
spams = labeled["body"].str.contains(spam_pattern, regex=True, na=False)
bad_indices = spams[spams].index.union(pd.Index(spam_indices))
# Assigned after the non-spam labels, so a row in both groups ends up as spam.
labeled.loc[bad_indices, "is_spam"] = 1

# unique() guarantees each message is selected once even if an id was listed twice.
all_indices = good_indices.union(bad_indices).unique()

dataset = labeled.loc[all_indices]

In [54]:
class StockTwitsDataset(Dataset):
    """Torch dataset pairing pre-tokenized message bodies with their labels.

    All messages are tokenized once, up front, in the constructor; individual
    items are then served by positional index.
    """

    def __init__(self, messages, labels):
        """Tokenize `messages` (anything with `.tolist()`) and keep `labels` as given."""
        self.encodings = tokenize(messages.tolist())
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the labels container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `idx`-th example as a dict of input ids, attention mask and label."""
        # Labels are looked up by position, so they must support `.iloc`.
        label = self.labels.iloc[idx]
        encoded = self.encodings
        return {
            'input_ids': encoded["input_ids"][idx],
            'attention_mask': encoded["attention_mask"][idx],
            'labels': label,
        }

In [55]:
# Fixed seed so the train / test / validation membership is identical on every
# Restart & Run All; without it each run trains and evaluates on different rows.
SPLIT_SEED = 42

# First split: train vs. hold-out, using train_test_split's default proportions.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    dataset, dataset["is_spam"], shuffle=True, random_state=SPLIT_SEED)
# Second split: the hold-out part is divided again into test and validation.
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, shuffle=True, random_state=SPLIT_SEED)

train_set = StockTwitsDataset(x_train['body'], y_train)
test_set = StockTwitsDataset(x_test['body'], y_test)
val_set = StockTwitsDataset(x_val['body'], y_val)

In [58]:
# Only the training loader is shuffled. Evaluation loaders keep a fixed order:
# the reported loss is a mean over batches, and with a smaller final batch a
# reshuffle would change its value from run to run for the same model.
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [59]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Two output classes: 0 = not spam, 1 = spam.
config = DistilBertConfig(num_labels=2, return_dict=True)
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
# The tokenizer was extended with extra tokens, so the embedding matrix must grow to match.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Take the learning rate from the config cell instead of repeating the literal here.
optim = AdamW(model.parameters(), lr=LEARNING_RATE)
# One scheduler step is taken per training batch, so the schedule spans every batch of every epoch.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optim,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [60]:
def _batch_loss(batch):
    """Move one batch to `device`, run the model on it and return the loss tensor."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    return outputs.loss


for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # Running totals are plain Python floats. Adding the loss tensors themselves
    # would keep autograd history alive for every batch and print tensor reprs.
    total_train_loss = 0.0
    batches_trained = 0

    model.train()
    for batch in train_loader:
        optim.zero_grad()
        train_loss = _batch_loss(batch)
        train_loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optim.step()
        scheduler.step()
        total_train_loss += train_loss.item()
        batches_trained += 1
    print(f'Train loss: {total_train_loss / batches_trained}')

    total_val_loss = 0.0
    batches_valed = 0

    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            total_val_loss += _batch_loss(batch).item()
            batches_valed += 1
    print(f'Validation loss: {total_val_loss / batches_valed}')

    print("")

Epoch 1/2
----------


KeyboardInterrupt: 

In [None]:
# Save the fine-tuned weights and the extended tokenizer side by side: the
# embedding matrix was resized to the tokenizer's enlarged vocabulary, so the
# model is only usable later together with that exact tokenizer.
model.save_pretrained("model")
tokenizer.save_pretrained("model")