In [1]:
import pandas as pd
import numpy as np
import os
import re
from glob import glob
from ast import literal_eval
from sklearn.model_selection import train_test_split

import tensorflow as tf

import transformers
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification, DistilBertConfig

In [2]:
# Detect the available hardware and select the matching tf.distribute strategy.
# After this cell `strategy` is always defined; `tpu` is the cluster resolver
# when one was found and None otherwise.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    # Imported here rather than at the top because it is only needed on the TPU
    # path (presumably the package only exists inside Kaggle kernels -- confirm).
    from kaggle_secrets import UserSecretsClient
    # Hand the notebook's Google Cloud credential over to TensorFlow.
    user_secrets = UserSecretsClient()
    user_credential = user_secrets.get_gcloud_credential()
    user_secrets.set_tensorflow_credential(user_credential)
    print("REPLICAS: ", strategy.num_replicas_in_sync)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

In [3]:
# CONFIG VARIABLES
# Everything a reader might want to tune lives in this cell.

# Passed to Dataset.prefetch() when the input pipelines are built.
AUTO = tf.data.experimental.AUTOTUNE
# Checkpoint name used for both the tokenizer and the classification model.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
LOADING = True # True if loading from storage, False if generating variables from scratch
# Batch sizes are per-replica values multiplied by the number of replicas
# reported by the distribution strategy.
TRAIN_BATCH_SIZE = 16 * strategy.num_replicas_in_sync
TEST_BATCH_SIZE = 160 * strategy.num_replicas_in_sync
# Every message is padded or truncated to this many tokens by tokenize().
MAX_TOKEN_LENGTH = 160
# Number of passes over the training set in model.fit().
EPOCHS = 2
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 3e-5

In [4]:
# Load the message corpus, either from the cached parquet file (LOADING=True)
# or by rebuilding it from the raw CSV exports (LOADING=False).
#
# The same path is used for reading and writing the cache: previously the
# rebuild branch wrote "all_messages" while the load branch read
# "all_messages.parquet", so a freshly built cache was never found.
MESSAGES_CACHE = "all_messages.parquet"

if LOADING:
    messages = pd.read_parquet(MESSAGES_CACHE)
else:
    WORKING_DIR = '/kaggle/input/'
    ticker_dir = WORKING_DIR + 'short-financial-messages/data/'
    PATH = ticker_dir
    EXT = "*.csv"

    # Every CSV file in PATH or in any directory below it.
    all_csv_files = [file
                     for path, subdir, files in os.walk(PATH)
                     for file in glob(os.path.join(path, EXT))]
    if not all_csv_files:
        # pd.concat would otherwise fail with a much less helpful message.
        raise FileNotFoundError(f"No {EXT} files found under {PATH}")

    def parse_csv(file):
        """Read one CSV export, parsing the `created_at` column as datetimes."""
        return pd.read_csv(file, parse_dates=['created_at'])

    messages = pd.concat((parse_csv(f) for f in all_csv_files), ignore_index=True, sort=False)

    # Index by message id (as a string) and keep the first row for each id.
    messages = messages.set_index('id')
    messages.index = messages.index.map(str)
    messages = messages[~messages.index.duplicated(keep='first')]

    def filter_urls(text):
        """Return `text` as a string with every `http...` run removed."""
        # str() also turns missing values into text (e.g. NaN -> "nan").
        return re.sub(r"http\S+", "", str(text))

    messages['body'] = messages['body'].apply(filter_urls)

    # Fold sentiment -1 into 0; all other values are left unchanged.
    messages["sentiment"] = messages["sentiment"].replace({-1: 0})
    messages.to_parquet(MESSAGES_CACHE)

In [5]:
# Add an `is_spam` column filled with the sentinel -69 ("no spam label yet").
# `dtype=int` is what the removed `np.int` alias stood for (the alias raises
# AttributeError on NumPy >= 1.24).
messages["is_spam"] = np.full(len(messages), -69, dtype=int)

# Keep only rows whose sentiment is not the -69 sentinel (presumably the marker
# for messages without a sentiment label -- confirm against the data source).
# The explicit copy makes `labeled` independent of `messages`, so the later
# `labeled.loc[...] = ...` assignments do not write into a filtered slice.
labeled = messages[messages['sentiment'] != -69].copy()

# Untouched copy used later to store the model's predictions.
labeled_alt = labeled.copy()

In [6]:
# Phrases whose presence in a message body marks the message as spam.
# All entries are lower-case.
# Fixes: "options alert" and "alerts triggered" were fused into a single entry
# by a missing comma; the duplicate "thetradexchange" entry was dropped.
spam_words = [
    "smartoptions®",
    "technical alerts",
    ": available to subscribers",
    "evolution trading",
    "trade alerts",
    "trading community",
    "trading alerts",
    "sweepcast.com",
    "optionpros",
    "freedomstocks.ca",
    "thetradexchange",
    "capotrades",
    "pineapplestocks.com",
    "alert triggered",
    "xtradesb",
    "option-alerts.com",
    "options alert",
    "alerts triggered",
    "assetdash.com",
    "beststocksnowapp.com",
    "drstoxx.com",
    "echelon-1.com",
    "wallstjesus.com",
    "trendspider.com",
    "gainers watchlist",
    "freedom stocks",
    "#optionstradingpulse",
    "vwapindicator",
    "on notifications",
    "trade ideas",
    "(delayed)",
    'follow for',
    "📈🚀 symbol:",
    "delayed]",
    "today&#39;s biggest market cap",
]

# Message ids (as strings, matching the string index of the message frame)
# that were labelled as spam by hand.
spam_indices = """
    189934349
    142590793
    185792536
    182362237
    226578494
    174519289
    240723002
    242183678
    248681269
    245656196
    243413941
    239273922
    230980738
    255520798
    158019671
    252711617
    252527668
    247522334
    251021498
    207262771
""".split()

In [9]:
# Message ids (as strings) that are force-labelled as NOT spam when the
# training set is assembled.
# The ids "210916827" and "86743375" were each listed twice. Depending on the
# pandas version, duplicates can survive Index.union and would then select the
# same row twice, so every id now appears exactly once.
false_negatives = [
    "210916827",
    "86743375",
    "216738976",
    "236216134",
    "203164333",
    "180138622",
    "206200249",
    "127735161",
    "218513852",
    "211814549",
    "215246245",
    "251010890",
    "207338547",
    "233435151",
    "240829277",
    "220170011",
    "136139256",
    "219269972",
    "231359105",
    "166400184",
    "246096363",
    "136017785",
    "222582653",
    "247547045",
    "210906734",
    "247247993",
    "201056424",
    "256665740",
    "114878188",
    "241643844",
    "192309512",
    "173490639",
    "173353164",
]

In [10]:
# Tokenizer matching the pre-trained checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Collect every ticker symbol attached to a labelled message so each can be
# registered as a token of its own. Rows with no symbols (None or empty) are
# skipped.
vocab_set = {
    symbol
    for symbols in labeled['symbols']
    if symbols is not None and len(symbols) > 0
    for symbol in symbols
}

# Add the symbols in sorted order. Iteration order of a set of strings differs
# between Python processes, which previously made the ids of the added tokens
# (and therefore the resized embedding rows) non-reproducible across runs.
# Assumes the symbols are plain strings -- TODO confirm.
tokenizer.add_tokens(sorted(vocab_set))
    
def tokenize(input_strings):
    """Encode a batch of strings with the module-level `tokenizer`.

    Every sequence is padded or truncated to exactly MAX_TOKEN_LENGTH tokens
    and the encoding is requested as TensorFlow tensors.

    Args:
        input_strings: the texts to encode, passed straight to
            `tokenizer.batch_encode_plus`.

    Returns:
        Whatever `tokenizer.batch_encode_plus` returns for these options.
    """
    encode_options = {
        "max_length": MAX_TOKEN_LENGTH,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": 'tf',
    }
    return tokenizer.batch_encode_plus(input_strings, **encode_options)

In [11]:
## It seems that FIQA and FPB data serves as good enough for nonspams, not going to pick out 1000 examples by hand

# Work on an explicit copy so the label assignments below never write into a
# filtered slice of another frame.
labeled = labeled.copy()

# Non-spam examples: the first 4242 rows (per the note above, presumably the
# FIQA/FPB rows -- this relies on row order, confirm) plus the hand-picked ids.
# .unique() guards against duplicate labels selecting the same row twice.
good_indices = labeled.iloc[:4242].index.union(pd.Index(false_negatives)).unique()
labeled.loc[good_indices, "is_spam"] = 0

# Spam examples: bodies containing any spam phrase, plus the hand-picked ids.
# Each phrase is escaped so it is matched literally; unescaped, "(delayed)"
# matched any "delayed" and every "." matched any character.
# NOTE(review): matching is case-sensitive while all phrases are lower-case --
# confirm whether case=False was intended.
spam_pattern = '|'.join(re.escape(word) for word in spam_words)
spams = labeled["body"].str.contains(spam_pattern, regex=True, na=False)
bad_indices = spams[spams].index.union(pd.Index(spam_indices)).unique()
# NOTE(review): an id present in both sets ends up labelled 1 because this
# assignment runs second -- confirm that is intended for `false_negatives`.
labeled.loc[bad_indices, "is_spam"] = 1

all_indices = good_indices.union(bad_indices).unique()

# Training corpus: every row that received an explicit 0/1 label.
dataset = labeled.loc[all_indices]

In [12]:
# Fixed seed so the train/test/validation membership is identical on every
# run; without it each execution produced a different split.
SPLIT_SEED = 42

# First split: training set vs. hold-out (scikit-learn's default test_size).
x_train, x_holdout, y_train, y_holdout = train_test_split(
    dataset, dataset["is_spam"], shuffle=True, random_state=SPLIT_SEED
)
# Second split: divide the hold-out into the test set and the validation set.
x_test, x_val, y_test, y_val = train_test_split(
    x_holdout, y_holdout, shuffle=True, random_state=SPLIT_SEED
)

# Only the message body is fed to the model.
x_train_tokens = tokenize(x_train['body'].tolist())
x_test_tokens = tokenize(x_test['body'].tolist())
x_val_tokens = tokenize(x_val['body'].tolist())

In [13]:
# Build and compile the classifier inside the distribution strategy's scope.
with strategy.scope():
    # Two output labels: 0 = not spam, 1 = spam (the values written to `is_spam`).
    config = DistilBertConfig(num_labels=2, return_dict=True)
    model = TFDistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
    # The tokenizer was extended with extra tokens, so the embedding matrix
    # must be resized to the tokenizer's current vocabulary size.
    model.resize_token_embeddings(len(tokenizer))
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, epsilon=1e-08)
    # from_logits=True: the loss is told the model outputs are raw logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'classifier', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use i

In [14]:
def _labelled_slices(tokens, labels):
    """Pair each tokenized example (as a dict of tensors) with its label."""
    return tf.data.Dataset.from_tensor_slices((dict(tokens), labels))


# Training batches, prefetched.
train_dataset = _labelled_slices(x_train_tokens, y_train).batch(TRAIN_BATCH_SIZE).prefetch(AUTO)

# Validation batches are cached as well as prefetched.
val_dataset = _labelled_slices(x_val_tokens, y_val).batch(TEST_BATCH_SIZE).cache().prefetch(AUTO)

# Test batches are only batched.
test_dataset = _labelled_slices(x_test_tokens, y_test).batch(TEST_BATCH_SIZE)

In [None]:
# Fine-tune for EPOCHS passes over the training batches, validating on
# val_dataset; the value returned by fit() is kept in train_history.
train_history = model.fit(train_dataset, epochs=EPOCHS, validation_data=val_dataset)

In [None]:
# Evaluate the compiled loss and accuracy metric on the held-out test batches.
model.evaluate(test_dataset)

In [16]:
# Raw model outputs for the validation batches (shown as the cell's output).
model.predict(val_dataset)

array([[ 3.80906016e-02,  1.65916840e-03],
       [ 3.17729637e-03,  2.32099090e-02],
       [ 3.44521925e-03, -1.41107496e-02],
       [ 4.93055061e-02,  2.95408592e-02],
       [ 2.36942209e-02,  2.78213453e-02],
       [ 4.82579805e-02,  3.25214900e-02],
       [ 4.20689844e-02,  1.42855272e-02],
       [ 8.91697779e-03,  1.65469758e-02],
       [ 3.31422277e-02,  9.20446182e-04],
       [ 6.82879612e-03,  1.15887262e-02],
       [ 3.54890004e-02,  4.45995815e-02],
       [-3.84572893e-04,  2.44105235e-02],
       [ 7.34997094e-02,  5.05116768e-02],
       [ 5.54336309e-02,  1.61918495e-02],
       [ 2.19659507e-02,  3.94760668e-02],
       [ 6.01617657e-02,  3.26231644e-02],
       [ 5.02718538e-02,  1.43160205e-02],
       [ 1.88344792e-02,  3.89322713e-02],
       [ 1.41992383e-02,  2.07294151e-02],
       [ 2.85372436e-02,  4.12547290e-02],
       [ 3.31439078e-02,  8.94204155e-03],
       [ 4.67274338e-02,  1.80383697e-02],
       [ 2.15153508e-02,  4.29519126e-03],
       [ 6.

In [None]:
# Classify every labelled message with the trained model and persist the
# predictions in `labeled_alt`.
all_tokens = tokenize(labeled_alt['body'].tolist())

# Run the model over TEST_BATCH_SIZE rows at a time instead of pushing the
# whole corpus through a single forward pass, which keeps memory bounded.
token_tensors = dict(all_tokens)
batch_outputs = [
    model({name: tensor[start:start + TEST_BATCH_SIZE] for name, tensor in token_tensors.items()})
    for start in range(0, len(labeled_alt), TEST_BATCH_SIZE)
]

try:
    # Predicted class = index of the larger logit for each row.
    if batch_outputs:
        all_results = np.concatenate([np.argmax(out.logits, axis=1) for out in batch_outputs])
    else:
        all_results = np.zeros(0, dtype=np.int64)
    labeled_alt["is_spam"] = all_results
    labeled_alt.to_parquet("spam_be_gone.parquet")
    print("yay")
except Exception as err:
    # Still best-effort (later cells keep running), but the failure is now
    # reported instead of being swallowed by a bare `except:`.
    print(f"noo: {err!r}")

In [None]:
# Persist the fine-tuned model together with its tokenizer. The tokenizer was
# extended with extra tokens and the model's embeddings were resized to match,
# so the saved weights are only usable with this exact tokenizer.
model.save_pretrained("7k_examples_model")
tokenizer.save_pretrained("7k_examples_model")

In [None]:
## Nuclear Option:

## Train spam filter on the entire messages dataset, rather than just labelled.