In [None]:
import pandas as pd
import numpy as np
import os
import re
from glob import glob
from ast import literal_eval
from sklearn.model_selection import train_test_split

import tensorflow as tf

import transformers
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification, DistilBertConfig

In [None]:
# Distribution strategy. No hardware detection is performed in this cell:
# `tpu` is simply set to None and `strategy` is whatever
# tf.distribute.get_strategy() returns for the current context.
tpu = None
strategy = tf.distribute.get_strategy()

In [None]:
# CONFIG VARIABLES
AUTO = tf.data.experimental.AUTOTUNE  # handed to .prefetch() in the tf.data pipelines
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'  # checkpoint used for both the tokenizer and the model
LOADING = True # True: read the pre-built parquet file; False: rebuild the message table from the raw CSV files
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica of the active strategy
MAX_TOKEN_LENGTH = 160  # every tokenized input is padded/truncated to this many tokens
EPOCHS = 2  # number of passes over the training set in model.fit
LEARNING_RATE = 3e-5  # learning rate passed to the AdamWeightDecay optimizer

In [None]:
if LOADING:
    # Fast path: read the pre-built message table from the Kaggle input dataset.
    messages = pd.read_parquet("/kaggle/input/short-financial-messages/all_messages.parquet")
else:
    # Slow path: rebuild the message table from the raw CSV exports.
    WORKING_DIR = '/kaggle/input/'
    ticker_dir = WORKING_DIR + 'short-financial-messages/data/'
    PATH = ticker_dir
    EXT = "*.csv"

    # Every CSV file found in PATH or any of its sub-directories.
    all_csv_files = [file
                     for path, subdir, files in os.walk(PATH)
                     for file in glob(os.path.join(path, EXT))]
    if not all_csv_files:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate" error.
        raise FileNotFoundError("No " + EXT + " files found under " + PATH)

    def parse_csv(file):
        """Read one CSV, parsing `created_at` as dates and `symbols` via literal_eval."""
        return pd.read_csv(file, parse_dates=['created_at'], converters={"symbols": literal_eval})

    messages = pd.concat((parse_csv(f) for f in all_csv_files), ignore_index=True, sort=False)

    # Index by message id (as a string) and keep only the first occurrence of
    # each id. The explicit copy keeps the column assignments below from
    # operating on a filtered slice.
    messages = messages.set_index('id')
    messages.index = messages.index.map(str)
    messages = messages[~messages.index.duplicated(keep='first')].copy()

    def filter_urls(text):
        """Return `text` as a string with every `http...` token removed."""
        return re.sub(r"http\S+", "", str(text))

    messages['body'] = messages['body'].apply(filter_urls)

    # Fold the -1 sentiment label into 0.
    messages["sentiment"] = messages["sentiment"].replace({-1: 0})
    # Written with the same file name the LOADING branch reads back.
    messages.to_parquet("all_messages.parquet")

In [None]:
# Untouched copy of the full message table, kept for the final full-corpus prediction.
messages_alt = messages.copy()
# Rows whose sentiment is not the -69 placeholder (presumably "no label" -- confirm
# against the dataset). Copied so that adding a column does not write into a
# filtered slice of `messages`.
labeled = messages[messages['sentiment'] != -69].copy()
# Initialise every row's spam label to the -69 placeholder. The builtin `int`
# replaces the `np.int` alias, which was removed in NumPy 1.24.
labeled["is_spam"] = -69 * np.ones(len(labeled), dtype=int)

In [None]:
# Lower-case phrases whose presence in a message body marks it as spam.
# Each entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python into a single (never-matching) phrase.
spam_words = [
    "smartoptions®",
    "technical alerts",
    "stop-loss: available to subscribers",
    "evolution trading",
    "trade alerts",
    "trading community",
    "trading alerts",
    "sweepcast.com",
    "optionpros",
    "freedomstocks.ca",
    "thetradexchange",
    "capotrades",
    "pineapplestocks.com",
    "alert triggered",
    "xtradesb",
    "option-alerts.com",
    "options alert",
    "alerts triggered",
    "assetdash.com",
    "beststocksnowapp.com",
    "drstoxx.com",
    "echelon-1.com",
    "wallstjesus.com",
    "trendspider.com",
    "gainers watchlist",
    "freedom stocks",
    "#optionstradingpulse",
    "vwapindicator",
    "on notifications",
    "trade ideas",
    "(delayed)",
    "follow for",
]

# Hand-picked ids (as strings) of messages that are labelled as spam
# regardless of whether they contain one of the spam phrases.
spam_indices = [
    "189934349", "142590793", "185792536", "182362237",
    "226578494", "174519289", "240723002", "242183678",
    "248681269", "245656196", "243413941", "239273922",
    "230980738", "255520798", "158019671", "252711617",
    "252527668", "247522334", "251021498",
]

In [None]:
# Index labels that are excluded when the non-spam examples are selected.
duplicate_indices = [
    "59c3b6f2-9238-4198-a18c-7718168f17a4",
    "00c563b4-fb57-41a6-9639-36b29c9e895b",
    "e7bb7d78-3dca-47dd-9124-7c2b81cb60bf",
]

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Collect every distinct entry of the `symbols` column so each one can be
# registered with the tokenizer as an additional token.
vocab_set = set()
for symbols in labeled['symbols']:
    if symbols is not None and len(symbols) > 0:
        vocab_set.update(symbols)
# Sorted so the added tokens are registered in the same order on every run;
# iterating a set of strings directly can yield a different order per
# interpreter session.
tokenizer.add_tokens(sorted(vocab_set))
    
def tokenize(input_strings):
    """Encode a batch of strings with the module-level `tokenizer`.

    Every input is truncated and padded to exactly MAX_TOKEN_LENGTH tokens and
    the encoding is returned as TensorFlow tensors.

    Args:
        input_strings: the batch of texts handed to `batch_encode_plus`.

    Returns:
        The encoding object produced by `tokenizer.batch_encode_plus`.
    """
    encoding_options = dict(
        max_length=MAX_TOKEN_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors='tf',
    )
    return tokenizer.batch_encode_plus(input_strings, **encoding_options)

In [None]:
## It seems that FIQA and FPB data serve well enough as non-spam examples, so
## 1000 examples are not picked out by hand: the first 3000 rows that are not
## listed in `duplicate_indices` are labelled as non-spam.
good_indices = labeled[~labeled.index.isin(duplicate_indices)].iloc[:3000].index
labeled.loc[good_indices, "is_spam"] = 0

# Match the spam phrases literally. Unescaped, "." would match any character
# and "(delayed)" would become a regex group matching plain "delayed".
# NOTE(review): matching is case-sensitive while every phrase is lower-case --
# confirm that `body` is already lower-cased, or pass case=False.
spam_pattern = '|'.join(re.escape(word) for word in spam_words)
spams = labeled["body"].str.contains(spam_pattern, regex=True, na=False)
# Only hand-picked ids that are actually present in `labeled` can be assigned;
# indexing .loc with a list containing unknown labels raises a KeyError.
hand_picked = labeled.index[labeled.index.isin(spam_indices)]
bad_indices = spams[spams].index.union(hand_picked)
# Applied after the non-spam labels, so a spam match wins for rows in both sets.
labeled.loc[bad_indices, "is_spam"] = 1

all_indices = good_indices.union(bad_indices)


dataset = labeled.loc[all_indices]

In [None]:
# Fixed seed so the train / test / validation partition (and therefore the
# reported metrics) is identical on every run of the notebook.
SPLIT_SEED = 42

# First split: training set versus held-out rows (train_test_split's default sizes).
x_train, x_test, y_train, y_test = train_test_split(
    dataset, dataset["is_spam"], shuffle=True, random_state=SPLIT_SEED)
# Second split: divide the held-out rows into the final test and validation sets.
x_test, x_val, y_test, y_val = train_test_split(
    x_test, x_test["is_spam"], shuffle=True, random_state=SPLIT_SEED)

# Only the message body is fed to the model.
x_train_tokens = tokenize(x_train['body'].tolist())
x_test_tokens = tokenize(x_test['body'].tolist())
x_val_tokens = tokenize(x_val['body'].tolist())

In [None]:
# Two-label sequence classifier built on the pre-trained checkpoint.
config = DistilBertConfig(num_labels=2, return_dict=True)
model = TFDistilBertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    config=config,
)
# Resize the embedding matrix to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# The loss is computed on raw logits with integer class labels.
optimizer = transformers.AdamWeightDecay(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

In [None]:
# Training pipeline: batched, with prefetching.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(x_train_tokens), y_train)
).batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: batched, cached after batching, then prefetched.
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(x_val_tokens), y_val)
).batch(BATCH_SIZE).cache().prefetch(AUTO)

# Test pipeline: batched only.
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(x_test_tokens), y_test)
).batch(BATCH_SIZE)

In [None]:
# Train for EPOCHS epochs, evaluating on the validation set after each one.
train_history = model.fit(train_dataset, validation_data=val_dataset, epochs=EPOCHS)

In [None]:
# Loss and accuracy on the held-out test set.
model.evaluate(x=test_dataset, verbose=1)

In [None]:
# Tokenize and score every message in the untouched copy of the corpus.
all_tokens = tokenize(messages_alt['body'].tolist())

full_dataset = (
    tf.data.Dataset
    .from_tensor_slices(dict(all_tokens))
    .batch(2 * BATCH_SIZE)
)

res = model.predict(full_dataset)
# Take the argmax along the class axis so each message gets its own predicted
# label. Without `axis`, np.argmax flattens the logits and returns a single
# index, which would then be broadcast to every row.
all_results = np.argmax(res.logits, axis=-1)
messages_alt["is_spam"] = all_results
messages_alt.to_parquet("spam_be_gone.parquet")