In [1]:
import pandas as pd
import numpy as np
import os
import re
from glob import glob
from ast import literal_eval
from sklearn.model_selection import train_test_split
import random

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification, DistilBertConfig



In [2]:
# Choose a tf.distribute strategy that matches the hardware this kernel runs on.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver raised ValueError: continue without a TPU.
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    
# Hand the user's Google Cloud credential to TensorFlow, then report how many
# replicas the selected distribution strategy synchronises.
# NOTE(review): this import sits mid-notebook; kaggle_secrets is presumably only
# importable inside a Kaggle kernel -- confirm before running elsewhere.
# Step 1: Get the credential from the Cloud SDK
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()

# Step 2: Set the credentials (registers the credential fetched above with TensorFlow)
user_secrets.set_tensorflow_credential(user_credential)

# `strategy` comes from the hardware-detection code earlier in this cell.
print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470


REPLICAS:  8


In [3]:
# ---- Notebook configuration ----

# Data / checkpoint selection.
PRE_TRAINED_MODEL_NAME = 'distilbert-base-uncased'
LOADING = True  # True: read the prepared data from storage; False: regenerate it from scratch

# Tokenisation and optimisation settings.
MAX_TOKEN_LENGTH = 160  # passed as max_length when encoding text
EPOCHS = 1
LEARNING_RATE = 3e-5

# Input-pipeline settings.
AUTO = tf.data.experimental.AUTOTUNE
BATCH_SIZE = strategy.num_replicas_in_sync * 128  # 128 times the number of replicas in sync

In [4]:
# Load the message corpus: either read the prepared parquet file, or rebuild it
# from the raw per-ticker CSV files and write a fresh parquet cache.
if LOADING:
    messages = pd.read_parquet("/kaggle/input/short-financial-messages/all_messages.parquet")
else:
    WORKING_DIR = '/kaggle/input/'
    ticker_dir = WORKING_DIR + 'short-financial-messages/data/'
    PATH = ticker_dir
    EXT = "*.csv"

    def parse_csv(file):
        """Read one raw CSV file, parsing `created_at` as a datetime column."""
        return pd.read_csv(file, parse_dates=['created_at'])

    def filter_urls(text):
        """Return `text` (coerced to str) with every `http...` token removed."""
        return re.sub(r"http\S+", "", str(text))

    # Walk the whole tree so CSVs in nested folders are picked up as well.
    all_csv_files = [csv_path
                     for dir_path, _subdirs, _files in os.walk(PATH)
                     for csv_path in glob(os.path.join(dir_path, EXT))]
    if not all_csv_files:
        # Fail with a clear message instead of pandas' "No objects to concatenate".
        raise FileNotFoundError("No {} files found under {}".format(EXT, PATH))

    messages = pd.concat((parse_csv(f) for f in all_csv_files), ignore_index=True, sort=False)

    # Index by message id (as a string) and keep only the first copy of each id.
    messages = messages.set_index('id')
    messages.index = messages.index.map(str)
    # .copy() so the column assignments below write to an independent frame
    # rather than to a slice of the concatenated one.
    messages = messages[~messages.index.duplicated(keep='first')].copy()

    messages['body'] = messages['body'].apply(filter_urls)

    # Recode -1 to 0; rows with sentiment 0 are the ones sampled as the "bear" class later on.
    messages["sentiment"] = messages["sentiment"].replace({-1: 0})

    # Write the cache under the same file name the LOADING branch reads
    # ("all_messages.parquet"); it was previously written without the extension.
    messages.to_parquet("all_messages.parquet")

In [5]:
# Build a class-balanced training set and hold out every other labeled row
# for validation and testing.

# Fixed seed so the sampled splits are identical on every Restart & Run All.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)

# Rows whose sentiment is -69 are excluded (presumably the "no label" sentinel -- confirm).
labeled = messages[messages['sentiment'] != -69]

bull_indices = labeled[labeled['sentiment'] == 1].index
bear_indices = labeled[labeled['sentiment'] == 0].index

# Each class contributes 80% of the bear-class row count to the training set.
SAMPLE_SIZE = int(len(bear_indices) * 0.8)

random_bull_indices = split_rng.choice(bull_indices, SAMPLE_SIZE, replace=False)
bull_sample = labeled.loc[random_bull_indices]

# Sample WITHOUT replacement, like the bull class: drawing with replacement put
# duplicate rows into the training set and left far fewer than 80% of the bear
# rows in it.
random_bear_indices = split_rng.choice(bear_indices, SAMPLE_SIZE, replace=False)
bear_sample = labeled.loc[random_bear_indices]

labeled_training = pd.concat([bull_sample, bear_sample])

# Everything not drawn for training is split 80/20 into test and validation.
labeled_test = labeled.drop(random_bull_indices).drop(random_bear_indices)
labeled_test, labeled_val = train_test_split(labeled_test, test_size=0.2, random_state=SPLIT_SEED)

# Print the split sizes explicitly: only the last expression of a cell is displayed.
print("train / val / test sizes:", len(labeled_training), len(labeled_val), len(labeled_test))
labeled['sentiment'].value_counts()

1    418621
0    158565
Name: sentiment, dtype: int64

In [6]:
# if LOADING:
#     tokenizer = DistilBertTokenizerFast.from_pretrained("/kaggle/input/tokenizer")
# else:
#     tokenizer = DistilBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)
#     vocab_set = set()
#     for symbols in labeled['symbols']:
#         if isinstance(symbols, str) and "[" in symbols and "]" in symbols:
#                 l = literal_eval(symbols)
#                 for w in l:
#                     vocab_set.add(w)
#     tokenizer.add_tokens(list(vocab_set))
#     tokenizer.save_pretrained("tokenizer")

def _symbols_of(raw_symbols):
    """Return the ticker symbols stored in one `symbols` cell as an iterable.

    Handles the representations the column can take:
    - None or a length-less scalar such as NaN: no symbols;
    - a string holding a list literal, e.g. "['AAPL', 'TSLA']" (presumably what
      the column contains when the frame is rebuilt from CSV -- confirm): parsed
      with literal_eval;
    - any sized container (list, array): returned as is.
    """
    if raw_symbols is None:
        return []
    if isinstance(raw_symbols, str):
        # Iterating the string itself would add single characters as tokens.
        if "[" in raw_symbols and "]" in raw_symbols:
            return literal_eval(raw_symbols)
        return []
    try:
        return raw_symbols if len(raw_symbols) > 0 else []
    except TypeError:
        # Scalars (e.g. float NaN) have no len(); treat them as "no symbols".
        return []


tokenizer = DistilBertTokenizerFast.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Collect every distinct ticker symbol seen in the labeled messages.
vocab_set = set()
for symbols in labeled['symbols']:
    vocab_set.update(str(w) for w in _symbols_of(symbols) if isinstance(w, str))

# Sort before adding: iteration order of a set of strings is not stable across
# interpreter runs, and the order of this list decides which id each new token gets.
tokenizer.add_tokens(sorted(vocab_set))
    
def tokenize(input_strings):
    """Encode a batch of texts with the module-level `tokenizer`.

    Every sequence is truncated and padded to MAX_TOKEN_LENGTH, and the
    encoding is requested as TensorFlow tensors.

    Args:
        input_strings: the batch of texts, forwarded unchanged to
            `tokenizer.batch_encode_plus`.

    Returns:
        Whatever `tokenizer.batch_encode_plus` returns for the batch.
    """
    encode_options = dict(
        max_length=MAX_TOKEN_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors='tf',
    )
    return tokenizer.batch_encode_plus(input_strings, **encode_options)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [7]:
# Encode the message bodies of each split (train, validation, test in that order).
x_train, x_val, x_test = (
    tokenize(split['body'].tolist())
    for split in (labeled_training, labeled_val, labeled_test)
)

# Matching label arrays, taken from the `sentiment` column of each split.
y_train, y_val, y_test = (
    split['sentiment'].values
    for split in (labeled_training, labeled_val, labeled_test)
)

In [None]:
# tf.data input pipelines for the three splits.

train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(x_train), y_train))
    # The training frame is a concatenation of the bull sample followed by the
    # bear sample, so the rows arrive ordered by class. A small shuffle buffer
    # only mixes neighbouring rows and yields almost single-class batches; a
    # buffer covering the whole set gives a uniform shuffle. The tensors are
    # already held in memory, so the extra cost is one buffer of references.
    .shuffle(len(y_train))
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation batches are cached after batching because they are re-read when
# validation runs after each epoch.
val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(x_val), y_val))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

# The test split is only batched.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((dict(x_test), y_test))
    .batch(BATCH_SIZE)
)

In [None]:
 with strategy.scope():
    # Build and compile the classifier inside the distribution strategy's scope.
    # Two output labels (sentiment 0 / 1).
    # NOTE(review): this config is built from the class defaults rather than loaded
    # from the checkpoint -- confirm the defaults match PRE_TRAINED_MODEL_NAME.
    config = DistilBertConfig(num_labels=2, return_dict=True)
    model = TFDistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
    # The tokenizer was extended via add_tokens, so resize the embedding matrix
    # to the tokenizer's current vocabulary size.
    model.resize_token_embeddings(len(tokenizer))
    optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, epsilon=1e-08)
    # from_logits=True: the loss is computed from unnormalised logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Train for EPOCHS epochs, validating on val_dataset; the returned history is kept for inspection.
train_history = model.fit(x=train_dataset, epochs=EPOCHS, validation_data=val_dataset)

In [None]:
# Persist the fine-tuned model together with its tokenizer. The tokenizer was
# extended with extra tokens (and the model's embeddings resized to match), so
# the saved weights are only usable with this exact tokenizer; store both in the
# same directory so they can be reloaded as a pair.
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("sentiment_model")

In [None]:
# Score the trained model on the test batches; the returned metrics are the cell's output.
test_metrics = model.evaluate(x=test_dataset, verbose=1)
test_metrics

In [None]:
# test_encoding = train_set.tokenize(test_sentence)

# test_out = model(input_ids=test_encoding['input_ids'].to(device), attention_mask=test_encoding['attention_mask'].to(device))
# F.softmax(test_out.logits, dim=1)
# print(torch.argmax(F.softmax(test_out.logits, dim=1), dim=1))