#  TC contest
 


In [1]:
import re
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import transformers
from transformers import AutoTokenizer
from transformers import TFRobertaModel
from datasets import load_dataset

print(tf.__version__)
print(transformers.__version__)

ModuleNotFoundError: No module named 'pandas'

## Loading the Data

In [2]:
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")
train = dataset['train'].to_pandas()
test = dataset['validation'].to_pandas()

n_val = int(len(test)*0.5)
val = test[: n_val]
test = test[n_val :]

Using the latest cached version of the dataset since zeroshot/twitter-financial-news-sentiment couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\mairi\.cache\huggingface\datasets\zeroshot___twitter-financial-news-sentiment\default\0.0.0\ccbe24de388e287beb92dd393a335c376b350ac3 (last modified on Mon Apr 29 15:11:43 2024).


## Preprocessing

In [3]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)

def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

def decontraction(text):
    text = re.sub(r"won\'t", " will not", text)
    text = re.sub(r"won\'t've", " will not have", text)
    text = re.sub(r"can\'t", " can not", text)
    text = re.sub(r"don\'t", " do not", text)
    
    text = re.sub(r"can\'t've", " can not have", text)
    text = re.sub(r"ma\'am", " madam", text)
    text = re.sub(r"let\'s", " let us", text)
    text = re.sub(r"ain\'t", " am not", text)
    text = re.sub(r"shan\'t", " shall not", text)
    text = re.sub(r"sha\n't", " shall not", text)
    text = re.sub(r"o\'clock", " of the clock", text)
    text = re.sub(r"y\'all", " you all", text)

    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"n\'t've", " not have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'d've", " would have", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ll've", " will have", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'re", " are", text)
    return text 

def seperate_alphanumeric(text):
    words = text
    words = re.findall(r"[^\W\d_]+|\d+", words)
    return " ".join(words)

def cont_rep_char(text):
    tchr = text.group(0) 
    
    if len(tchr) > 1:
        return tchr[0:2] 

def unique_char(rep, text):
    substitute = re.sub(r'(\w)\1+', rep, text)
    return substitute

train['text'] = train['text'].apply(lambda x : remove_url(x))
train['text'] = train['text'].apply(lambda x : remove_punct(x))
train['text'] = train['text'].apply(lambda x : remove_emoji(x))
train['text'] = train['text'].apply(lambda x : decontraction(x))
train['text'] = train['text'].apply(lambda x : seperate_alphanumeric(x))
train['text'] = train['text'].apply(lambda x : unique_char(cont_rep_char,x))

val['text'] = val['text'].apply(lambda x : remove_url(x))
val['text'] = val['text'].apply(lambda x : remove_punct(x))
val['text'] = val['text'].apply(lambda x : remove_emoji(x))
val['text'] = val['text'].apply(lambda x : decontraction(x))
val['text'] = val['text'].apply(lambda x : seperate_alphanumeric(x))
val['text'] = val['text'].apply(lambda x : unique_char(cont_rep_char,x))

test['text'] = test['text'].apply(lambda x : remove_url(x))
test['text'] = test['text'].apply(lambda x : remove_punct(x))
test['text'] = test['text'].apply(lambda x : remove_emoji(x))
test['text'] = test['text'].apply(lambda x : decontraction(x))
test['text'] = test['text'].apply(lambda x : seperate_alphanumeric(x))
test['text'] = test['text'].apply(lambda x : unique_char(cont_rep_char,x))


## Tokenization

In [4]:
seq_len = 256
batch_size = 16
l_train = len(train)
l_val = len(val)

model_name = 'cardiffnlp/twitter-roberta-base-sentiment'

tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

x_train = tokenizer(
    train['text'].tolist(), 
    max_length=seq_len, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True, 
    return_tensors='tf'
)

x_val = tokenizer(
    val['text'].tolist(), 
    max_length=seq_len, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True, 
    return_tensors='tf'
)




y_train = train['label'].values
y_val = val['label'].values


## Originally did one-hot encoding - I think sparse categorical makes more sense, but leaving this here just in case
# y_train_oh = tf.keras.utils.to_categorical(y_train, num_classes=3)
# y_val_oh = tf.keras.utils.to_categorical(y_val, num_classes=3)

def map_func(input_ids, attention_mask):
    return {'input_ids': input_ids, 'attention_mask': attention_mask}




train_ds = tf.data.Dataset.from_tensor_slices(
    (
        map_func(x_train['input_ids'], x_train['attention_mask']), 
        y_train
    )
)

val_ds = tf.data.Dataset.from_tensor_slices(
    (
        map_func(x_val['input_ids'], x_val['attention_mask']), 
        y_val
    )
)



In [5]:
train_ds = train_ds.shuffle(1000).batch(32)
val_ds = val_ds.shuffle(1000).batch(32)

## Model Building

In [10]:
model = TFRobertaModel.from_pretrained(model_name)

# Two inputs
input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

# Transformer

embeddings = model(input_ids, attention_mask=mask)[0]
embeddings = embeddings[:, 0, :]
# Classifier head
x = tf.keras.layers.Dense(512, activation='relu')(embeddings)
# x = tf.keras.layers.Dropout(0.1)(x)
y = tf.keras.layers.Dense(3, activation=None, name='outputs')(x) # setting activation to None instead of softmax and using from_logits=True for numerical stability

my_model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001) # this is the default value for Adam, but the example I took this from used 1e-5
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
acc = tf.keras.metrics.SparseCategoricalAccuracy()

my_model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

Some layers from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing TFRobertaModel: ['classifier']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [None]:
history = my_model.fit(
    train_ds,
    epochs=1, # change to 10, this is for test
    batch_size=batch_size,
    validation_data=val_ds
)

  3/299 [..............................] - ETA: 6:28:27 - loss: 2.7068 - sparse_categorical_accuracy: 0.2708

### note - haven't run anything beyond here

In [None]:
x_test = tokenizer(
    test['text'].tolist(), 
    max_length=seq_len, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True, 
)

y_test = test['label'].values

test_ds = tf.data.Dataset.from_tensor_slices(
    (
        map_func(x_test['input_ids'], x_test['attention_mask']), 
        y_test
    )
)

val_ds = val_ds.shuffle(1000).batch(32)


## Model Evaluation

In [None]:
def plot_learning_evolution(r):
    plt.figure(figsize=(12, 8))
    
    plt.subplot(2, 2, 1)
    plt.plot(r.history['loss'], label='Loss')
    plt.plot(r.history['val_loss'], label='val_Loss')
    plt.title('Loss evolution during trainig')
    plt.legend()

    plt.subplot(2, 2, 2)
    plt.plot(r.history['binary_accuracy'], label='binary_accuracy')
    plt.plot(r.history['val_binary_accuracy'], label='val_binary_accuracy')
    plt.title('Accuracy score evolution during trainig')
    plt.legend();

In [None]:
plot_learning_evolution(history)

In [None]:
my_model.evaluate(test_ds)

In [None]:
def prep_data(text):
    tokens = tokenizer(
        text, max_length=256, truncation=True, 
        padding='max_length', 
        add_special_tokens=True, 
        return_tensors='tf'
    )
    return {
        'input_ids': tokens['input_ids'], 
        'attention_mask': tokens['attention_mask']
    }

test['label'] = None

for i, row in test.iterrows():
    tokens = prep_data(row['text'])
#     probs = my_model.predict(tokens)
    probs = my_model.predict_on_batch(tokens)
    pred = np.argmax(probs)
    test.at[i, 'label'] = pred
    
test['label'] = test['label'].astype(int)

In [None]:
test.head()

In [None]:
test.label.value_counts()

# 5. Making submission

In [None]:
sub = pd.DataFrame({'id':sample_sub['id'].values.tolist(), 'label':test['label']})
sub.to_csv('submission.csv', index=False)

In [None]:
sub.head()