# Importing Required Modules

In [None]:
from nltk.tokenize import RegexpTokenizer
# Regex handed to the tokenizer: runs of one or more `\w` characters.
_WORD_PATTERN = r'\w+'
TOKENIZER = RegexpTokenizer(_WORD_PATTERN)

In [None]:
import os
import torch
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score

In [None]:
# Pick the first CUDA device when PyTorch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Class and functions

In [None]:
class Model:
    """Thin convenience wrapper around a simpletransformers ``ClassificationModel``.

    Bundles training, prediction, scoring, a simple train/validation gap
    tracker and pickling behind one object.
    """

    def __init__(self, model_name=None, class_weight=None, model=None, model_args=None, patience=None,
                 num_labels=2):
        """Wrap an existing model or build a new "bert" ``ClassificationModel``.

        Args:
            model_name: Checkpoint name/path passed to ``ClassificationModel``
                (only used when ``model`` is not given).
            class_weight: Per-class loss weights, forwarded as ``weight``.
            model: Already constructed model to wrap; when given, nothing is built.
            model_args: Argument dict forwarded as ``args``.
            patience: Counter decremented by :meth:`early_stopping`; ``None``
                disables the countdown.
            num_labels: Number of target classes (defaults to 2 as before).
        """
        if model is not None:
            self.model = model
        else:
            self.model = ClassificationModel(
                "bert",
                model_name,
                # Only request CUDA when it is actually available, so the
                # wrapper can also be built on CPU-only machines.
                use_cuda=torch.cuda.is_available(),
                num_labels=num_labels,
                weight=class_weight,
                args=model_args,
                cuda_device=0
            )
        self.patience = patience
        # Largest train/validation accuracy gap observed so far.
        self.range = 0

    def fit(self, train_df, val_df=None, acc=accuracy_score):
        """Train the wrapped model on ``train_df``.

        ``val_df`` is forwarded as ``eval_df`` and ``acc`` as the keyword
        argument ``acc`` of ``train_model``.
        """
        self.model.train_model(train_df, eval_df=val_df, acc=acc)

    def predict(self, X: list):
        """Return the first element of the wrapped model's ``predict`` result."""
        return self.model.predict(X)[0]

    def evaluate_model(self, y_true, y_pred):
        """Return ``(macro F1, accuracy)`` for the given labels and predictions."""
        return f1_score(y_true, y_pred, average='macro'), accuracy_score(y_true, y_pred)

    def early_stopping(self, train_acc, val_acc):
        """Track the train/validation accuracy gap and spend patience when it grows.

        Each time the gap exceeds the largest gap seen so far, the stored gap
        is updated and ``self.patience`` is decremented (skipped when
        ``patience`` is ``None``). Callers inspect ``self.patience`` to decide
        when to stop.
        """
        gap = train_acc - val_acc
        if gap > self.range:
            if self.patience is not None:
                self.patience -= 1
            self.range = gap

    def classification_report(self, y_true, y_pred):
        """Return the text report produced by sklearn's ``classification_report``."""
        return classification_report(y_true, y_pred)

    def save_model(self, output_dir):
        """Pickle the wrapped model to ``output_dir``.

        Despite its name, ``output_dir`` is opened as a file path.
        """
        with open(output_dir, 'wb') as f:
            pickle.dump(self.model, f)

In [None]:
def find_scores(y_true, y_pred):
    """Score predictions against the reference labels.

    Returns:
        A 3-tuple ``(macro F1, accuracy, balanced accuracy)``.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    plain_accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true, y_pred)
    return macro_f1, plain_accuracy, balanced_accuracy

In [None]:
def create_auto_model(model_name, model_args, class_weight, num_labels=3):
    """Build a fresh "bert" ``ClassificationModel``.

    Args:
        model_name: Checkpoint name/path handed to ``ClassificationModel``.
        model_args: Argument dict forwarded as ``args``.
        class_weight: Per-class loss weights, forwarded as ``weight``.
        num_labels: Number of target classes (defaults to 3 as before).

    Returns:
        The newly constructed ``ClassificationModel``.
    """
    return ClassificationModel(
        "bert",
        model_name,
        # Only request CUDA when it is actually available, so the model can
        # also be built on CPU-only machines.
        use_cuda=torch.cuda.is_available(),
        num_labels=num_labels,
        weight=class_weight,
        args=model_args,
        cuda_device=0,
    )

# Data Preprocessing

In [None]:
# Input CSV files: one with the tweets, one with tweet ids and their labels.
tweets_path = "/data/raw/crowdbreaks_data/crowdbreaks_tweets.csv"
ids_and_labels_path = "/data/raw/crowdbreaks_data/crowdbreaks_tweet_ids_and_labels.csv"

In [None]:
# Load both CSV files into DataFrames.
tweets = pd.read_csv(tweets_path)
ids_and_labels = pd.read_csv(ids_and_labels_path)

In [None]:
# Attach the tweet columns to every (tweet_id, label) row, drop incomplete
# rows and the suffixed duplicate `label_` column coming from the tweets
# table, then keep only rows whose agreement is above 0.66.
tweet_and_label = (
    ids_and_labels
    .join(
        tweets.rename(columns={'id': 'tweet_id'}).set_index('tweet_id'),
        on='tweet_id',
        rsuffix='_',
    )
    .dropna()
    .drop(columns=['label_'])
    .loc[lambda frame: frame['agreement'] > 0.66]
    .reset_index(drop=True)
)

In [None]:
# Re-encode the raw labels as 0..2: 0 -> 0, 1 -> 1, -1 -> 2.
mapping = {
    0: 0,
    1: 1,
    -1: 2,
}
raw_labels = tweet_and_label['label']
tweet_and_label['label'] = raw_labels.map(mapping)

In [None]:
# Split by agreement: rows with agreement exactly 1.0 versus rows with
# agreement in [0.66, 1.0).
agreement_scores = tweet_and_label['agreement']
is_full_agreement = agreement_scores == 1.0
is_partial_agreement = agreement_scores.ge(0.66) & agreement_scores.lt(1.0)

agreement1 = tweet_and_label.loc[is_full_agreement].reset_index(drop=True)
agreement66 = tweet_and_label.loc[is_partial_agreement].reset_index(drop=True)

## Data Preparation

In [None]:
# Hold out a stratified sample of the full-agreement tweets. Its size is 20%
# of the row count of `tweet_and_label` (not of `agreement1`).
full_agreement_texts = agreement1['text'].values
full_agreement_labels = agreement1['label'].astype('int').values
holdout_size = int(len(tweet_and_label) * 0.2)

X_train, X_test, y_train, y_test = train_test_split(
    full_agreement_texts,
    full_agreement_labels,
    test_size=holdout_size,
    random_state=42,
    stratify=full_agreement_labels,
)

# Halve the held-out sample (stratified) into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=42,
    stratify=y_test,
)

In [None]:
# Append every partial-agreement tweet (and its label) to the training split.
partial_texts = agreement66['text'].values
partial_labels = agreement66['label'].values
X_train = np.concatenate((X_train, partial_texts), axis=0)
y_train = np.concatenate((y_train, partial_labels), axis=0)

In [None]:
# Build the `text` / `labels` frames used for training and evaluation.
# Each frame is built from a column mapping rather than by concatenating the
# text and label arrays into one 2-D array: the concatenation forces both
# columns into a single common dtype, so the labels lose their integer dtype.
df_train = pd.DataFrame({'text': X_train, 'labels': y_train})
df_test = pd.DataFrame({'text': X_test, 'labels': y_test})
df_val = pd.DataFrame({'text': X_val, 'labels': y_val})

# Fine-Tuning

In [None]:
# Base training arguments handed to the classification model; the grid search
# below overwrites some of these keys for every run.
model_args = dict(
    use_early_stopping=True,
    early_stopping_patience=5,
    fp16=False,
    num_train_epochs=20,
    overwrite_output_dir=True,
    learning_rate=1e-5,
    save_steps=-1,
    evaluate_during_training=True,
    early_stopping_consider_epochs=True,
)

In [None]:
# Destination folders: pickled models and saved prediction arrays.
y_preds_dir = "/preds"
output_dir = "/models/sentiment_models"

In [None]:
# Create both destination folders. `os.makedirs(..., exist_ok=True)` also
# creates missing parent folders (e.g. `/models`) and is a no-op when the
# folder already exists, avoiding the separate exists-then-create check.
for _target_dir in (output_dir, y_preds_dir):
    os.makedirs(_target_dir, exist_ok=True)

In [None]:
# 'balanced' class weights computed over the labels of the training split.
train_classes = np.unique(y_train)
weights = class_weight.compute_class_weight('balanced', classes=train_classes, y=y_train)

In [None]:
# Search space for the grid search. `weight` offers two class-weight
# variants: the computed weights and their element-wise squares.
class_weight_options = [weights.tolist(), (weights**2).tolist()]

hyper_parameter_tuning = dict(
    learning_rate=[1e-3, 1e-4, 1e-5],
    adam_epsilon=[1e-7, 1e-8, 1e-9],
    weight=class_weight_options,
    max_seq_length=[128, 64],
    weight_decay=[0, 0.01, 0.0001],
)

In [None]:
model_name = "digitalepidemiologylab/covid-twitter-bert-v2"
# Best macro-F1, accuracy and balanced accuracy seen so far; each entry is
# tracked independently and may come from a different run.
best_scores = [0, 0, 0]
# Scores of the latest run that beat the previous `all_best` on all three metrics.
all_best = [0, 0, 0]

grid_search = list(ParameterGrid(hyper_parameter_tuning))


def _save_run(tag, model, y_pred):
    """Pickle ``model`` to ``<output_dir>/<tag>.db`` and save ``y_pred`` to
    ``<y_preds_dir>/<tag>_y_pred.npy``."""
    with open(f"{output_dir}/{tag}.db", 'wb') as f:
        pickle.dump(model, f)
    np.save(f"{y_preds_dir}/{tag}_y_pred.npy", y_pred)


# NOTE(review): runs are ranked by their scores on `df_test`, so the reported
# test scores of the selected models are optimistically biased; `df_val` is
# only used as `eval_df` during training. Confirm this is intended.
model = None
for value in tqdm(grid_search):
    # Drop the reference to the previous run's model *before* building the
    # next one, so two models do not have to be resident at the same time,
    # then hand cached GPU memory back to the driver.
    model = None
    torch.cuda.empty_cache()

    # Every grid key except `weight` is a training argument; `weight` is a
    # constructor argument of the model.
    for k, v in value.items():
        if k != "weight":
            model_args[k] = v

    model = create_auto_model(model_name, model_args, value['weight'])
    model.train_model(df_train, eval_df=df_val, acc=accuracy_score)

    result, model_outputs, wrong_predictions = model.eval_model(df_test)
    y_pred = np.argmax(model_outputs, axis=1)

    # scores = (macro F1, accuracy, balanced accuracy)
    scores = find_scores(df_test['labels'].values.astype(int), y_pred)

    # "Best overall": strictly better than the stored triple on every metric.
    if all(score > best for score, best in zip(scores, all_best)):
        all_best[:] = scores
        _save_run("best_model", model, y_pred)

    # Per-metric bests, in the same order as `scores`.
    for metric_idx, tag in enumerate(("best_f1", "best_acc", "best_balanced_acc")):
        if scores[metric_idx] > best_scores[metric_idx]:
            best_scores[metric_idx] = scores[metric_idx]
            _save_run(tag, model, y_pred)