In [1]:
# Load the labelled tweets, encode topics as integer ids, and build the train/test split

import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset
import datasets
import pandas as pd
import numpy as np
from transformers import AutoTokenizer

# df = pd.read_csv("./osr_tweets_without_T_U_U_v2.csv", engine='python')
df = pd.read_csv("./gpt_tweets_without_T_U_U.csv", engine='python')
n_rows = len(df)

# Tweet text, cast to str so every row is a string.
docs = df["text"].astype(str)

# Assign each topic an integer id in order of first appearance.
topics = df["topic"].tolist()
label2id = {}
for topic in topics:
    if topic not in label2id:
        label2id[topic] = len(label2id)
id2label = {idx: topic for topic, idx in label2id.items()}
id_counter = len(label2id)  # number of classes; name retained from the original cell

# Encoded labels are built as a fresh integer Series instead of overwriting the
# string topic column element by element (slow, and mixes ints into a text column).
encoded = [label2id[topic] for topic in topics]
labels = pd.Series(encoded, index=df.index, name="topic")

# generate class weight list
# Per-class example counts, keyed by id and by topic name.
id2counter = {idx: 0 for idx in id2label}
for idx in encoded:
    id2counter[idx] += 1
label2counter = {id2label[idx]: count for idx, count in id2counter.items()}

# Inverse class frequency; class_weight[i] is the weight for label id i.
class_weight = [1 / (count / n_rows) for count in id2counter.values()]

# Two-column frame (text, label) used for everything downstream.
df = pd.concat([docs, labels], axis=1).rename(columns={'topic': 'label'})

dataset = ds.dataset(pa.Table.from_pandas(df).to_batches())

### convert to Huggingface dataset
hg_dataset = Dataset(pa.Table.from_pandas(df))

# Fixed seed so the 80/20 split is reproducible; splits are read by key rather
# than relying on the ordering of .values().
split = hg_dataset.train_test_split(test_size=0.2, shuffle=True, seed=10)
train_dataset, test_dataset = split["train"], split["test"]
db = datasets.DatasetDict({"train": train_dataset, "test": test_dataset})


In [None]:
from transformers import DataCollatorWithPadding
import evaluate
from transformers import create_optimizer
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers.keras_callbacks import KerasMetricCallback
from transformers.keras_callbacks import PushToHubCallback
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from torch import nn

class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    The per-class weights are read from the notebook-level ``class_weight`` list,
    which must have one entry per label id.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute weighted cross-entropy between the model logits and the labels.

        Args:
            model: The model being trained/evaluated; called as ``model(**inputs)``.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Accepted because recent ``Trainer`` versions pass it
                to ``compute_loss``; it is not used here.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # Build the weight tensor next to the logits: a wrapped model (e.g.
        # nn.DataParallel) does not expose ``.device``, and matching the dtype
        # avoids a mismatch when the model does not run in float32.
        weights = torch.tensor(class_weight, dtype=logits.dtype, device=logits.device)
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch with truncation enabled."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# Apply the tokenizer to every split of `db`, one batch at a time.
tokenized_db = db.map(preprocess_function, batched=True)

# Pads each batch when it is collated.
# TensorFlow variant: DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Performance measure metrics
def custom_metrics(eval_pred):
    """Score predictions with macro precision/recall/F1 and plain accuracy.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each row
            is the argmax of the logits along axis 1.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    metric_names = ("precision", "recall", "f1", "accuracy")
    scorers = {name: evaluate.load(name) for name in metric_names}

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    results = {}
    for name, scorer in scorers.items():
        # Accuracy takes no averaging mode; the other three are macro-averaged.
        extra = {} if name == "accuracy" else {"average": "macro"}
        scored = scorer.compute(predictions=predictions, references=labels, **extra)
        results[name] = scored[name]
    return results

In [None]:
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Default training arguments; only the output directory is set.
test_training_args = TrainingArguments(output_dir="test_trainer")

# Model under evaluation, pinned to an exact Hub revision.
# gpt_model = AutoModelForSequenceClassification.from_pretrained("LovenOO/distilBERT_gptdata_with_preprocessing_grid_search")
gpt_model = AutoModelForSequenceClassification.from_pretrained(
    "LovenOO/distilBERT_gptdata_with_preprocessing_grid_search",
    revision='76a8e24648f9360c180b80b3cf176e2d6ada5c8a',
)

# Alternative checkpoints (uncomment one and pass it as `model=` below):

# distilbert_model = AutoModelForSequenceClassification.from_pretrained("LovenOO/distilBERT_with_preprocessing_grid_search", revision='e8460a5b00cb63ec52c235227b45a1fabf1f2056')

# distilbert_model_without_hashtag = AutoModelForSequenceClassification.from_pretrained("LovenOO/distilBERT_with_preprocessing_grid_search", revision='d8b7b2dd48cccecc48036255fb440b3b1d8ddff8')

# bert_model_without_hashtag = AutoModelForSequenceClassification.from_pretrained("LovenOO/BERT_with_preprocessing_grid_search", revision='1f5ba5fd1ece623860f24b0b5ecb5c5f3a4c9396')

# bert_large_model_without_hashtag = AutoModelForSequenceClassification.from_pretrained("LovenOO/BERT_large_with_preprocessing_grid_search", revision='dac309e589385dabbe9dab48693a351334620fc0')

# distilbert_model_with_hashtag = AutoModelForSequenceClassification.from_pretrained("LovenOO/distilBERT_without_preprocessing_grid_search", revision='ee83c5678b68ea6942e31508ccd286fc993f0f3d')

# bert_model_with_hashtag = AutoModelForSequenceClassification.from_pretrained("LovenOO/BERT_without_preprocessing_grid_search", revision='49e9c91823bd49e038747b14bdf07ec2dc9a3942')

# bert_large_model_with_hashtag = AutoModelForSequenceClassification.from_pretrained("LovenOO/BERT_large_without_preprocessing", revision='ba78578c0fb89f84f83e1be748c258d64c1ef52b')

# Evaluation-only Trainer: no training data is supplied (the argument is
# optional), so there is no dummy empty list standing in for a train set.
test_trainer = Trainer(
    model=gpt_model,
    args=test_training_args,
    eval_dataset=tokenized_db['test'],
    compute_metrics=custom_metrics,
    data_collator=data_collator,
)

test_trainer.evaluate()

In [None]:
!pip install transformers 

In [None]:
# evaluate on ensemble model of DistilBert, BERT-base, BERT-large, DistilBert on GPT set

import pandas as pd
from transformers import pipeline
import numpy as np
from tqdm import tqdm

# Test set read below; its `text` and `label` columns are used later in this cell.
test_set_path = "/content/drive/MyDrive/Colab Notebooks/data/test_set.csv"
df = pd.read_csv(test_set_path, engine='python')

# Ensemble members, each pinned to an exact Hub revision.
gpt = pipeline(
    model="LovenOO/distilBERT_gptdata_with_preprocessing_grid_search",
    revision='76a8e24648f9360c180b80b3cf176e2d6ada5c8a',
)

distilbert_model_without_hashtag = pipeline(
    model="LovenOO/distilBERT_with_preprocessing_grid_search",
    revision='d8b7b2dd48cccecc48036255fb440b3b1d8ddff8',
)

bert_model_without_hashtag = pipeline(
    model="LovenOO/BERT_with_preprocessing_grid_search",
    revision='1f5ba5fd1ece623860f24b0b5ecb5c5f3a4c9396',
)

bert_large_model_without_hashtag = pipeline(
    model="LovenOO/BERT_large_with_preprocessing_grid_search",
    revision='dac309e589385dabbe9dab48693a351334620fc0',
)

def _majority_vote(votes):
    """Return the label that occurs most often in ``votes``.

    Ties are broken in favour of the model listed last: ``votes`` is scanned
    from the end and the first label holding the top count wins.

    Args:
        votes: Non-empty list of predicted label strings, one per model.
    """
    counts = {}
    for vote in votes:
        counts[vote] = counts.get(vote, 0) + 1
    top_count = max(counts.values())
    for vote in reversed(votes):
        if counts[vote] == top_count:
            return vote


# Order matters only for tie-breaking: the last pipeline (BERT-large) decides
# a 2-2 split, as it did before, and also a four-way disagreement.
ensemble = (
    gpt,
    distilbert_model_without_hashtag,
    bert_model_without_hashtag,
    bert_large_model_without_hashtag,
)

predicted_list = []

for text in tqdm(df.text, total=df.shape[0]):
    # Top label from each model for this tweet.
    cur_list = [classifier(text)[0]['label'] for classifier in ensemble]
    # Count the votes; taking argmax over the label strings would pick the
    # alphabetically last label rather than the most frequent one.
    predicted_list.append(_majority_vote(cur_list))

# normalize text label: drop commas so predictions match the comma-free names below
temp_list = [predicted.replace(',', '') for predicted in predicted_list]

# Topic names indexed by label id.
id2label = dict(enumerate([
    'Children Education and Skills',
    'Health and Social Care',
    'Crime and Security',
    'Economy',
    'Housing Planning and Local Services',
    'Labour Market and Welfare',
    'Population and Society',
    'Transport Environment and Climate Change',
]))

# convert text label to id label
label2id = {name: idx for idx, name in id2label.items()}
predicted_id_list = [label2id[name] for name in temp_list]

# Fraction of rows whose predicted id equals the `label` column.
print(" Accuracy:", np.mean(np.array(predicted_id_list) == df.label.values))

In [None]:
# evaluate on ensemble model of DistilBert, BERT-base, BERT-large

import pandas as pd
from transformers import pipeline
import numpy as np
# Test set read below; its `text` and `label` columns are used later in this cell.
test_set_path = "/content/drive/MyDrive/Colab Notebooks/data/test_set.csv"
df = pd.read_csv(test_set_path, engine='python')

# Ensemble members, each pinned to an exact Hub revision.
distilbert_model_without_hashtag = pipeline(
    model="LovenOO/distilBERT_with_preprocessing_grid_search",
    revision='d8b7b2dd48cccecc48036255fb440b3b1d8ddff8',
)

bert_model_without_hashtag = pipeline(
    model="LovenOO/BERT_with_preprocessing_grid_search",
    revision='1f5ba5fd1ece623860f24b0b5ecb5c5f3a4c9396',
)

bert_large_model_without_hashtag = pipeline(
    model="LovenOO/BERT_large_with_preprocessing_grid_search",
    revision='dac309e589385dabbe9dab48693a351334620fc0',
)

def _majority_vote(votes):
    """Return the label that occurs most often in ``votes``.

    Ties are broken in favour of the model listed last: ``votes`` is scanned
    from the end and the first label holding the top count wins.

    Args:
        votes: Non-empty list of predicted label strings, one per model.
    """
    counts = {}
    for vote in votes:
        counts[vote] = counts.get(vote, 0) + 1
    top_count = max(counts.values())
    for vote in reversed(votes):
        if counts[vote] == top_count:
            return vote


# Order matters only for tie-breaking: when all three models disagree, the
# last pipeline (BERT-large) decides.
# NOTE(review): reorder if a different model should win three-way ties.
ensemble = (
    distilbert_model_without_hashtag,
    bert_model_without_hashtag,
    bert_large_model_without_hashtag,
)

predicted_list = []
for text in df.text:
    # Top label from each model for this tweet.
    cur_list = [classifier(text)[0]['label'] for classifier in ensemble]
    # Count the votes; taking argmax over the label strings would pick the
    # alphabetically last label rather than the most frequent one.
    predicted_list.append(_majority_vote(cur_list))

# Drop commas so the predicted label strings match the comma-free names below.
temp_list = [predicted.replace(',', '') for predicted in predicted_list]

# Topic names indexed by label id.
id2label = dict(enumerate([
    'Children Education and Skills',
    'Health and Social Care',
    'Crime and Security',
    'Economy',
    'Housing Planning and Local Services',
    'Labour Market and Welfare',
    'Population and Society',
    'Transport Environment and Climate Change',
]))

# Invert the mapping and translate each predicted name to its id.
label2id = {name: idx for idx, name in id2label.items()}
predicted_id_list = [label2id[name] for name in temp_list]

# Fraction of rows whose predicted id equals the `label` column.
print(" Accuracy:", np.mean(np.array(predicted_id_list) == df.label.values))
