In [None]:
!pip install transformers

In [None]:
!pip install accelerate -U

In [None]:
import transformers as tf
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs read below and the
# prediction CSV written later all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Pretrained checkpoint shared by the model and its tokenizer.
MODEL_NAME = 'bert-base-uncased'

# BERT encoder with a newly initialised 2-class classification head.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# Use the GPU when one is attached; otherwise stay on CPU instead of crashing
# on an unconditional .to('cuda').
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def process_data(data):
    """Return a copy of ``data`` reduced to ``comment`` and a numeric ``label``.

    The string labels are encoded as ``'NOT' -> 0`` and ``'ABU' -> 1``; any
    other label value is passed through unchanged.

    Args:
        data: DataFrame with at least the columns ``comment`` and ``label``.

    Returns:
        A new DataFrame with exactly the columns ``comment`` and ``label``.
        The input frame is left untouched, so the function can be re-run on
        the same frame and always sees the original string labels.
    """
    # Work on a copy so the caller's frame is not overwritten.
    processed_data = data[['comment', 'label']].copy()
    labels = processed_data['label']

    conditions = [
        (labels == 'NOT'),
        (labels == 'ABU'),
    ]
    choices = [0, 1]

    # default=labels keeps every value that is neither 'NOT' nor 'ABU' as it was.
    processed_data['label'] = np.select(conditions, choices, default=labels)

    return processed_data

# Load the training CSV and encode its string labels as 0/1.
file_train = pd.read_csv("/content/drive/MyDrive/dataset/ALYT_train.csv")
data_train = process_data(file_train)

X = list(data_train["comment"])
y = list(data_train["label"])

# Hold out 20% for validation, stratified on the label so both splits keep the
# same class ratio. random_state is fixed so the split (and therefore every
# metric reported below) is reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Tokenize each split in one call: sequences are truncated to at most 512
# tokens and padded to the longest sequence in that split.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [None]:
# Encode the test labels the same way as the training labels and store the
# reduced (comment, label) table next to the raw file for the prediction step.
TEST_CSV_RAW = "/content/drive/MyDrive/dataset/ALYT_test.csv"
TEST_CSV_PROCESSED = "/content/drive/MyDrive/dataset/ALYT_test1.csv"

file_test = pd.read_csv(TEST_CSV_RAW)
data_test = process_data(file_test)
data_test.to_csv(TEST_CSV_PROCESSED, index=False)

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name (must include ``"input_ids"``) to
            a per-example indexable sequence, such as the tokenizer output.
        labels: Optional per-example labels aligned with ``encodings`` (list,
            NumPy array, pandas Series, ...). Leave as ``None`` for unlabeled
            data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus ``"labels"`` if labels exist."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None and check the length explicitly: truth-testing
        # an array-like label container (`if self.labels:`) raises for NumPy
        # arrays and pandas Series. An empty container is still treated as
        # "no labels", as before.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` feature."""
        return len(self.encodings["input_ids"])

# Wrap the tokenized splits and their labels so the Trainer can index them.
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [None]:
def compute_metrics(p):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class dimension on axis 1; ``labels`` holds the
            true class indices.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    pred, labels = p
    # Highest-scoring class per example.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # zero_division=0 yields the same score as the default but without a
    # warning on every evaluation in which one class is never predicted.
    recall = recall_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    precision = precision_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro", zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:
# Training configuration: a single epoch, batches of 8 per device and a
# learning rate of 1e-5, with outputs written under output_alyt/.
# Options not listed here are left at the TrainingArguments defaults.
args = TrainingArguments(
    output_dir="output_alyt",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    learning_rate=1e-5
)
# The Trainer fine-tunes `model` on train_dataset; val_dataset and
# compute_metrics are what trainer.evaluate() uses.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model with the configuration above (long-running cell).
trainer.train()

Step,Training Loss
500,0.3219
1000,0.2838


TrainOutput(global_step=1358, training_loss=0.2942616177588394, metrics={'train_runtime': 947.8966, 'train_samples_per_second': 11.458, 'train_steps_per_second': 1.433, 'total_flos': 2857649172264960.0, 'train_loss': 0.2942616177588394, 'epoch': 1.0})

In [None]:
# Score the fine-tuned model on val_dataset; the returned dict holds the loss
# and the compute_metrics values under eval_-prefixed keys.
trainer.evaluate()

<class 'transformers.trainer_utils.EvalPrediction'>


{'eval_loss': 0.27025675773620605,
 'eval_accuracy': 0.9072164948453608,
 'eval_precision': 0.7992019154030328,
 'eval_recall': 0.7031695156695157,
 'eval_f1': 0.7383201581934383,
 'eval_runtime': 81.0928,
 'eval_samples_per_second': 33.492,
 'eval_steps_per_second': 4.193,
 'epoch': 1.0}

In [None]:
# Persist the fine-tuned weights so they can be reloaded for inference below.
trainer.save_model('output_alyt/model_alyt_ep1')

In [None]:
# Reload the fine-tuned checkpoint from disk for inference.
mod1 = BertForSequenceClassification.from_pretrained('output_alyt/model_alyt_ep1')
# Use the GPU when one is attached instead of assuming 'cuda' exists. Assigning
# the result also keeps the cell from printing the full model structure.
mod1 = mod1.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def predict_comment(comment, model):
    """Return the predicted class index for a single comment.

    Args:
        comment: Text of one comment to classify.
        model: Sequence-classification model whose output exposes ``logits``.

    Returns:
        int: Index of the highest-scoring class (for the models trained in
        this notebook, 0 = 'NOT' and 1 = 'ABU').
    """
    # Put the inputs on whatever device the model lives on rather than
    # assuming 'cuda', so a CPU-resident model works too.
    device = next(model.parameters()).device
    # Named `encoded` so the builtin `input` is not shadowed.
    encoded = tokenizer(comment, padding=True, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model(**encoded)
        # .item() relies on there being exactly one comment in the batch.
        predicted_class = torch.argmax(output.logits, dim=1).item()
    return predicted_class

In [None]:
# Classify every test comment with the reloaded model, one comment per
# forward pass, and store the predicted class next to the gold label.
data_test = pd.read_csv("/content/drive/MyDrive/dataset/ALYT_test1.csv")
data_test['predicted'] = [predict_comment(text, mod1) for text in data_test['comment']]

In [None]:
# Store comments, gold labels and predictions for the evaluation report below.
data_test.to_csv('/content/drive/MyDrive/prediction/ALYT_mod1_pred.csv', index=False)

In [None]:
# Reload the saved predictions and report per-class precision/recall/F1
# of the predicted labels against the gold labels.
df = pd.read_csv('/content/drive/MyDrive/prediction/ALYT_mod1_pred.csv')
y_gold, y_pred = df["label"], df["predicted"]
report = classification_report(y_true=y_gold, y_pred=y_pred, digits=2)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      5164
           1       0.66      0.41      0.51       655

    accuracy                           0.91      5819
   macro avg       0.80      0.69      0.73      5819
weighted avg       0.90      0.91      0.90      5819

