In [1]:
!pip install evaluate -q

In [None]:
import re, regex, os, sys, warnings, random, gc, logging
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

import evaluate
from datasets import Dataset

import transformers
from transformers import (
    LineByLineTextDataset,
    DataCollatorWithPadding, DataCollatorForLanguageModeling,
    TrainingArguments, Trainer,
    AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoModelForSequenceClassification,
)


# Reproducibility: a single seed, applied through transformers' seeding helper.
SEED = 42
transformers.set_seed(SEED)

# Keep the notebook output quiet: silence Python warnings, raise the root logger
# threshold to WARNING, and turn off tokenizers' parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
warnings.filterwarnings('ignore')
logging.getLogger().setLevel(logging.WARNING)

# Root folder of the Kaggle input datasets.
INPUT_PATH = '/kaggle/input/translit-datasets/'

# Dataset

In [3]:
# Insert appropriate dataset reading code from dataset_readers.py

def read_dataset():
    """Placeholder for the dataset reader; replace with code from dataset_readers.py.

    The rest of the notebook unpacks the result as
    ``(train_df, test_df, label_names, dataset_name, text_col)`` and expects:

    * ``train_df`` / ``test_df``: DataFrames with a string ``sentence`` column
      and a categorical ``label`` column (``.cat.codes`` is taken from it),
    * ``label_names``: list of class names, used for reports and plots,
    * ``dataset_name`` and ``text_col``: extra metadata returned by the reader.

    Raises:
        NotImplementedError: always, until a real reader is pasted in. This
            fails loudly here instead of returning ``None`` and crashing later
            with an opaque unpacking error.
    """
    raise NotImplementedError(
        "read_dataset() is a placeholder: paste the matching reader from "
        "dataset_readers.py into this cell before running the notebook."
    )

In [None]:
# Load both splits plus the metadata returned by the dataset reader.
train_df, test_df, label_names, dataset_name, text_col = read_dataset()

# Encode categorical labels as integer codes. The dtype guard keeps this cell
# idempotent: on a re-run the column is already numeric and `.cat` would raise.
for split_df in (train_df, test_df):
    if isinstance(split_df['label'].dtype, pd.CategoricalDtype):
        split_df['label'] = split_df['label'].cat.codes

# Balanced class weights keyed by the actual class id, so the mapping stays
# correct even when some class never occurs in the training split.
present_classes = np.unique(train_df['label'])
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=present_classes,
    y=train_df['label'],
)
class_weights = {
    int(class_id): weight
    for class_id, weight in zip(present_classes, balanced_weights)
}

pd.set_option('max_colwidth', 200)
display(train_df.head())
display(test_df.head())

print(f'{len(train_df)=}, {len(test_df)=}')
print(label_names)

# Class distribution of the training split; `minlength` guarantees one bar per
# label name even if the highest-numbered classes are missing from train.
plt.figure(figsize=(6,2))
plt.bar(x=label_names, height=np.bincount(train_df['label'], minlength=len(label_names)))

# Trainer code

In [5]:
EPOCHS = 6  # Epoch budget used by the masked-LM stage and by the fine-tuning cells below.

def train_maskedlm(model_name, train_df, test_df):
    """Continue masked-language-model training of ``model_name`` on the dataset text.

    The ``sentence`` columns of both frames are concatenated, written to
    ``text.txt`` (one sentence per line) and used as the MLM corpus. Labels are
    not read.

    Args:
        model_name: Hugging Face hub id or local path of the checkpoint to adapt.
        train_df: DataFrame with a ``sentence`` column of strings.
        test_df: DataFrame with a ``sentence`` column of strings.

    Returns:
        Path of the directory containing the adapted model and its tokenizer,
        loadable with ``from_pretrained``.
    """
    import shutil  # local import: only needed for the checkpoint cleanup below

    model_slug = model_name.replace('/', '-')
    model_path = f"/kaggle/working/model_{model_slug}/maskedlm/"

    # Prepare data for training
    sentences = pd.concat([train_df, test_df])['sentence']
    # Flatten each sentence onto one line. Joining the pieces with a space keeps
    # the words on either side of a line break from being fused together.
    corpus_lines = [' '.join(sentence.splitlines()) for sentence in sentences]
    # Write explicitly as UTF-8 so the file does not depend on the platform's
    # default encoding.
    with open('text.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(corpus_lines))

    # Train model
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.save_pretrained(f"/kaggle/working/model_{model_slug}/tokenizer/")

    # Train and eval read the same file with the same settings, so a single
    # dataset instance serves both instead of tokenizing the corpus twice.
    # NOTE(review): eval_loss is therefore measured on the training corpus and
    # is not a held-out estimate.
    mlm_dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="text.txt",
        block_size=256
    )

    trainer = Trainer(
        model=model,
        train_dataset=mlm_dataset,
        eval_dataset=mlm_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15
        ),
        args=TrainingArguments(
            output_dir="./checkpoints/",
            overwrite_output_dir=True,
            num_train_epochs=EPOCHS,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            evaluation_strategy='steps',
            save_total_limit=2,
            eval_steps=200,
            save_steps=400,
            metric_for_best_model='eval_loss',
            greater_is_better=False,
            load_best_model_at_end=True,
            prediction_loss_only=True,
            report_to="none"
        ),
    )

    trainer.train()

    trainer.save_model(model_path)
    # Store the tokenizer next to the weights so the returned path is a
    # self-contained checkpoint for AutoTokenizer/AutoModel.from_pretrained.
    tokenizer.save_pretrained(model_path)

    # Intermediate checkpoints are no longer needed once the model is saved.
    shutil.rmtree('./checkpoints/', ignore_errors=True)

    return model_path
    
    
def train_model(
    model_name: str,
    train_args: dict,
    seed: int = SEED,
    train_df: pd.DataFrame = train_df,
    test_df: pd.DataFrame = test_df,
    label_names: list[str] = label_names,
    save_model: bool = True,
    train_maskedLM: bool = False
) -> tuple[str, np.ndarray]:
    """Fine-tune a sequence classifier and evaluate it on the test split.

    Args:
        model_name: Hugging Face hub id (or local path) of the base checkpoint.
        train_args: Extra keyword arguments forwarded to ``TrainingArguments``
            (e.g. ``num_train_epochs``, ``learning_rate``).
        seed: Seed passed to ``transformers.set_seed`` before anything else.
        train_df: Training frame with ``sentence`` (str) and ``label`` (int) columns.
        test_df: Test frame with the same columns; also used as the eval set.
        label_names: Class names; position ``i`` names label id ``i``.
        save_model: When true, save the fine-tuned model under /kaggle/working.
        train_maskedLM: When true, first adapt the checkpoint with
            ``train_maskedlm`` and fine-tune from the adapted weights.

    Returns:
        ``(clf_report, conf_matrix)``: the text classification report and the
        confusion matrix computed on ``test_df``.
    """
    import shutil  # local import: only needed for the checkpoint cleanup below

    # Setup
    transformers.set_seed(seed)
    n_labels = len(label_names)
    id2label = dict(enumerate(label_names))
    label2id = {name: i for i, name in enumerate(label_names)}

    # Lowercase on copies. The defaults are the notebook-level frames, and
    # mutating them in place would silently change the data seen by later cells.
    train_df = train_df.assign(sentence=train_df['sentence'].str.lower())
    test_df = test_df.assign(sentence=test_df['sentence'].str.lower())

    # Keep `model_name` as the base identifier: the tokenizer and the output
    # directory derive from it even when the weights come from a local
    # masked-LM checkpoint.
    weights_source = model_name
    if train_maskedLM:
        # Trains an adapted checkpoint and returns the directory it was saved to.
        weights_source = train_maskedlm(model_name, train_df, test_df)

    model = AutoModelForSequenceClassification.from_pretrained(
        weights_source, num_labels=n_labels, id2label=id2label, label2id=label2id,
    )

    metric = evaluate.load("f1")
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=1)
        return metric.compute(predictions=predictions, references=labels, average='weighted')

    training_args = TrainingArguments(
        output_dir='./checkpoints/',
        evaluation_strategy="epoch",
        report_to='none',
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        **train_args
    )

    # Make Dataset and tokenize
    # Masked-LM adaptation does not alter the vocabulary, so the base tokenizer
    # is the right one for both weight sources.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def preprocess_function(example):
        # Truncate to the tokenizer's maximum length so unusually long
        # sentences cannot exceed what the model accepts.
        return tokenizer(example["sentence"], truncation=True)

    tokenized_train = Dataset.from_pandas(train_df, split='train').map(preprocess_function, batched=True)
    tokenized_test  = Dataset.from_pandas( test_df, split='test' ).map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Train and evaluate
    trainer = Trainer(
        model=model, args=training_args, train_dataset=tokenized_train, eval_dataset=tokenized_test,
        tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics,
    )

    trainer.evaluate()  # baseline metrics before any fine-tuning
    trainer.train()
    trainer.evaluate()
    predictions = trainer.predict(tokenized_test)

    y_test, y_pred = test_df.label, predictions.predictions.argmax(1)

    # Pin the label set so the report rows and the matrix shape always line up
    # with `label_names`, even if a class is absent from the test split.
    label_ids = list(range(n_labels))
    clf_report  = classification_report(
        y_test, y_pred, labels=label_ids, target_names=label_names, digits=5
    )
    conf_matrix = confusion_matrix(y_test, y_pred, labels=label_ids)

    if save_model:
        trainer.save_model(f"/kaggle/working/model_{model_name.replace('/', '-')}/")

    # Intermediate checkpoints are no longer needed after evaluation.
    shutil.rmtree('./checkpoints/', ignore_errors=True)
    return clf_report, conf_matrix

# Mixed Language Models

## BERT-base-uncased

In [6]:
# Fine-tune BERT-base-uncased with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name="google-bert/bert-base-uncased",
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [7]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

## BanglishBERT

In [8]:
# Fine-tune BanglishBERT with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name='csebuetnlp/banglishbert',
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [9]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

## HingBERT

In [10]:
# Fine-tune HingBERT with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name='l3cube-pune/hing-bert',
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [11]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

## DarijaBERT

In [12]:
# Fine-tune DarijaBERT-arabizi with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name="SI2M-Lab/DarijaBERT-arabizi",
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [13]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

# Multilingual models

## XLM-RoBERTa

In [14]:
# Fine-tune XLM-RoBERTa (base) with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name='xlm-roberta-base',
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [15]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

## mDeBERTa

In [16]:
# Fine-tune mDeBERTa-v3 (base) with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name='microsoft/mdeberta-v3-base',
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [17]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

## mBERT

In [18]:
# Fine-tune multilingual BERT (cased). The result must be bound to
# `conf_matrix`: the following cell plots that name, and a misspelled target
# would leave it showing the previous model's matrix.
clf_report, conf_matrix = train_model(
    'bert-base-multilingual-cased',
    train_args = {
        'num_train_epochs': EPOCHS,
        'learning_rate': 5e-6
    }
)

In [19]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

# Character Models

## CharBERT

In [20]:
# Fine-tune CharBERT (BERT variant) with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name='imvladikon/charbert-bert-wiki',
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [21]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

## CharBERT-RoBERTa

In [None]:
# Fine-tune CharBERT (RoBERTa variant) with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name='imvladikon/charbert-roberta-wiki',
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [None]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()

## CANINE

In [None]:
# Fine-tune CANINE-c with the shared epoch budget.
clf_report, conf_matrix = train_model(
    model_name='google/canine-c',
    train_args={'num_train_epochs': EPOCHS, 'learning_rate': 5e-6},
)

In [None]:
# Print the classification report, then plot the matrix held in `conf_matrix`.
print(clf_report)
ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=label_names).plot()