## Importing all packages and setting environment for GPU usage

In [None]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import DistilBertTokenizer, DistilBertForTokenClassification
import numpy as np
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score, classification_report
import torch
import argparse
from os import path
import torch.nn as nn
import os
import random

# Turn off the Weights & Biases integration (read by the transformers Trainer).
os.environ["WANDB_DISABLED"] = "true"
# Number GPUs by PCI bus id so the index below matches what nvidia-smi reports.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose only physical GPU 3 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


# Because a single GPU is exposed through CUDA_VISIBLE_DEVICES, that GPU is
# renumbered as ordinal 0 inside this process; "cuda:3" would be an invalid
# device ordinal here.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

print(torch.cuda.is_available())

## Helper functions and model selection for training

In [None]:
def get_fluent(s,label):
    """Return the fluent part of a sentence.

    `s` is split on single spaces and each piece is paired positionally with
    the entries of `label`; pieces whose label equals 0 are kept and re-joined
    with single spaces. Pairing stops at the shorter of the two sequences.
    """
    tokens = s.split(" ")
    kept = [token for token, tag in zip(tokens, label) if tag == 0]
    return " ".join(kept)
        

In [None]:
task = "dc"
# Candidate checkpoints: keep exactly one uncommented. The muril line used to be
# a live assignment that was immediately overwritten by the next line.
#model_checkpoint = "google/muril-base-cased"
model_checkpoint = "xlm-roberta-base"
#model_checkpoint = "bert-base-multilingual-cased"

path_to_dataset = "./data/"

print("Model Name:", model_checkpoint)
experiment_id = "123456"  # used in the checkpoint output directory name
no_of_epochs = 40

batch_size = 16
label_list = ['is_fluent', 'is_disfluent'] # 0 -> isFluent , 1 -> isDisfluent
splits = [80,10,10]  # train / valid / test percentages

# Filled by get_dataset() with the held-out test sentences and their
# ground-truth fluent versions.
test_sentences = []
gt_fluent_sentences = []

def get_dataset(path):
    """Load the parallel sentence/label files and split them into train/valid/test.

    Reads ``{path}/data.dis`` and ``{path}/data.labels`` line by line, pairs the
    lines positionally (extra lines in the longer file are ignored), shuffles
    the pairs with the module-level ``random`` state and partitions them by the
    percentages in the module-level ``splits`` list. Each sentence line is
    split on whitespace into tokens; each label line is split on whitespace
    and converted to ints.

    Side effects: for every example placed in the test split, the space-joined
    sentence is appended to the module-level ``test_sentences`` and its
    ``get_fluent`` reduction to ``gt_fluent_sentences``.

    Args:
        path: Directory containing ``data.dis`` and ``data.labels``.

    Returns:
        A ``DatasetDict`` with ``'train'``, ``'valid'`` and ``'test'`` entries,
        each holding a ``'disfluent'`` column (token lists) and a ``'labels'``
        column (int lists).

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If a label line contains a non-integer token.
    """
    # Decode explicitly as UTF-8 so non-ASCII text does not depend on the
    # machine's locale.
    with open(f"{path}/data.dis", 'r', encoding="utf-8") as dis_x, \
            open(f"{path}/data.labels", 'r', encoding="utf-8") as labels_x:
        pairs = list(zip(dis_x.readlines(), labels_x.readlines()))

    # Shuffling the pairs directly keeps sentences and labels aligned and,
    # unlike unzipping afterwards, also works when the files are empty.
    random.shuffle(pairs)
    total_size = len(pairs)

    # Exclusive upper index of each split, computed once.
    train_end = round(splits[0] * total_size / 100)
    valid_end = round(sum(splits[:2]) * total_size / 100)
    test_end = round(sum(splits) * total_size / 100)

    train_dict = {'labels': [], 'disfluent': []}
    valid_dict = {'labels': [], 'disfluent': []}
    test_dict  = {'labels': [], 'disfluent': []}

    for i, (disfluent, labels) in enumerate(pairs):
        disfluent = disfluent.strip().split()
        labels = list(map(int, labels.strip().split()))

        if i < train_end:
            target = train_dict
        elif i < valid_end:
            target = valid_dict
        elif i < test_end:
            target = test_dict
            sentence = " ".join(disfluent)
            test_sentences.append(sentence)
            gt_fluent_sentences.append(get_fluent(sentence, labels))
        else:
            # Only reachable when the percentages in `splits` sum to < 100.
            continue

        target['disfluent'].append(disfluent)
        target['labels'].append(labels)

    train_dataset = Dataset.from_dict(train_dict)
    valid_dataset = Dataset.from_dict(valid_dict)
    test_dataset = Dataset.from_dict(test_dict)

    return DatasetDict({'train': train_dataset, 'valid': valid_dataset, 'test': test_dataset})


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and project word labels onto tokens.

    Uses the module-level ``tokenizer`` on ``examples["disfluent"]`` and builds,
    for every sequence, one label per produced token:

    * tokens without a word id (special tokens) get -100, which the loss ignores;
    * the first token of a word gets that word's label;
    * further tokens of the same word get the word's label when the
      module-level ``label_all_tokens`` flag is truthy, otherwise -100.

    Returns the tokenizer output with the aligned labels stored under "labels".
    """
    encoded = tokenizer(examples["disfluent"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples["labels"]):
        row_labels = []
        last_word = None
        for current_word in encoded.word_ids(batch_index=row):
            if current_word is None:
                # Special token: excluded from the loss.
                token_label = -100
            elif current_word != last_word or label_all_tokens:
                # Start of a new word, or a continuation that inherits the label.
                token_label = word_labels[current_word]
            else:
                # Continuation token that should not be scored.
                token_label = -100
            row_labels.append(token_label)
            last_word = current_word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

# Load and split the de-DE partition; get_dataset also appends the test
# sentences to test_sentences / gt_fluent_sentences. The result is printed
# as a summary of the three splits.
datasets = get_dataset(path="/home/development/vineet/DDP_1/presto_v1/test_partitions/de-DE/")
print(datasets)

In [None]:
# Total number of examples across the three splits.
print(f"Data Size = {sum(len(datasets[split]) for split in ('train', 'valid', 'test'))}")

In [None]:
# Show the first test example as a single space-joined sentence.
" ".join(datasets['test'][0]['disfluent'])

In [None]:
# Show the first ten collected test sentences.
test_sentences[:10]

## Setting up training parameters

In [None]:
# Tokenizer belonging to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Read by tokenize_and_align_labels: when True, every token of a word carries
# the word's label instead of only the first token.
label_all_tokens = True
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

# Checkpoint name without the organisation prefix.
model_name = model_checkpoint.split("/")[-1]
# Token-classification head with one output per entry in label_list.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
)

args = TrainingArguments(
    output_dir=f"checkpoints/{experiment_id}",
    # Optimisation
    num_train_epochs=no_of_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate, log and checkpoint on the same step interval.
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
    save_steps=1000,
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [None]:
# Display the loaded model as the cell output.
model

In [None]:
# Display the tokenized training split.
tokenized_datasets['train']

In [None]:
# Show the 'disfluent' token lists of the first five training examples.
tokenized_datasets['train']['disfluent'][:5]

In [None]:
# Print the full TrainingArguments configuration built above.
print("Training Args", args)

In [None]:
def compute_metrics(p):
    """Compute token-level accuracy and per-class precision / recall / F1.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-token
            class scores (the argmax over axis 2 is taken as the predicted
            class); ``labels`` holds the gold label ids, where -100 marks
            positions that must be ignored.

    Returns:
        A dict of plain floats with the keys ``accuracy``, ``precision0``,
        ``precision1``, ``recall0``, ``recall1``, ``f1score0`` and ``f1score1``
        (suffix = class id).
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # Flatten to 1-d while dropping the ignored (-100) positions.
    keep = labels != -100
    true_predictions = predictions[keep]
    true_labels = labels[keep]

    # Pin the class ids so two entries are always returned, even when one
    # class is absent from both the labels and the predictions; otherwise the
    # [1] lookups below raise IndexError.
    precision, recall, f1score, _ = precision_recall_fscore_support(
        true_labels, true_predictions, labels=[0, 1], zero_division=0
    )
    return {
        'accuracy': float(accuracy_score(true_labels, true_predictions)),
        'precision0': float(precision[0]),
        'precision1': float(precision[1]),
        'recall0': float(recall[0]),
        'recall1': float(recall[1]),
        'f1score0': float(f1score[0]),
        'f1score1': float(f1score[1]),
    }

## Start Training

In [None]:
# Collator for token-classification batches, built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the configuration above.
trainer.train()

## Run Evaluation on Test Sentences

In [None]:
# Evaluate on test sentences
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# 1-d prediction & true label
true_predictions = [
    p for prediction, label in zip(predictions, labels) for (p, l) in zip(prediction, label) if l != -100
]
true_labels = [
    l for prediction, label in zip(predictions, labels) for (p, l) in zip(prediction, label) if l != -100 
]

results = precision_recall_fscore_support(true_labels, true_predictions, zero_division=0)
print({
    'precision': results[0],
    'recall': results[1],
    'f1score': results[2]
})
print("Confusion Matrix:")
print(confusion_matrix(true_labels, true_predictions, normalize='all'))
print(classification_report(true_labels, true_predictions, target_names=label_list, zero_division=0))

In [None]:
# Display the test confusion matrix, normalized over all entries.
confusion_matrix(true_labels, true_predictions, normalize='all')