In [1]:
!pip install numpy seqeval torch fairseq "transformers[torch]" datasets evaluate more_itertools sentencepiece protobuf
!pip install -U transformers[torch] datasets evaluate accelerate tokenizers

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting fairseq
  Downloading fairseq-0.12.2.tar.gz (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting hydra-core<1.1,>=1.0.7 (from fairseq)
  D

In [1]:
import os
os.environ['WANDB_DISABLED']="true"
import math
import evaluate
import numpy as np
from random import sample
from datasets import Dataset
from functools import partial
from itertools import groupby, chain
from accelerate import notebook_launcher
from more_itertools import split_at, chunked
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)



In [2]:
label2id = {
    "B-repair_R": 0,
    "I-repair_R": 1,
    "B-filler_R": 2,
    "I-filler_R": 3,
    "B-repeat_R": 4,
    "I-repeat_R": 5,
    "B-edit_R": 6,
    "I-edit_R": 7,
    "B-false_R": 8,
    "I-false_R": 9,
    "B-pet_R": 10,
    "I-pet_R": 11,
    "B-Alteration": 12,
    "I-Alteration": 13,
    "O": 14,
}
id2label = {v: k for k, v in label2id.items()}

In [3]:
def generate_examples(fnames, tokenizer):
    for fname in fnames:
        with open(fname, "r") as f:
            for wordtags in (
                [word.rstrip().rsplit("\t", maxsplit=1) for word in sent]
                for sent in split_at(f, str.isspace)
            ):
                words = [word for word, _ in wordtags]
                tags = [tag for _, tag in wordtags]
                tokenized = tokenizer(words, is_split_into_words=True, truncation=True)
                input_ids = tokenized["input_ids"]
                attention_mask = tokenized["attention_mask"]
                for i in range(math.ceil(len(input_ids) / 510)):
                    _input_ids = (
                        [input_ids[0]]
                        + input_ids[1 + (i * 510) : 511 + (i * 510)]
                        + [input_ids[-1]]
                    )
                    _attention_mask = (
                        [attention_mask[0]]
                        + attention_mask[1 + (i * 510) : 511 + (i * 510)]
                        + [attention_mask[-1]]
                    )
                    _labels = list(
                        chain.from_iterable(
                            (
                                [-100]
                                if tokenid == None
                                else (label2id[tags[tokenid]] for _ in inputids)
                            )
                            for tokenid, inputids in groupby(
                                range(len(_input_ids)),
                                key=lambda x: tokenized.token_to_word(
                                    batch_or_token_index=x
                                ),
                            )
                        )
                    )

                    yield {
                        "words": words,
                        "tags": tags,
                        "input_ids": _input_ids,
                        "attention_mask": _attention_mask,
                        "labels": _labels,
                    }

In [4]:
def random_cutter(examples):
    for examplei in range(len(examples["input_ids"])):
        input_ids = examples["input_ids"][examplei]
        labels = examples["labels"][examplei]
        while True:
            lo, hi = sorted(
                sample(
                    [
                        i
                        for i, label in enumerate(labels)
                        if label == label2id["O"] or label == -100
                    ],
                    2,
                )
            )
            included_indices = [
                i
                for i in range(len(labels))
                if i in range(lo, hi + 1) or labels[i] == -100
            ]
            if len(included_indices) <= 512:
                break
        examples["input_ids"][examplei] = [input_ids[i] for i in included_indices]
        examples["labels"][examplei] = [labels[i] for i in included_indices]
    return examples

In [5]:
def compute_metrics(p, seqeval=evaluate.load("seqeval")):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (_, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [6]:
modelname = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(modelname)
train_data = Dataset.from_generator(
    partial(
        generate_examples,
        [
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Bengali/bengali_train.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Hindi/hindi_train.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Kannada/kannada_train.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Marathi/marathi_train.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Tamil/tamil_train.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Telugu/telugu_train.tsv",
            "/kaggle/input/disfluency-shared-task/Train-data-part2/Bengali/bengali_train_2.tsv",
            "/kaggle/input/disfluency-shared-task/Train-data-part2/Hindi/hindi_train_2.tsv",
            "/kaggle/input/disfluency-shared-task/Train-data-part2/Kannada/kannada_train_2.tsv",
            "/kaggle/input/disfluency-shared-task/Train-data-part2/Marathi/marathi_train_2.tsv",
            "/kaggle/input/disfluency-shared-task/Train-data-part2/Tamil/tamil_train_2.tsv",
            "/kaggle/input/disfluency-shared-task/Train-data-part2/Tamil/tamil_train_2.tsv"
            
        ],
        tokenizer,
    )
)
# train_data.set_transform(random_cutter)
dev_data = Dataset.from_generator(
    partial(
        generate_examples,
        [
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Bengali/bengali_dev.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Hindi/hindi_dev.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Kannada/kannada_dev.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Marathi/marathi_dev.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Tamil/tamil_dev.tsv",
            "/kaggle/input/disfluency-shared-task/Train-Dev-data-part1/Telugu/telugu_dev.tsv"
        ],
        tokenizer
    )
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained(
    modelname, num_labels=len(id2label), id2label=id2label, label2id=label2id
)
trainingargs = TrainingArguments(
    output_dir="training_outputs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    learning_rate=1e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=3,
    fp16=True
)
trainer = Trainer(
    model=model,
    args=trainingargs,
    train_dataset=train_data,
    eval_dataset=dev_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Generating train split: 0 examples [00:00, ? examples/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [7]:
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3101,0.237561,0.690789,0.643087,0.666085,0.933065
2,0.1991,0.190373,0.770347,0.788394,0.779266,0.950882
3,0.155,0.157063,0.856842,0.809218,0.832349,0.961636
4,0.1289,0.150679,0.835844,0.815495,0.825544,0.958627
5,0.1076,0.155153,0.859323,0.862349,0.860833,0.966038
6,0.0966,0.172027,0.857383,0.860665,0.85902,0.965439
7,0.0807,0.17415,0.861694,0.849028,0.855314,0.966024
8,0.0723,0.1625,0.90767,0.844434,0.874911,0.969785
9,0.0613,0.167499,0.866071,0.86143,0.863745,0.965787
10,0.0597,0.187722,0.857986,0.867708,0.86282,0.965063


TrainOutput(global_step=27720, training_loss=0.1069612191869067, metrics={'train_runtime': 6954.7886, 'train_samples_per_second': 63.75, 'train_steps_per_second': 3.986, 'total_flos': 2.538292270771404e+16, 'train_loss': 0.1069612191869067, 'epoch': 15.0})

In [8]:
trainer.save_model('final_model')

In [89]:
model = AutoModelForTokenClassification.from_pretrained('final_model')
pipe = pipeline('ner', model=model, tokenizer=tokenizer)
for lang in ['Bengali', 'Hindi', 'Kannada', 'Marathi', 'Tamil', 'Telugu']:
    with open(f'/kaggle/input/disfluency-shared-task/Test-Blind/{lang}/{lang.lower()}_test_blind.tsv', 'r') as f: words = list(map(str.strip, f))
    texts = [' '.join(w) for w in chunked(words, 200)]
    with open(f'{lang.lower()}_out.tsv', 'w') as f:
        for text in texts:
            text_tokenized = tokenizer(text)
            words = text.split(' ')
            tags = ['O']*len(words)
            outs = pipe([text])
            for out in outs[0]: tags[text_tokenized.token_to_word(batch_or_token_index=out['index'])] = out['entity']
            for a, b in zip(words, tags): f.write(f'{a}\t{b}\n')