In [1]:
# Offline environment setup: drop the preinstalled fsspec, then install
# `datasets` from the wheels bundled under ../input (--no-index keeps pip
# from contacting a package index; --find-links points it at the local wheels).
# NOTE(review): presumably fsspec is removed so the install can bring in the
# version shipped with the wheels — confirm.
!pip uninstall fsspec -qq -y
!pip install --no-index --find-links ../input/hf-datasets/wheels datasets -qq

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-cloud 0.1.13 requires tensorflow<3.0,>=1.15.0, which is not installed.
dask-cudf 21.6.1+2.g101fc0fda4 requires cupy-cuda112, which is not installed.
cudf 21.6.1+2.g101fc0fda4 requires cupy-cuda110, which is not installed.
s3fs 2021.6.1 requires fsspec==2021.06.1, but you have fsspec 2021.6.0 which is incompatible.
pytorch-lightning 1.3.8 requires fsspec[http]!=2021.06.0,>=2021.05.0, but you have fsspec 2021.6.0 which is incompatible.
dask-cudf 21.6.1+2.g101fc0fda4 requires dask<=2021.5.1,>=2021.4.0, but you have dask 2021.6.2 which is incompatible.
dask-cudf 21.6.1+2.g101fc0fda4 requires distributed<=2021.5.1,>=2.22.0, but you have distributed 2021.6.2 which is incompatible.[0m


In [2]:
%env WANDB_DISABLED=True
import collections
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoTokenizer, default_data_collator

# Extra training data. NOTE(review): despite the TORCH_ prefix, this path points
# at a preprocessed MLQA Hindi CSV — the name looks like a leftover; confirm.
TORCH_EXTERNAL_CSV = "../input/external-data-mlqa-preprocessing/mlqa_hindi.csv"

# Competition data directory and the local pretrained checkpoint to fine-tune.
BASE_PATH = "../input/chaii-hindi-and-tamil-question-answering/"
MODEL_NAME = '../input/xlm-roberta-squad2/deepset/xlm-roberta-large-squad2'
# Alternative "base" checkpoint from the same input dataset (disabled):
#MODEL_NAME = '../input/xlm-roberta-squad2/deepset/xlm-roberta-base-squad2'

# Competition frames: training rows, test rows, and the submission template.
df_train_base = pd.read_csv(BASE_PATH + "train.csv")
df_test = pd.read_csv(BASE_PATH + "test.csv")
df_sub = pd.read_csv(BASE_PATH + "sample_submission.csv")

max_length = 384 # Maximum length of one tokenized feature (question plus context).
doc_stride = 128 # Overlap between consecutive windows when a long context has to be split.
batch_size = 4 # Per-device batch size used for both training and evaluation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# True when the tokenizer pads on the right; the feature-preparation functions
# use it to decide whether the question or the context is the first sequence.
pad_on_right = tokenizer.padding_side == "right"

env: WANDB_DISABLED=True


In [3]:
# Load the external rows, turn their positional index into an `id` column,
# prefix it with "id" so every id is a string ("id0", "id1", ...), and append
# the result below the competition training rows.
df_torch = pd.read_csv(TORCH_EXTERNAL_CSV).reset_index().rename(columns={'index': 'id'})
df_torch = df_torch.assign(id='id' + df_torch['id'].astype(str))
df_train_base = pd.concat([df_train_base, df_torch])

In [4]:
# Inspect the combined training frame (competition rows followed by the external rows).
df_train_base

Unnamed: 0,id,context,question,answer_text,answer_start,language
0,903deec17,ஒரு சாதாரண வளர்ந்த மனிதனுடைய எலும்புக்கூடு பின...,மனித உடலில் எத்தனை எலும்புகள் உள்ளன?,206,53,tamil
1,d9841668c,காளிதாசன் (தேவநாகரி: कालिदास) சமஸ்கிருத இலக்கி...,காளிதாசன் எங்கு பிறந்தார்?,காசுமீரில்,2358,tamil
2,29d154b56,சர் அலெக்ஸாண்டர் ஃபிளெமிங் (Sir Alexander Flem...,பென்சிலின் கண்டுபிடித்தவர் யார்?,சர் அலெக்ஸாண்டர் ஃபிளெமிங்,0,tamil
3,41660850a,"குழந்தையின் அழுகையை நிறுத்தவும், தூங்க வைக்கவ...",தமிழ்நாட்டில் குழந்தைகளை தூங்க வைக்க பாடும் பா...,தாலாட்டு,68,tamil
4,b29c82c22,சூரியக் குடும்பம் \nசூரியக் குடும்பம் (Solar S...,பூமியின் அருகில் உள்ள விண்மீன் எது?,சூரியனும்,585,tamil
...,...,...,...,...,...,...
5420,id5420,"सिडनी शेल्डन (11 फरवरी,1917 - 30 जनवरी 2007) ए...",सिडनी शेल्डन की राष्ट्रीयता क्या थी?,अमेरिकी,48,hindi
5421,id5421,राज्यों को काउंटियों या काउंटी-समकक्ष में विभा...,"अतीत में, सार्वजनिक शिक्षा और सार्वजनिक स्वास्...",राज्य,590,hindi
5422,id5422,"89 वें अकादमी पुरस्कार (ऑस्कर 2017) समारोह, मो...",पुरस्कारों का आयोजन किस दिन किया गया?,"26 फरवरी, 2017",141,hindi
5423,id5423,डीज़ल उत्सर्जन तरल (अंग्रेजी:Diesel exhaust fl...,डीईएफ का क्या अर्थ होता है?,(अंग्रेजी:Diesel exhaust fluid,19,hindi


In [5]:
def prepare_train_features(examples):
    """Tokenize a batch of QA examples into windows labelled with answer token positions.

    Reads the module-level ``tokenizer``, ``pad_on_right``, ``max_length`` and
    ``doc_stride``. ``examples`` must provide ``"question"``, ``"context"`` and
    ``"answers"`` columns, where each answers entry has ``"answer_start"`` and
    ``"text"`` lists. The ``"question"`` column is left-stripped in place.

    Every window produced by the tokenizer gets one ``start_positions`` /
    ``end_positions`` entry: the token span of the first answer when that
    answer lies completely inside the window's context tokens, otherwise the
    index of the CLS token.

    Returns the tokenizer output without ``overflow_to_sample_mapping`` and
    ``offset_mapping``, plus the two label lists.
    """
    examples["question"] = [text.lstrip() for text in examples["question"]]

    # Which column goes first, which side gets truncated, and which
    # sequence id marks context tokens all follow from the padding side.
    if pad_on_right:
        first_col, second_col, truncate_side, context_id = "question", "context", "only_second", 1
    else:
        first_col, second_col, truncate_side, context_id = "context", "question", "only_first", 0

    tokenized_examples = tokenizer(
        examples[first_col],
        examples[second_col],
        truncation=truncate_side,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # window -> source example, and per-token character offsets.
    window_to_example = tokenized_examples.pop("overflow_to_sample_mapping")
    all_offsets = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for window, offsets in enumerate(all_offsets):
        input_ids = tokenized_examples["input_ids"][window]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(window)
        answers = examples["answers"][window_to_example[window]]

        # No answer recorded: label the window with the CLS token.
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token of the context inside this window.
        first_ctx = 0
        while sequence_ids[first_ctx] != context_id:
            first_ctx += 1
        last_ctx = len(input_ids) - 1
        while sequence_ids[last_ctx] != context_id:
            last_ctx -= 1

        answer_in_window = offsets[first_ctx][0] <= start_char and offsets[last_ctx][1] >= end_char
        if not answer_in_window:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk inwards: step past the last token starting at or before the
        # answer start, and past the first token ending at or after its end.
        while first_ctx < len(offsets) and offsets[first_ctx][0] <= start_char:
            first_ctx += 1
        start_positions.append(first_ctx - 1)
        while offsets[last_ctx][1] >= end_char:
            last_ctx -= 1
        end_positions.append(last_ctx + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

def prepare_validation_features(examples):
    """Tokenize a batch of QA examples into windows for inference.

    Reads the module-level ``tokenizer``, ``pad_on_right``, ``max_length`` and
    ``doc_stride``. ``examples`` must provide ``"id"``, ``"question"`` and
    ``"context"`` columns; the ``"question"`` column is left-stripped in place.

    Returns the tokenizer output without ``overflow_to_sample_mapping``, with:
      * ``example_id`` - the id of the example each window came from;
      * ``offset_mapping`` - kept, but every entry that does not belong to a
        context token is replaced by ``None``.
    """
    examples["question"] = [text.lstrip() for text in examples["question"]]

    # Column order, truncation side and the sequence id of context tokens
    # all depend on the tokenizer's padding side.
    if pad_on_right:
        first_col, second_col, truncate_side, context_index = "question", "context", "only_second", 1
    else:
        first_col, second_col, truncate_side, context_index = "context", "question", "only_first", 0

    tokenized_examples = tokenizer(
        examples[first_col],
        examples[second_col],
        truncation=truncate_side,
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    window_to_example = tokenized_examples.pop("overflow_to_sample_mapping")
    window_count = len(tokenized_examples["input_ids"])

    # Remember which example every window was cut from.
    tokenized_examples["example_id"] = [
        examples["id"][window_to_example[window]] for window in range(window_count)
    ]

    # Blank out offsets of non-context tokens so they can be recognised later.
    for window in range(window_count):
        sequence_ids = tokenized_examples.sequence_ids(window)
        masked_offsets = []
        for position, span in enumerate(tokenized_examples["offset_mapping"][window]):
            masked_offsets.append(span if sequence_ids[position] == context_index else None)
        tokenized_examples["offset_mapping"][window] = masked_offsets

    return tokenized_examples


def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    """Convert per-window start/end logits into one answer string per example.

    Args:
        examples: the original examples. Must support ``examples["id"]``
            (sequence of ids), ``len()`` and iteration yielding rows with
            ``"id"`` and ``"context"`` keys.
        features: the tokenized windows. Must support ``len()``, iteration and
            integer indexing; each row needs ``"example_id"`` and
            ``"offset_mapping"``, where an entry is ``None`` for tokens that
            are not part of the context and a ``(start_char, end_char)`` pair
            otherwise.
        raw_predictions: pair ``(all_start_logits, all_end_logits)``, each
            indexed by window position.
        n_best_size: how many of the highest start and end logits are
            combined per window.
        max_answer_length: maximum answer length, counted in tokens.

    Returns:
        ``collections.OrderedDict`` mapping each example id to the text of its
        best-scoring span, or ``""`` when no valid span exists.
    """
    all_start_logits, all_end_logits = raw_predictions

    # Map every example position to the positions of the windows cut from it.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in enumerate(tqdm(examples)):
        context = example["context"]
        valid_answers = []

        for feature_index in features_per_example[example_index]:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # Lets us map token positions back to character spans of the context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Indices of the `n_best_size` largest logits, best first.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()

            for start_index in start_indexes:
                # A start that is out of bounds or outside the context can
                # never yield a valid span, whatever the end index is.
                if start_index >= len(offset_mapping) or offset_mapping[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if end_index >= len(offset_mapping) or offset_mapping[end_index] is None:
                        continue
                    # Reject spans of negative length or longer than max_answer_length tokens.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if valid_answers:
            # max() returns the first of equally scored candidates, which is
            # the same element a stable descending sort would put first.
            best_answer = max(valid_answers, key=lambda candidate: candidate["score"])
        else:
            # No usable span at all: fall back to an empty answer so every
            # example id still gets a prediction.
            best_answer = {"text": "", "score": 0.0}

        predictions[example["id"]] = best_answer["text"]

    return predictions


def submit(trainer, df_test):
    """Predict answers for ``df_test`` and write them to ``submission.csv``.

    The test frame is tokenized with ``prepare_validation_features``, run
    through ``trainer.predict`` and decoded with ``postprocess_qa_predictions``.
    The predictions are stored in the ``PredictionString`` column of the
    module-level ``df_sub`` frame (looked up by its ``id`` column), which is
    then saved without its index and previewed with ``display``.
    """
    test_dataset = Dataset.from_pandas(df_test)

    test_features = test_dataset.map(
        prepare_validation_features,
        batched=True,
        remove_columns=test_dataset.column_names,
    )
    # Identity map whose only purpose is to drop the bookkeeping columns
    # before the features are handed to the model.
    model_inputs = test_features.map(
        lambda example: example,
        remove_columns=['example_id', 'offset_mapping'],
    )

    prediction_output = trainer.predict(model_inputs)

    # Expose every column of the full feature set again for post-processing.
    every_column = list(test_features.features.keys())
    test_features.set_format(type=test_features.format["type"], columns=every_column)

    answers_by_id = postprocess_qa_predictions(
        test_dataset, test_features, prediction_output.predictions
    )

    df_sub['PredictionString'] = df_sub['id'].apply(lambda example_id: answers_by_id[example_id])
    df_sub.to_csv('submission.csv', index=False)
    display(df_sub.head())

def jaccard(row):
    """Word-level Jaccard similarity between ``row['answer']`` and ``row['prediction']``.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Returns:
        A float in ``[0.0, 1.0]``. When neither string contains a word the
        union is empty; the two inputs are then treated as identical and 1.0
        is returned instead of raising ``ZeroDivisionError``.
    """
    answer_words = set(row['answer'].lower().split())
    predicted_words = set(row['prediction'].lower().split())
    union_size = len(answer_words | predicted_words)
    if union_size == 0:
        return 1.0
    return float(len(answer_words & predicted_words)) / union_size

def convert_answers(row):
    """Wrap a row's ``answer_start`` and ``answer_text`` into single-element lists.

    Returns a dict with the keys ``answer_start`` and ``text``, the layout
    that ``prepare_train_features`` reads from the ``answers`` column.
    """
    start, text = row['answer_start'], row['answer_text']
    return dict(answer_start=[start], text=[text])


In [6]:
# Pack each row's answer into the {'answer_start': [...], 'text': [...]} dict
# that prepare_train_features reads from the `answers` column.
df_train_base['answers'] = df_train_base[['answer_start', 'answer_text']].apply(convert_answers, axis=1)
# Deterministic shuffle so the hold-out rows below are a random sample.
df_train_base = df_train_base.sample(frac=1, random_state=42).copy()

# Number of shuffled rows held out for validation.
VALID_SIZE = 64
df_train = df_train_base[:-VALID_SIZE].reset_index(drop=True)
df_valid = df_train_base[-VALID_SIZE:].reset_index(drop=True)

train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)
tokenized_train_ds = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
# Drop the validation dataset's own columns (previously this borrowed
# train_dataset.column_names, which only worked because both share a schema).
tokenized_valid_ds = valid_dataset.map(prepare_train_features, batched=True, remove_columns=valid_dataset.column_names)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [7]:
def get_trainer():
    """Build a ``Trainer`` for fine-tuning the checkpoint at ``MODEL_NAME``.

    Loads the question-answering model, configures a single training epoch
    with evaluation and checkpointing at the end of each epoch, and wires in
    the module-level tokenized train/validation datasets and tokenizer.
    Outputs are written under the ``chaii-qa`` directory.
    """
    qa_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)

    training_args = TrainingArguments(
        output_dir="chaii-qa",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=1,
        weight_decay=0.01,
    )

    return Trainer(
        model=qa_model,
        args=training_args,
        train_dataset=tokenized_train_ds,
        eval_dataset=tokenized_valid_ds,
        data_collator=default_data_collator,
        tokenizer=tokenizer,
    )


In [8]:
# Load the model and build the Trainer configured in get_trainer().
trainer = get_trainer()

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [9]:
# Fine-tune with the arguments set in get_trainer() (num_train_epochs=1).
trainer.train()

Epoch,Training Loss,Validation Loss
0,0.6209,0.749131


TrainOutput(global_step=1346, training_loss=0.6502959660931116, metrics={'train_runtime': 3272.8511, 'train_samples_per_second': 0.411, 'total_flos': 0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 133029888, 'init_mem_gpu_alloc_delta': 2235376640, 'init_mem_cpu_peaked_delta': 1828618240, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 134823936, 'train_mem_gpu_alloc_delta': 6733405184, 'train_mem_cpu_peaked_delta': 3071717376, 'train_mem_gpu_peaked_delta': 5504041984})

In [10]:
# Tokenize the validation split again, this time with prepare_validation_features,
# so every window carries its example_id and context-only offset_mapping.
validation_features = valid_dataset.map(prepare_validation_features, batched=True,remove_columns=valid_dataset.column_names)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [11]:
# Number of tokenized windows produced from the validation examples.
len(validation_features)

179

In [12]:
# Show the columns and row count of the validation feature dataset.
validation_features

Dataset({
    features: ['attention_mask', 'example_id', 'input_ids', 'offset_mapping'],
    num_rows: 179
})

In [13]:
# Identity map used only to drop the bookkeeping columns (example_id,
# offset_mapping) before the features are passed to trainer.predict.
valid_feats_small = validation_features.map(lambda example: example, remove_columns=['example_id', 'offset_mapping'])
valid_feats_small

HBox(children=(FloatProgress(value=0.0, max=179.0), HTML(value='')))




Dataset({
    features: ['attention_mask', 'input_ids'],
    num_rows: 179
})

In [14]:
# Run the fine-tuned model over the validation windows; the `.predictions`
# attribute of the result is what postprocess_qa_predictions consumes below.
raw_predictions = trainer.predict(valid_feats_small)

In [15]:
# Aliases for the validation examples and their tokenized windows.
examples = valid_dataset
features = validation_features

# NOTE(review): this example -> window index repeats the mapping that
# postprocess_qa_predictions builds internally, and nothing in the visible
# cells reads example_id_to_index or features_per_example afterwards —
# looks like leftover exploration code; confirm before removing.
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)
    
# Decode the logits into one predicted answer string per validation example id.
final_predictions = postprocess_qa_predictions(valid_dataset, validation_features, raw_predictions.predictions)
# Ground truth: the first answer text of every validation example.
references = [{"id": ex["id"], "answer": ex["answers"]['text'][0]} for ex in valid_dataset]

Post-processing 64 example predictions split into 179 features.


HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))




In [16]:
# One row per validation example: reference answer, predicted answer
# (looked up by example id) and their word-level Jaccard score.
res = (
    pd.DataFrame(references)
    .assign(prediction=lambda frame: frame['id'].apply(lambda example_id: final_predictions[example_id]))
    .assign(jaccard=lambda frame: frame[['answer', 'prediction']].apply(jaccard, axis=1))
)
res

Unnamed: 0,id,answer,prediction,jaccard
0,id5121,छः,छः,1.000000
1,id249,130000,130000,1.000000
2,id1498,तेल को फ़िल्टर और प्रक्षालित किया जाता है।,तेल को फ़िल्टर और प्रक्षालित किया जाता है।,1.000000
3,id1710,2011,2011,1.000000
4,id927,2010,2010,1.000000
...,...,...,...,...
59,id2658,14:30,14:30,1.000000
60,id4077,किम नदी (निषिद्ध नदी) के मुहाने,वियतनाम के उत्तर-पूर्वी तटीय इलाके में,0.090909
61,id4112,यूनिक्स और वी एम एस,यूनिक्स और वी एम एस आधारित,0.833333
62,id4276,30,30,1.000000


In [17]:
# Mean word-level Jaccard score over the validation examples.
res['jaccard'].mean()

0.698426037810229

In [18]:
# Predict on the test set and write submission.csv (see submit()).
submit(trainer, df_test)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=67.0), HTML(value='')))


Post-processing 5 example predictions split into 67 features.


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




Unnamed: 0,id,PredictionString
0,22bff3dec,येलन चीन से हैं। उनकी मां येलन गुट्टा
1,282758170,20 अप्रैल 2010
2,d60987e0e,१२ मार्च १८२४
3,f99c770dc,13
4,40dec1964,சுவாமிநாதன் மற்றும் வர்கீஸ் குரியன்
