In [5]:
%load_ext autoreload
%autoreload 2
import json
import os
import random
from pathlib import Path

import datasets
import huggingface_hub as hf_hub
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    default_data_collator,
)

from utils import CLASS_MAP, to_context_free_format

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# # %%
# SECURITY: this cell used to contain a literal Weights & Biases API key and a
# Hugging Face access token. Secrets must never be stored in the notebook (not even
# in comments); the values that were pasted here should be treated as leaked and revoked.
# Export WANDB_API_KEY and HF_TOKEN in the shell that launches Jupyter (or run
# `wandb login` / `huggingface-cli login` once) and, if needed, uncomment the lines below.
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
# hf_hub.login(os.environ["HF_TOKEN"], add_to_git_credential=True)
# WANDB_PROJECT="emnlp_pragtag_2023"

In [7]:
# Seed Python's `random`, NumPy and PyTorch with one shared value.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

# The CUDA generators are only seeded when a CUDA device is usable.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [8]:
def preprocess(item):
    """Replace an example's label with its class id wrapped in a tensor.

    The value under ``item["label"]`` is looked up in ``CLASS_MAP``; the result
    is turned into a tensor and given a leading dimension via ``unsqueeze(0)``.
    The same ``item`` mapping is updated in place and returned.
    """
    class_id = CLASS_MAP[item["label"]]
    label_tensor = torch.tensor(class_id)
    item["label"] = label_tensor.unsqueeze(0)
    return item

In [9]:
# Tokenizer and model checkpoints.
# The tokenizer checkpoint id gets its own name so that `tokenizer` only ever refers to
# the loaded tokenizer object (previously the same name was first bound to the string).
tokenizer_name = "microsoft/deberta-base"
# Checkpoint the classification model is later initialised from.
model_name = "suryakiran786/emnlp_pragtag2023_domain_adapted_warm_restart"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, do_lower_case=True, force_download=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

In [10]:
def tokenize(examples):
    """Encode a batch of texts and carry the batch's labels along.

    Reads ``examples["txt"]`` and ``examples["label"]``. The texts are encoded
    with the module-level ``tokenizer``, padded/truncated to 512 tokens and
    returned as PyTorch tensors; the labels are attached unchanged under the
    ``"labels"`` key of the returned encoding.
    """
    encode_kwargs = dict(
        padding="max_length",
        max_length=512,
        truncation=True,
        return_tensors="pt",
    )
    encoded = tokenizer.batch_encode_plus(examples["txt"], **encode_kwargs)
    encoded["labels"] = examples["label"]
    return encoded

In [14]:
# Training inputs are resolved relative to the directory the notebook is run from.
train_file = Path.cwd() / "public_data" / "train_inputs_full.json"

In [15]:
# Run the project's `to_context_free_format` helper on the training file and build a
# `datasets.Dataset` from the list it returns.
context_free_records = to_context_free_format(train_file)
full_data = datasets.Dataset.from_list(context_free_records)

In [16]:
# Pandas view of the full dataset; the fold splitters in the training cell read its
# "label" and "report_id" columns.
full_data_df = full_data.to_pandas()

In [19]:
# Outer splitter: train vs. held-out folds (5 is GroupKFold's default, spelled out here).
train_valid_gkf = GroupKFold(n_splits=5)
# Inner splitter: divides a held-out fold into two halves (validation / test).
valid_test_gkf = GroupKFold(n_splits=2)

In [53]:
# Grouped cross-validation fine-tuning: one model is trained per outer fold.
#
# Fixes relative to the previous version of this cell:
#   * a missing comma after `per_device_eval_batch_size` made the whole cell a SyntaxError;
#   * `data_collator` was never defined (the model/Trainer classes are now imported in
#     the import cell);
#   * fold indices from GroupKFold are positional, so rows are selected with `.iloc`;
#   * the outer loop variable is no longer overwritten by the inner split.

# Same collator the Trainer falls back to when none is supplied; `tokenize` already pads
# every example to a fixed length, so no dynamic padding is required.
data_collator = default_data_collator

for idx, (train_idx, holdout_idx) in enumerate(
    train_valid_gkf.split(
        X=full_data_df, y=full_data_df["label"], groups=full_data_df["report_id"]
    )
):
    # Splitting data into train and held-out rows (grouped by report_id).
    train_df = full_data_df.iloc[train_idx]
    og_valid_df = full_data_df.iloc[holdout_idx]

    # Splitting the held-out rows into validation and test; only the first of the
    # two inner splits is used.
    valid_idx, test_idx = next(
        iter(
            valid_test_gkf.split(
                X=og_valid_df, y=og_valid_df["label"], groups=og_valid_df["report_id"]
            )
        )
    )
    valid_df = og_valid_df.iloc[valid_idx]
    test_df = og_valid_df.iloc[test_idx]

    # Converting all dataframes to HF datasets.
    train_ds = datasets.Dataset.from_pandas(train_df)
    valid_ds = datasets.Dataset.from_pandas(valid_df)
    test_ds = datasets.Dataset.from_pandas(test_df)
    data_dict = datasets.DatasetDict({"train": train_ds, "valid": valid_ds, "test": test_ds})

    # Encode labels, shuffle, then tokenize; every original column is dropped so that
    # only the tokenizer outputs and "labels" remain.
    # NOTE(review): `preprocess` stores each label with a leading dimension of 1, so
    # batches carry labels of shape (batch, 1) — confirm the model's loss accepts that.
    data_dict = (
        data_dict.map(preprocess)
        .shuffle(seed=seed)
        .map(tokenize, batched=True, remove_columns=train_ds.column_names)
    )
    print(data_dict)

    # A fresh model is loaded for every fold so folds do not share fine-tuned weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(CLASS_MAP),
        force_download=True,
    )

    # Fine-tuning configuration; checkpoints are written per fold and the best one
    # (lowest eval_loss) is reloaded at the end.
    training_args = TrainingArguments(
        output_dir=f"emnlp_pragtag2023_finetuned_split_test_{idx}",
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=2 * 16,
        gradient_accumulation_steps=1,
        learning_rate=2e-5,
        weight_decay=0.01,
        adam_epsilon=1e-6,
        num_train_epochs=3,
        warmup_ratio=0.1,
        save_total_limit=4,
        push_to_hub=True,
        save_strategy="epoch",
        run_name=model_name.split("/")[-1],
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,
        greater_is_better=False,
        report_to="wandb",
        hub_strategy="end",
        hub_private_repo=True,
    )

    # The "test" split is built above but is not consumed by this cell.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=data_dict["train"],
        eval_dataset=data_dict["valid"],
        data_collator=data_collator,
    )

    trainer.train()
    
    

Map:   0%|          | 0/1858 [00:00<?, ? examples/s]

Map:   0%|          | 0/236 [00:00<?, ? examples/s]

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

Map:   0%|          | 0/1858 [00:00<?, ? examples/s]

Map:   0%|          | 0/236 [00:00<?, ? examples/s]

Map:   0%|          | 0/232 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1858
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 236
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 232
    })
})


Map:   0%|          | 0/1859 [00:00<?, ? examples/s]

Map:   0%|          | 0/233 [00:00<?, ? examples/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/1859 [00:00<?, ? examples/s]

Map:   0%|          | 0/233 [00:00<?, ? examples/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1859
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 233
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 234
    })
})


Map:   0%|          | 0/1859 [00:00<?, ? examples/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/233 [00:00<?, ? examples/s]

Map:   0%|          | 0/1859 [00:00<?, ? examples/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/233 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1859
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 234
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 233
    })
})


Map:   0%|          | 0/1864 [00:00<?, ? examples/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Map:   0%|          | 0/1864 [00:00<?, ? examples/s]

Map:   0%|          | 0/234 [00:00<?, ? examples/s]

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1864
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 234
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 228
    })
})


Map:   0%|          | 0/1864 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

Map:   0%|          | 0/235 [00:00<?, ? examples/s]

Map:   0%|          | 0/1864 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

Map:   0%|          | 0/235 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1864
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 227
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 235
    })
})
