In [1]:
import warnings
import os
# TRANSFORMERS BASIC KFOLD
from kaggle_scripts.evaluation.classification_metrics import compute_metrics_for_classification
from kaggle_scripts.evaluation.regression_metrics import compute_metrics_for_regression
from kaggle_scripts.preprocessing.transformers import KaggleTokenizer
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import cohen_kappa_score
from tokenizers import AddedToken

from kaggle_scripts.comp_config import CFG, PATHS

2024-05-25 20:20:03.439089: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-25 20:20:03.441277: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-05-25 20:20:03.462943: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Tokenization
We use `max_length = 1024` to avoid truncating majority of essays.

In [2]:
# class Tokenize(object):
#     def __init__(self, train, valid, tokenizer):
#         self.tokenizer = tokenizer
#         self.train = train
#         self.valid = valid
#         
#     def get_dataset(self, df):
#         ds = Dataset.from_dict({
#                 'essay_id': [e for e in df['essay_id']],
#                 'full_text': [ft for ft in df['full_text']],
#                 'label': [s for s in df['label']],
#             })
#         return ds
#         
#     def tokenize_function(self, example):
#         tokenized_inputs = self.tokenizer(
#             example['full_text'], truncation=True, max_length=CFG.max_length
#         )
#         return tokenized_inputs
#     
#     def __call__(self):
#         train_ds = self.get_dataset(self.train)
#         valid_ds = self.get_dataset(self.valid)
#         
#         tokenized_train = train_ds.map(
#             self.tokenize_function, batched=True
#         )
#         tokenized_valid = valid_ds.map(
#             self.tokenize_function, batched=True
#         )
#         
#         return tokenized_train, tokenized_valid, self.tokenizer

In [4]:
data = pd.read_csv(PATHS.train_path)
data['label'] = data['score'].apply(lambda x: x-1)
data["label"] = data["label"].astype('int32') 

Unnamed: 0,essay_id,full_text,score,label,fold
0,000d118,Many people have car where they live. The thin...,3,2,3.0
1,000fe60,I am a scientist at NASA that is discussing th...,3,2,4.0
2,001ab80,People always wish they had the same technolog...,4,3,1.0
3,001bdc0,"We all heard about Venus, the planet without a...",4,3,0.0
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,2,2.0


In [None]:
skf = StratifiedKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
for i, (_, val_index) in enumerate(skf.split(data, data["score"])):
    data.loc[val_index, "fold"] = i
data.head()

In [None]:
tds = Dataset.from_pandas(data)

In [5]:
training_args = TrainingArguments(
    output_dir=f'output_v{CFG.VER}',
    learning_rate=CFG.lr,
    per_device_train_batch_size=CFG.train_batch_size,
    per_device_eval_batch_size=CFG.eval_batch_size,
    num_train_epochs=CFG.train_epochs,
    weight_decay=CFG.weight_decay,
    evaluation_strategy='epoch',
    metric_for_best_model='qwk',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to='none',
    warmup_ratio=CFG.warmup_ratio,
    lr_scheduler_type='linear', # "cosine" or "linear" or "constant"
    optim='adamw_torch',
    logging_first_step=True,
)

In [None]:
# ADD NEW TOKENS for ("\n") new paragraph and (" "*2) double space 
tokenizer = AutoTokenizer.from_pretrained(PATHS.model_path)
tokenizer.add_tokens([AddedToken("\n", normalized=False)])
tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])
tokenize = Tokenize(train, valid, tokenizer)
tokenized_train, tokenized_valid, _ = tokenize()

In [6]:
for fold in range(len(data['fold'].unique())):
    train = data[data['fold'] != fold]
    valid = data[data['fold'] == fold].copy()

    train_dataset = dataset.filter(lambda example: example['fold'] != fold)
    valid_dataset = dataset.filter(lambda example: example['fold'] == fold)

    config = AutoConfig.from_pretrained(PATHS.model_path)
    config.num_labels = CFG.num_labels 

    model = AutoModelForSequenceClassification.from_pretrained(PATHS.model_path, config=config)
    model.resize_token_embeddings(len(tokenizer))

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    compute_metrics = compute_metrics_for_classification
    trainer = Trainer( 
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_valid,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # PLOT CONFUSION MATRIX
    y_true = valid['score'].values
    predictions0 = trainer.predict(tokenized_valid).predictions
    predictions = predictions0.argmax(axis=1) + 1 
    cm = confusion_matrix(y_true, predictions, labels=[x for x in range(1,7)])
    draw_cm = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=[x for x in range(1,7)])
    draw_cm.plot()
    plt.show()

    trainer.save_model(f'deberta-v3-small_AES2_fold_{fold}_v{CFG.VER}')
    tokenizer.save_pretrained(f'deberta-v3-small_AES2_fold_{fold}_v{CFG.VER}')

    COLS = [f'p{x}' for x in range(CFG.num_labels)] 
    valid[COLS] = predictions0 
    valid.to_csv(f'valid_df_fold_{fold}_v{CFG.VER}.csv', index=False)

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [7]:
tokenized_train

Dataset({
    features: ['essay_id', 'full_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 13845
})

In [8]:
tokenized_valid

Dataset({
    features: ['essay_id', 'full_text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3462
})

In [9]:
tokenized_train.features

{'essay_id': Value(dtype='string', id=None),
 'full_text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [10]:
tokenized_valid.features

{'essay_id': Value(dtype='string', id=None),
 'full_text': Value(dtype='string', id=None),
 'label': Value(dtype='int64', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
dfs = []
for k in range(CFG.n_splits):
    dfs.append( pd.read_csv(f'valid_df_fold_{k}_v{CFG.VER}.csv') )
    os.system(f'rm valid_df_fold_{k}_v{CFG.VER}.csv')
dfs = pd.concat(dfs)
dfs.to_csv(f'valid_df_v{CFG.VER}.csv',index=False)
print('Valid OOF shape:', dfs.shape )
display( dfs.head() )

In [None]:
m = cohen_kappa_score(dfs.score.values, dfs.iloc[:,-6:].values.argmax(axis=1)+1, weights='quadratic')
print('Overall QWK CV =',m)

In [None]:
test = pd.read_csv(PATHS.test_path)
print('Test shape:', test.shape )
test.head()

In [None]:
all_pred = []
test['label'] = 0.0

for fold in range(CFG.n_splits):
    tokenizer = AutoTokenizer.from_pretrained(f'deberta-v3-small_AES2_fold_{fold}_v{CFG.VER}')
    tokenize = Tokenize(test, test, tokenizer)
    tokenized_test, _, _ = tokenize()
    
    tokenize = KaggleTokenizer(test, test, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(f'deberta-v3-small_AES2_fold_{fold}_v{CFG.VER}')
    
    # INFER WITH TRAINER
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainer = Trainer( 
        model=model,
        args=training_args,
        train_dataset=tokenized_test,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # SAVE PREDICTIONS
    predictions = trainer.predict(tokenized_test).predictions
    all_pred.append( predictions )