In [1]:
!pip install unidecode
!pip install evaluate

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [2]:
import os
import glob
import sys
import argparse

from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
import evaluate
import torch
from datasets import Dataset, DatasetDict

import numpy as np
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import json
import collections
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import *
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import mean_squared_error




2024-05-20 02:44:43.226195: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-20 02:44:43.226298: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-20 02:44:43.360824: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data Util

In [3]:
augment_set = ['no_art', 'no_conj', 'add_and-0.1', 'swap_word-0.05',
               'no_first_sent', 'no_last_sent', 'no_longest_sent', 'reverse_sent']


MAXLEN = [-1, 70, 88, 22, 23, 24, 20, 67, 97]
MAXWORDLEN = 50

PAD_SENT_TOKEN = ''

score_range = [(-1, -1),
               (2, 12),
               (1, 6),
               (0, 3),
               (0, 3),
               (0, 4),
               (0, 4),
               (0, 30),
               (0, 60)]


def get_threshold(p):
    low, high = score_range[p]
    return 1/((high - low))


def rescale_to_int(raw, p):
    low, high = score_range[p]
    return np.around(raw*(high-low)+low).astype(int)

def normalize_score(Y, p):
    low, high = score_range[p]
    return (Y-low)/(high-low)


def clean_data(df):
    new_df = []
    for essay in df:
        new_df.append(clean_text(essay))
    return new_df


def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove quotation
    text = re.sub(r'\"', '', text)
    # URL replace by https://github.com/feidong1991/aes
    text = re.sub(
        '(http[s]?://)?((www)\.)?([a-zA-Z0-9]+)\.{1}((com)(\.(cn))?|(org))', '<url>', text)
    # Truncate any duplicate non-alphanumeric and add a space after it
    # e.g. sent1.sent2!!!...??? becomes sent1. sent2! . ?
    text = re.sub(r'([^a-zA-Z0-9_@\'\s])\1*', r'\1 ', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)

    return text


def mkpath(path):
    if not os.path.exists(path):
        os.makedirs(path)
    return path + '/'


def shorten_sentence(tokens):
    if len(tokens) <= MAXWORDLEN:
        return [tokens]

    # Step 1: split sentence based on keywords
    # split_keywords = ['because', 'but', 'so', 'then', 'You', 'He', 'She', 'We', 'It', 'They', 'Your', 'His', 'Her']
    split_keywords = ['because', 'but', 'so', 'then']
    k_indexes = [i for i, key in enumerate(tokens) if key in split_keywords]
    processed_tokens = []
    if not k_indexes:
        num = len(tokens) // MAXWORDLEN
        k_indexes = [(i+1)*MAXWORDLEN for i in range(num)]

    if len(tokens[:k_indexes[0]]) > 0:
        processed_tokens.append(tokens[:k_indexes[0]])
    len_k = len(k_indexes)
    for j in range(len_k-1):
        processed_tokens.append(tokens[k_indexes[j]:k_indexes[j+1]])
    processed_tokens.append(tokens[k_indexes[-1]:])

    # Step 2: split sentence to no more than MAXWORDLEN
    # if there are still sentences whose length exceeds MAXWORDLEN
    new_tokens = []
    for token in processed_tokens:
        if len(token) > MAXWORDLEN:
            num = len(token) // MAXWORDLEN
            s_indexes = [(i+1)*MAXWORDLEN for i in range(num)]
            len_s = len(s_indexes)
            if len(token[:s_indexes[0]]) > 0:
                new_tokens.append(token[0:s_indexes[0]])
            for j in range(len_s-1):
                new_tokens.append(token[s_indexes[j]:s_indexes[j+1]])
            new_tokens.append(token[s_indexes[-1]:])
        else:
            new_tokens.append(token)
    # print('before', len(tokens), 'after', [len(x) for x in new_tokens])
    return new_tokens


def get_vocab(prompt, df=None, length=4000, features='essay'):
    vocab_path = mkpath('vocab')
    file_path = os.path.join(vocab_path, '{}.vocab'.format(prompt))
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            vocab = json.load(f)
        assert type(vocab) == dict
        print('load vocab from {}'.format(file_path))
        return vocab

    word_all = []
    for essay in df[features]:
        sents = sentenize(essay)
        for sent in sents:
            words = tokenize(sent)
            word_all.extend(words)
    print('word count:', len(word_all))
    print('unique word count:', len(set(word_all)))

    most_common = collections.Counter(word_all).most_common(length - 3)

    vocab = {'<pad>': 0, '<unk>': 1, '<num>': 2}
    for w, c in most_common:
        vocab[w] = len(vocab)

    # save as JSON
    with open(file_path, 'w') as f:
        json.dump(vocab, f)
    print('save vocab to {}'.format(file_path))

    return vocab


def word2idx(w, vocab):
    if not w in vocab:
        return vocab['<unk>']
    return vocab[w]


def load_data(prompt, suffix=None):
    if suffix:
        data = pd.read_csv(
            'asap/prompt_{}_{}.tsv'.format( prompt, suffix), sep='\t')
    else:
        data = pd.read_csv(
            'asap/prompt_{}.tsv'.format( prompt), sep='\t')
    return data


def prepare_features(model_name, **kwargs):
    if model_name.startswith('elmo'):
        return prepare_elmo_features(**kwargs)
    elif model_name.startswith('glove'):
        return prepare_glove_features(**kwargs)


def gen(model_name, prompt, df, vocab=None, batch_size=1, test=False, shuffle=True, **kwargs):
    data = df.copy()
    while True:
        if shuffle:
            data = data.sample(frac=1).reset_index(drop=True)
        for i in range(0, len(data), batch_size):
            j = min(len(data), i+batch_size)
            if test:
                x = prepare_features(model_name,
                                     df=data[i:j], prompt=prompt, vocab=vocab, x_only=True, **kwargs)
                yield x
            else:
                x, y = prepare_features(model_name,
                                        df=data[i:j], prompt=prompt, vocab=vocab, **kwargs)
                yield x, y


def augment_gen(model_name, prompt, test_df, vocab=None, batch_size=1, augment=None, **kwargs):
    data = test_df.copy()
    rnd = np.random.RandomState(1)
    while True:
        for i in range(0, len(data), batch_size):
            j = min(len(data), i+batch_size)
            x = prepare_features(model_name,
                                 df=data[i:j], prompt=prompt, vocab=vocab, x_only=True, augment=augment, rnd=rnd, **kwargs)
            yield x


def make_augment(sents, augment, rnd=None):
    '''augment essay (list of sentences)'''
    assert augment in augment_set
    t = augment.split('-')
    if len(t) > 1:
        augment, threshold = t[0], float(t[1])
    else:
        threshold = 1.0

    new_sents = []
    if not rnd:
        rnd = np.random.RandomState(1)

    if augment == 'no_art':
        for sent in sents:
            new_sents.append(re.sub(r'\b(a|an|the)\b ', r'', sent))

    elif augment == 'no_conj':
        for sent in sents:
            new_sents.append(re.sub(r'\b(and|or|but)\b ', r'', sent))

    elif augment == 'add_and':
        for sent in sents:
            state = rnd.rand()
            if state < threshold:
                sent = 'and ' + sent
            new_sents.append(sent)

    elif augment == 'swap_word':
        for sent in sents:
            words = sent.split()
            word_idx = np.arange(len(words)-2)
            rnd.shuffle(word_idx)
            for i in word_idx:
                state = rnd.rand()
                if state < threshold:
                    words[i], words[i+1] = words[i+1], words[i]
            new_sents.append(' '.join(words))

    elif augment == 'no_first_sent':
        if len(sents) > 1:
            new_sents.extend(sents[1:])
        else:
            new_sents.extend(['.'])

    elif augment == 'no_last_sent':
        if len(sents) > 1:
            new_sents.extend(sents[:-1])
        else:
            new_sents.extend(['.'])

    elif augment == 'no_longest_sent':
        if len(sents) > 1:
            maxidx = np.argmax([len(sent) for sent in sents])
            new_sents.extend(sents[:maxidx] + sents[maxidx+1:])
        else:
            new_sents.extend(['.'])

    elif augment == 'reverse_sent':
        new_sents.extend(sents[::-1])

    else:
        raise NameError('Unknown augment : ' + str(augment))
    assert type(new_sents) is list
    return new_sents


# Evaluate Util

In [4]:
class EvaluateCallback(Callback):
    def __init__(self, prompt, val_data, model_name, vocab=None, batch_size=5):
        self.prompt = prompt
        self.val_data = val_data
        self.model_name = model_name
        self.vocab = vocab
        self.batch_size = batch_size
        self.steps = np.ceil(len(val_data) / batch_size)
        self.y_true = prepare_features(model_name,
                                       df=val_data, prompt=prompt, y_only=True)

    def on_epoch_end(self, epoch, logs):
        y_pred = self.model.predict_generator(
            gen(self.model_name, self.prompt, self.val_data, self.vocab, self.batch_size, test=True, shuffle=False), steps=self.steps, verbose=1)

        generate_qwk(self.prompt, self.model_name,
                     self.y_true, y_pred, epoch+1, 'val')


def generate_qwk(prompt, model_name, y_true, y_pred, epoch, suffix=''):
    path = mkpath('pred/{}'.format(model_name))

    y_true = rescale_to_int(y_true, prompt)
    y_pred = rescale_to_int(y_pred, prompt)
    qwk = QWK(y_true, y_pred)

    with open(os.path.join(path, 'qwk_{}_{}.csv'.format(prompt, suffix)), 'a+') as f:
        f.write('{}, {}\n'.format(epoch, qwk))


def generate_score(prompt, model_name, epoch, y_true, y_pred, aug_pred, test_df):
    path = mkpath('pred/{}'.format(model_name))

    df = pd.DataFrame()
    df['essay_id'] = test_df['essay_id']
    df['essay_set'] = test_df['essay_set']
    df['domain1_score'] = y_true
    df['test'] = y_pred
    for key in aug_pred:
        df['test_' + key] = aug_pred[key]
    df.to_csv(os.path.join(path, 'score_{}_{}.tsv'.format(prompt, epoch)),
              sep='\t', index=False)
    return df


def generate_robustness(prompt, model_name, epoch, y_true, y_pred, aug_pred):
    path = mkpath('pred/{}'.format(model_name))

    # y_true = rescale_to_int(y_true, prompt)
    y_pred_int = rescale_to_int(y_pred, prompt)
    aug_pred_int = {}
    wr_t, br_t, w_t, b_t = 0, 0, 0, 0
    N = len(y_pred) * len(aug_pred)
    print('N :', N)

    with open(os.path.join(path, 'robustness_{}_{}.csv'.format(prompt, epoch)), 'w+') as f:
        f.write('augment,worse_raw,better_raw,worse_resolved,better_resolved\n')
        for key in aug_pred:
            aug_pred_int[key] = rescale_to_int(aug_pred[key], prompt)

            wr, br, w, b = robustness(
                y_pred, aug_pred[key], y_pred_int, aug_pred_int[key])
            wr_t += wr
            br_t += br
            w_t += w
            b_t += b
            f.write('{},{},{},{},{}\n'.format(key, wr, br, w, b))
        f.write('sum,{},{},{},{}\n'.format(wr_t, br_t, w_t, b_t))
        f.write('avg,{},{},{},{}\n'.format(wr_t/N, br_t/N, w_t/N, b_t/N))


def generate_summary(model_name, epoch):
    prompts = [1, 2, 3, 4, 5, 6, 7, 8]
    # number of essay in test set
    length = [-1, 179, 180, 173, 177, 181, 180, 157, 73]
    path = mkpath('pred/{}'.format(model_name))

    with open(os.path.join(path, 'summary_{}.txt'.format(epoch)), 'w+') as f:
        f.write('{} epoch {}\n\n'.format(model_name, epoch))
        f.write('QWK\n')
        qwk_avg = 0
        for p in prompts:
            qwk_df = pd.read_csv(os.path.join(path, 'qwk_{}_test.csv'.format(
                p)), header=None, names=['epoch', 'qwk'])
            qwk = qwk_df[qwk_df['epoch'] == epoch].values[-1, -1]
            f.write('{}\t{}\n'.format(p, qwk))
            qwk_avg += qwk

        f.write('\nRobustness per prompt\n')
        r_avg = 0
        r_aug_avg = 0
        for p in prompts:
            robustness_df = pd.read_csv(os.path.join(
                path, 'robustness_{}_{}.csv'.format(p, epoch)))
            r = (robustness_df['worse_resolved'] -
                 robustness_df['better_resolved']).values[-1]
            f.write('{}\t{}\n'.format(p, r))
            r_avg += r

            r_aug = (robustness_df['worse_resolved'] -
                     robustness_df['better_resolved']).values[:-2]/length[p]
            r_aug_avg += r_aug

        f.write('\nRobustness per augment\n')
        r_aug_avg /= 8
        for a, r in zip(robustness_df['augment'][:-2], r_aug_avg):
            f.write('{}\t{}\n'.format(a, r))

        f.write('\n')
        f.write('QWK Average:\t{}\n'.format(qwk_avg / 8))
        f.write('Robustness Average:\t{}\n'.format(r_avg / 8))
        f.write('Robustness Average:\t{}\n'.format(r_aug_avg.mean()))
    print('summary generated!')


def generate_summary_best(model_name):
    prompts = [1, 2, 3, 4, 5, 6, 7, 8]
    # number of essay in test set
    length = [-1, 179, 180, 173, 177, 181, 180, 157, 73]
    path = mkpath('pred/{}'.format(model_name))

    best_ep = [-1]*9
    with open(os.path.join(path, 'summary_best.txt'), 'w+') as f:
        f.write('{}\n\n'.format(model_name))
        f.write('QWK\n')
        f.write('epoch\tprompt\tqwk\n')
        qwk_avg = 0
        for p in prompts:
            qwk_df = pd.read_csv(os.path.join(path, 'qwk_{}_val.csv'.format(
                p)), header=None, names=['epoch', 'qwk'])
            max_idx = qwk_df['qwk'].idxmax()
            best_ep[p] = int(qwk_df.iloc[max_idx].values[0])

            qwk_df = pd.read_csv(os.path.join(path, 'qwk_{}_test.csv'.format(
                p)), header=None, names=['epoch', 'qwk'])

            try:
                tmp = qwk_df[qwk_df['epoch'] == best_ep[p]].values
                # in case of multiple runs of same epoch, pick one with the best QWK
                ep, qwk = tmp[tmp.argmax(axis=0)[-1]]
            except:
                raise Exception(
                    'Error: epoch {} of prompt {} not found in test'.format(best_ep[p], p))

            f.write('{}\t{}\t{}\n'.format(best_ep[p], p, qwk))
            qwk_avg += qwk

        f.write('\nRobustness per prompt\n')
        r_avg = 0
        r_aug_avg = 0
        for p in prompts:
            robustness_df = pd.read_csv(os.path.join(
                path, 'robustness_{}_{}.csv'.format(p, best_ep[p])))
            r = (robustness_df['worse_resolved'] -
                 robustness_df['better_resolved']).values[-1]
            f.write('{}\t{}\n'.format(p, r))
            r_avg += r

            r_aug = (robustness_df['worse_resolved'] -
                     robustness_df['better_resolved']).values[:-2]/length[p]
            r_aug_avg += r_aug

        f.write('\nRobustness per augment\n')
        r_aug_avg /= 8
        for a, r in zip(robustness_df['augment'][:-2], r_aug_avg):
            f.write('{}\t{}\n'.format(a, r))

        f.write('\n')
        f.write('QWK Average:\t{}\n'.format(qwk_avg / 8))
        f.write('Robustness Average:\t{}\n'.format(r_avg / 8))
        f.write('Robustness Average:\t{}\n'.format(r_aug_avg.mean()))
    print('summary generated!')


def QWK(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')


def robustness(original, augment, original_int, augment_int, threshold=0.0):
    worse_raw = np.sum(original - augment > threshold)
    better_raw = np.sum(augment - original > threshold)
    worse_resolved = np.sum(original_int > augment_int)
    better_resolved = np.sum(original_int < augment_int)
    return worse_raw, better_raw, worse_resolved, better_resolved


# Create Dataset

In [5]:
import pandas as pd
from unidecode import unidecode
from sklearn.model_selection import KFold, train_test_split


def convert_to_ascii(df):
    new_df = []
    for essay in df:
        new_df.append(unidecode(essay))
    return new_df


def create_dataset(fold=True):
    '''Run this function once to create train,val,test files for K folds'''
    data_all = pd.read_csv('/kaggle/input/asap-aes/training_set_rel3.tsv',
                           sep='\t', encoding='latin1')
    data_all['essay'] = convert_to_ascii(data_all['essay'])
    data_all['essay'] = clean_data(data_all['essay'])

    for p in range(1, 9):
#         path = mkpath('asap/')
        data_prompt = data_all[data_all['essay_set']
                               == p].reset_index(drop=True)
        print(data_prompt.head())
#         data_prompt.to_csv(path + 'prompt_{}_all.tsv'.format(p),
#                          sep='\t', index=False)

        if fold:
            kf = KFold(n_splits=5, shuffle=True, random_state=420)
            n = 1
            for train_index, test_index in kf.split(data_prompt):
                # print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
                val_index = test_index[:len(test_index)//2]
                test_index = test_index[len(test_index)//2:]
                print(len(train_index), len(val_index), len(test_index))

                fold_path = mkpath('asap/fold_{}/'.format(n))
                data_prompt.loc[train_index].to_csv(
                    fold_path + 'prompt_{}_train.tsv'.format(p), sep='\t', index=False)
                data_prompt.loc[val_index].to_csv(
                    fold_path + 'prompt_{}_val.tsv'.format(p), sep='\t', index=False)
                data_prompt.loc[test_index].to_csv(
                    fold_path + 'prompt_{}_test.tsv'.format(p), sep='\t', index=False)
                n += 1
        else:
            train, test = train_test_split(
                data_prompt, test_size=0.2, random_state=420, shuffle=False)
            val = test[:len(test)//2]
            test = test[len(test)//2:]
            path = mkpath('asap/')
            print(len(train), len(val), len(test))
            train.to_csv(path + 'prompt_{}_train.tsv'.format(p),
                         sep='\t', index=False)
            val.to_csv(path + 'prompt_{}_val.tsv'.format(p),
                       sep='\t', index=False)
            test.to_csv(path + 'prompt_{}_test.tsv'.format(p),
                        sep='\t', index=False)


if __name__ == "__main__":
    create_dataset(fold=False)


   essay_id  essay_set                                              essay  \
0         1          1  dear local newspaper, i think effects computer...   
1         2          1  dear @caps1 @caps2, i believe that using compu...   
2         3          1  dear, @caps1 @caps2 @caps3 more and more peopl...   
3         4          1  dear local newspaper, @caps1 i have found that...   
4         5          1  dear @location1, i know having computers has a...   

   rater1_domain1  rater2_domain1  rater3_domain1  domain1_score  \
0               4               4             NaN              8   
1               5               4             NaN              9   
2               4               3             NaN              7   
3               5               5             NaN             10   
4               4               4             NaN              8   

   rater1_domain2  rater2_domain2  domain2_score  ...  rater2_trait3  \
0             NaN             NaN            NaN  ...   

# SET UP

In [6]:
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
BATCH_SIZE = 8
EPOCHS = 5

prompts = [1, 2, 3, 4, 5, 6, 7, 8]


def process(df,p):
    new_df = df[["essay", "domain1_score"]].copy()
    new_df["label"] = new_df["domain1_score"].apply(lambda x: normalize_score(x, p))
    new_df.rename(columns={"essay": "text"}, inplace=True)
    return new_df

def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    logits = logits[:, 0]
    mse = mean_squared_error(labels, logits)
#     qwk = QWK(labels, logits)

    return {"mse": mse}
    
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss
    
# def quadratic_weighted_kappa(y_true, y_pred):
#     """
#     Compute the quadratic weighted kappa (QWK) score between two arrays of ratings.
#     """
#     assert len(y_true) == len(y_pred)
#     min_rating, max_rating = score_range(p)
    
#     y_true = np.array(y_true, dtype=int)
#     y_pred = np.array(y_pred, dtype=int)

#     # Create the confusion matrix
#     O = np.zeros((max_rating - min_rating + 1, max_rating - min_rating + 1))
#     for a, p in zip(y_true, y_pred):
#         O[a - min_rating, p - min_rating] += 1

#     # Create the weight matrix
#     W = np.zeros((max_rating - min_rating + 1, max_rating - min_rating + 1))
#     for i in range(max_rating - min_rating + 1):
#         for j in range(max_rating - min_rating + 1):
#             W[i, j] = ((i - j) ** 2) / ((max_rating - min_rating) ** 2)

#     # Calculate the expected matrix
#     act_hist = np.bincount(y_true - min_rating, minlength=max_rating - min_rating + 1)
#     pred_hist = np.bincount(y_pred - min_rating, minlength=max_rating - min_rating + 1)
#     E = np.outer(act_hist, pred_hist)
#     E = E / E.sum()

#     # Calculate QWK
#     O = O / O.sum()
#     num = np.sum(W * O)
#     den = np.sum(W * E)
#     return 1 - num / den

# def compute_metrics_for_regression(eval_pred):
#     """
#     Compute metrics for regression, including quadratic weighted kappa.
#     eval_pred: A tuple (predictions, true_values)
#     """
#     predictions, true_values = eval_pred
#     predictions = np.rint(predictions).astype(int)  # Round to nearest integer for kappa
#     true_values = np.rint(true_values).astype(int)  # Round to nearest integer for kappa

#     qwk = quadratic_weighted_kappa(true_values, predictions)

#     return {
#         "quadratic_weighted_kappa": qwk
#     }



# BERT

In [12]:
from transformers import BertTokenizer, BertForSequenceClassification

BERT_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
BERT_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

def BERT_preprocess_function(examples):
    return BERT_tokenizer(examples["text"],  truncation=True)

for p in prompts:
    print('PROMPT :', p)

    train_df = load_data(p, 'train')
    val_df = load_data(p, 'val')
    test_df = load_data(p, 'test')

    process_train = process(train_df,p)
    process_val = process(val_df,p)
    process_test = process(test_df,p)

    datasets = DatasetDict({
        "train": Dataset.from_pandas(process_train),
        "val": Dataset.from_pandas(process_val),
        "test": Dataset.from_pandas(process_test)
    })

    tokenized_datasets = datasets.map(BERT_preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=BERT_tokenizer)
    accuracy = evaluate.load("accuracy") 
    
    training_args = TrainingArguments(
        output_dir="BERT_model",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="no",  
        load_best_model_at_end=False,
        
    )
    

    trainer = RegressionTrainer(
        model=BERT_model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
        tokenizer=BERT_tokenizer,
        compute_metrics=compute_metrics_for_regression,
        
    )
    trainer.train()
    trainer.eval_dataset=tokenized_datasets["test"]
#     trainer.save_model('bert_model/prompt_{}'.format(p))
    print(trainer.evaluate())


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PROMPT : 1


Map:   0%|          | 0/1426 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

Map:   0%|          | 0/179 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.026265,0.026265
2,No log,0.007217,0.007217
3,0.019700,0.008992,0.008992
4,0.019700,0.005551,0.005551
5,0.019700,0.006092,0.006092


{'eval_loss': 0.006898785475641489, 'eval_mse': 0.006898785475641489, 'eval_runtime': 3.2367, 'eval_samples_per_second': 55.304, 'eval_steps_per_second': 7.106, 'epoch': 5.0}
PROMPT : 2


Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.009458,0.009458
2,No log,0.00929,0.00929
3,0.011700,0.01039,0.01039
4,0.011700,0.009852,0.009852
5,0.011700,0.010185,0.010185


{'eval_loss': 0.010720447637140751, 'eval_mse': 0.010720447637140751, 'eval_runtime': 3.2919, 'eval_samples_per_second': 54.679, 'eval_steps_per_second': 6.987, 'epoch': 5.0}
PROMPT : 3


Map:   0%|          | 0/1380 [00:00<?, ? examples/s]

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.031595,0.031595
2,No log,0.030301,0.030301
3,0.031900,0.034839,0.034839
4,0.031900,0.035585,0.035585
5,0.031900,0.035739,0.035739


{'eval_loss': 0.04840991273522377, 'eval_mse': 0.04840992018580437, 'eval_runtime': 1.4646, 'eval_samples_per_second': 118.12, 'eval_steps_per_second': 15.021, 'epoch': 5.0}
PROMPT : 4


Map:   0%|          | 0/1416 [00:00<?, ? examples/s]

Map:   0%|          | 0/177 [00:00<?, ? examples/s]

Map:   0%|          | 0/177 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.036131,0.036131
2,No log,0.037637,0.037637
3,0.026400,0.038581,0.038581
4,0.026400,0.034706,0.034706
5,0.026400,0.033891,0.033891


{'eval_loss': 0.02407163381576538, 'eval_mse': 0.02407163940370083, 'eval_runtime': 1.4221, 'eval_samples_per_second': 124.46, 'eval_steps_per_second': 16.173, 'epoch': 5.0}
PROMPT : 5


Map:   0%|          | 0/1444 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.01636,0.01636
2,No log,0.016154,0.016154
3,0.015400,0.015729,0.015729
4,0.015400,0.015591,0.015591
5,0.015400,0.015549,0.015549


{'eval_loss': 0.014837196096777916, 'eval_mse': 0.01483719889074564, 'eval_runtime': 1.6797, 'eval_samples_per_second': 107.756, 'eval_steps_per_second': 13.693, 'epoch': 5.0}
PROMPT : 6


Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.01715,0.01715
2,No log,0.016126,0.016126
3,0.016300,0.014219,0.014219
4,0.016300,0.013682,0.013682
5,0.016300,0.014118,0.014118


{'eval_loss': 0.01959497667849064, 'eval_mse': 0.019594978541135788, 'eval_runtime': 1.9762, 'eval_samples_per_second': 91.086, 'eval_steps_per_second': 11.639, 'epoch': 5.0}
PROMPT : 7


Map:   0%|          | 0/1255 [00:00<?, ? examples/s]

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.006402,0.006402
2,No log,0.005933,0.005933
3,No log,0.006296,0.006296
4,0.006600,0.006569,0.006569
5,0.006600,0.006438,0.006438


{'eval_loss': 0.007680067792534828, 'eval_mse': 0.007680065929889679, 'eval_runtime': 2.3428, 'eval_samples_per_second': 67.015, 'eval_steps_per_second': 8.537, 'epoch': 5.0}
PROMPT : 8


Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Map:   0%|          | 0/73 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Mse
1,No log,0.003802,0.003802
2,No log,0.004007,0.004007
3,No log,0.003758,0.003758
4,No log,0.00411,0.00411
5,No log,0.003799,0.003799


{'eval_loss': 0.003353484906256199, 'eval_mse': 0.0033534851390868425, 'eval_runtime': 1.3418, 'eval_samples_per_second': 54.406, 'eval_steps_per_second': 7.453, 'epoch': 5.0}


# T5

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

T5_tokenizer = AutoTokenizer.from_pretrained('t5-small')
T5_model = T5ForConditionalGeneration.from_pretrained('t5-small', num_labels=1)

def T5_preprocess_function(examples):
    return T5_tokenizer(examples["text"],  truncation=True)


for p in prompts:
    print('PROMPT :', p)

    train_df = load_data(p, 'train')
    val_df = load_data(p, 'val')
    test_df = load_data(p, 'test')
    
    process_train = process(train_df,p)
    process_val = process(val_df,p)
    process_test = process(test_df,p)
    
    #print(process_train.label.unique())

    datasets = DatasetDict({
        "train": Dataset.from_pandas(process_train),
        "val": Dataset.from_pandas(process_val),
        "test": Dataset.from_pandas(process_test)
    })

    tokenized_datasets = datasets.map(T5_preprocess_function, batched=True)
    #print(tokenized_datasets["train"][0])
    data_collator = DataCollatorWithPadding(tokenizer=T5_tokenizer)
    accuracy = evaluate.load("accuracy") #name??
    
    training_args = TrainingArguments(
        output_dir="T5_model",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="no",  
        load_best_model_at_end=False,
    )

    trainer = RegressionTrainer(
        model=T5_model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
        tokenizer=T5_tokenizer,
        compute_metrics=compute_metrics_for_regression,
    )

    trainer.train()
    trainer.eval_dataset=tokenized_datasets["test"]
    print(trainer.evaluate())



# XLM

In [None]:
pip install sacremoses

In [None]:


XLM_tokenizer = AutoTokenizer.from_pretrained('xlm-mlm-en-2048')
XLM_model = AutoModelForSequenceClassification.from_pretrained('xlm-mlm-en-2048', num_labels=1)

def XLM_preprocess_function(examples):
    return XLM_tokenizer(examples["text"],  truncation=True)

for p in prompts:
    print('PROMPT :', p)

    train_df = load_data(p, 'train')
    val_df = load_data(p, 'val')
    test_df = load_data(p, 'test')

    process_train = process(train_df,p)
    process_val = process(val_df,p)
    process_test = process(test_df,p)

    #print(process_train.label.unique())

    datasets = DatasetDict({
        "train": Dataset.from_pandas(process_train),
        "val": Dataset.from_pandas(process_val),
        "test": Dataset.from_pandas(process_test)
    })

    tokenized_datasets = datasets.map(XLM_preprocess_function, batched=True)
    #print(tokenized_datasets["train"][0])
    data_collator = DataCollatorWithPadding(tokenizer=XLM_tokenizer)
    accuracy = evaluate.load("accuracy") #name??
    
    training_args = TrainingArguments(
        output_dir="XLM_model",
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="no",  
        load_best_model_at_end=False,
    )

    trainer = RegressionTrainer(
        model=XLM_model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["val"],
        tokenizer=XLM_tokenizer,
        compute_metrics=compute_metrics_for_regression,
    )

    trainer.train()
    trainer.eval_dataset=tokenized_datasets["test"]
    print(trainer.evaluate())

