# CommonLit - Evaluate Student Summaries
# Introduction
Create a quality assessment model for summaries written by students from grade 3 to grade 12. The quality will be evaluated based on the following two criteria:
  - content: How well the summary captures the main ideas and details of the source text
  - wording: The clarity, precision, and fluency of the language used in the summary

#  Data Loading
Loading the data and displaying basic information

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
prompts_train_path = "/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv"
propmts_test_path = "/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv"

summaries_train_path = "/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv"
summaries_test_path = "/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"

# prompts_train_path = "prompts_train.csv"
# propmts_test_path = "prompts_test.csv"

# summaries_train_path = "summaries_train.csv"
# summaries_test_path = "summaries_test.csv"

In [3]:
# load files
df_prompts_train = pd.read_csv(prompts_train_path)
df_propmts_test = pd.read_csv(propmts_test_path)

df_summaries_train = pd.read_csv(summaries_train_path)
df_summaries_test = pd.read_csv(summaries_test_path)

# merge files
df_train = df_summaries_train.merge(df_prompts_train, on="prompt_id")
df_test = df_summaries_test.merge(df_propmts_test, on="prompt_id")

In [4]:
df_train.head(3)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


# Exploratory Data Analysis (EDA)
Checking the distribution, outliers, etc.

In [5]:
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GroupKFold

import re
import string
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import seaborn as sns
from transformers import Trainer
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import shutil

from datasets import Dataset,load_dataset, load_from_disk

from datasets import load_metric, disable_progress_bar

from sklearn.metrics import mean_squared_error

from tqdm import tqdm, tqdm_notebook
# pyspellchecker
# https://pyspellchecker.readthedocs.io/en/latest/quickstart.html
import os

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [6]:
!pip install /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [7]:
tqdm.pandas()

In [8]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)

In [9]:
nltk.data.path.append('/kaggle/input/nltk-dataset/stopwords')
nltk.download('stopwords')
difficult_words = set(stopwords.words('english'))

# SpellChecker
spell = SpellChecker()



[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [10]:
def extract_features(text):
    
    # 
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    
    # 
    num_difficult_words = sum(1 for word in words if word.lower() not in difficult_words)
    
    # 
    lexical_diversity = len(set(words)) / len(words)
    
    # 
    freq_dist = FreqDist(words)
    
    # 
    num_top_words = len([word for word, freq in freq_dist.items() if freq >= len(words) * 0.10])
    
    # 
    num_interrogative = text.count('?')
    num_exclamatory = text.count('!')
    
    # 
    misspelled_words = spell.unknown(words)
    
    # 
    return {
        'num_words': len(words),
        'avg_sentence_length': len(words) / len(sentences),
        'num_difficult_words': num_difficult_words,
        'lexical_diversity': lexical_diversity,
        'num_top_words': num_top_words,
        'num_interrogative': num_interrogative,
        'num_exclamatory': num_exclamatory,
        'num_misspelled_words': len(misspelled_words)
    }

In [11]:
#
df_train_features = df_train['text'].apply(extract_features).apply(pd.Series)
df_train = pd.concat([df_train, df_train_features], axis=1)

df_test_features = df_test['text'].apply(extract_features).apply(pd.Series)
df_test = pd.concat([df_test, df_test_features], axis=1)

In [12]:
misspell_mapping = {
    'studentdesigned': 'student designed',
    'teacherdesigned': 'teacher designed',
    'genericname': 'generic name',
    'winnertakeall': 'winner take all',
    'studentname': 'student name',
    'driveless': 'driverless',
    'teachername': 'teacher name',
    'propername': 'proper name',
    'bestlaid': 'best laid',
    'genericschool': 'generic school',
    'schoolname': 'school name',
    'winnertakesall': 'winner take all',
    'elctoral': 'electoral',
    'eletoral': 'electoral',
    'genericcity': 'generic city',
    'elctors': 'electoral',
    'venuse': 'venue',
    'blimplike': 'blimp like',
    'selfdriving': 'self driving',
    'electorals': 'electoral',
    'nearrecord': 'near record',
    'egyptianstyle': 'egyptian style',
    'oddnumbered': 'odd numbered',
    'carintensive': 'car intensive',
    'elecoral': 'electoral',
    'oction': 'auction',
    'electroal': 'electoral',
    'evennumbered': 'even numbered',
    'mesalandforms': 'mesa landforms',
    'electoralvote': 'electoral vote',
    'relativename': 'relative name',
    '22euro': 'twenty two euro',
    'ellectoral': 'electoral',
    'thirtyplus': 'thirty plus',
    'collegewon': 'college won',
    'hisher': 'higher',
    'teacherbased': 'teacher based',
    'computeranimated': 'computer animated',
    'canadidate': 'candidate',
    'studentbased': 'student based',
    'gorethanks': 'gore thanks',
    'clouddraped': 'cloud draped',
    'edgarsnyder': 'edgar snyder',
    'emotionrecognition': 'emotion recognition',
    'landfrom': 'land form',
    'fivedays': 'five days',
    'electoal': 'electoral',
    'lanform': 'land form',
    'electral': 'electoral',
    'presidentbut': 'president but',
    'teacherassigned': 'teacher assigned',
    'beacuas': 'because',
    'positionestimating': 'position estimating',
    'selfeducation': 'self education',
    'diverless': 'driverless',
    'computerdriven': 'computer driven',
    'outofcontrol': 'out of control',
    'faultthe': 'fault the',
    'unfairoutdated': 'unfair outdated',
    'aviods': 'avoid',
    'momdad': 'mom dad',
    'statesbig': 'states big',
    'presidentswing': 'president swing',
    'inconclusion': 'in conclusion',
    'handsonlearning': 'hands on learning',
    'electroral': 'electoral',
    'carowner': 'car owner',
    'elecotral': 'electoral',
    'studentassigned': 'student assigned',
    'collegefive': 'college five',
    'presidant': 'president',
    'unfairoutdatedand': 'unfair outdated and',
    'nixonjimmy': 'nixon jimmy',
    'canadates': 'candidate',
    'tabletennis': 'table tennis',
    'himher': 'him her',
    'studentsummerpacketdesigners': 'student summer packet designers',
    'studentdesign': 'student designed',
    'limting': 'limiting',
    'electrol': 'electoral',
    'campaignto': 'campaign to',
    'presendent': 'president',
    'thezebra': 'the zebra',
    'landformation': 'land formation',
    'eyetoeye': 'eye to eye',
    'selfreliance': 'self reliance',
    'studentdriven': 'student driven',
    'winnertake': 'winner take',
    'alliens': 'aliens',
    '2000but': '2000 but',
    'electionto': 'election to',
    'candidatesas': 'candidates as',
    'electers': 'electoral',
    'winnertakes': 'winner takes',
    'isfeet': 'is feet',
    'incar': 'incur',
    'wellconstructed': 'well constructed',
    'craftsmenwomen': 'crafts men women',
    'freelunch': 'free lunch',
    'twothousandrevolutions': 'two thousand revolutions',
    'ushistoryorg': 'us history org',
    'pharohs': 'pharaohs',
    'whitehot': 'white hot',
    'vizers': 'visors',
    'mrjones': 'mr jones',
    'aminute': 'a minute',
    'spoiledmeat': 'spoiled meat',
    'farmersgave': 'farmers gave',
    'spolied': 'spoiled',
    'tradgey': 'tragedy',
    'pyrimid': 'pyramid',
    'pyrimad': 'pyramid',
    'egyptiansfrom': 'egyptians from',
    'harvestthats': 'harvest that',
    'expierment': 'experiment',
    'jestthat': 'jest that',
    'twothousandrevolutionsaminute': 'two thousand revolutions a minute',
    'expirament': 'experiment',
    'nonspoiled': 'non spoiled',
    'egyptains': 'egyptians',
    'tragedys': 'tragedy',
    'pyrmaid': 'pyramid',
    'expirment': 'experiment',
    'whiteit': 'grade there',
    'gradethere': 'tragedy',
    'goverement': 'government',
    'godsthe': 'gods the',
    'paraoh': 'pharaoh',
    'classesupper': 'classes upper',
    'pharoes': 'pharaohs',
    'noblespriests': 'noble priests',
    'farmersslaves': 'farmers slaves',
    'harvestâ€”thatâ€™s': 'harvest that',
    'tradedy': 'tragedy',
    'paraohs': 'pharaohs',
    'paragrapgh': 'paragraph',
    'expieriment': 'experiment',
    'tragdey': 'tragedy',
    'pyramaid': 'pyramid',
    'pyrmid': 'pyramid',
    'prists': 'priests',
    'pharoas': 'pharaohs',
    'priets': 'priests',
    'pharoph': 'pharaohs',
    'pharaoah': 'pharaohs',
    'pharahos': 'pharaohs',
    'pharaohthe': 'pharaohs'
}

In [13]:
def decontraction(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"he's", "he is", phrase)
    phrase = re.sub(r"there's", "there is", phrase)
    phrase = re.sub(r"We're", "We are", phrase)
    phrase = re.sub(r"That's", "That is", phrase)
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"they're", "they are", phrase)
    phrase = re.sub(r"Can't", "Cannot", phrase)
    phrase = re.sub(r"wasn't", "was not", phrase)
    phrase = re.sub(r"don\x89Ûªt", "do not", phrase)
    phrase = re.sub(r"donãât", "do not", phrase)
    phrase = re.sub(r"aren't", "are not", phrase)
    phrase = re.sub(r"isn't", "is not", phrase)
    phrase = re.sub(r"What's", "What is", phrase)
    phrase = re.sub(r"haven't", "have not", phrase)
    phrase = re.sub(r"hasn't", "has not", phrase)
    phrase = re.sub(r"There's", "There is", phrase)
    phrase = re.sub(r"He's", "He is", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"You're", "You are", phrase)
    phrase = re.sub(r"I'M", "I am", phrase)
    phrase = re.sub(r"shouldn't", "should not", phrase)
    phrase = re.sub(r"wouldn't", "would not", phrase)
    phrase = re.sub(r"i'm", "I am", phrase)
    phrase = re.sub(r"I\x89Ûªm", "I am", phrase)
    phrase = re.sub(r"I'm", "I am", phrase)
    phrase = re.sub(r"Isn't", "is not", phrase)
    phrase = re.sub(r"Here's", "Here is", phrase)
    phrase = re.sub(r"you've", "you have", phrase)
    phrase = re.sub(r"you\x89Ûªve", "you have", phrase)
    phrase = re.sub(r"we're", "we are", phrase)
    phrase = re.sub(r"what's", "what is", phrase)
    phrase = re.sub(r"couldn't", "could not", phrase)
    phrase = re.sub(r"we've", "we have", phrase)
    phrase = re.sub(r"it\x89Ûªs", "it is", phrase)
    phrase = re.sub(r"doesn\x89Ûªt", "does not", phrase)
    phrase = re.sub(r"It\x89Ûªs", "It is", phrase)
    phrase = re.sub(r"Here\x89Ûªs", "Here is", phrase)
    phrase = re.sub(r"who's", "who is", phrase)
    phrase = re.sub(r"I\x89Ûªve", "I have", phrase)
    phrase = re.sub(r"y'all", "you all", phrase)
    phrase = re.sub(r"can\x89Ûªt", "cannot", phrase)
    phrase = re.sub(r"would've", "would have", phrase)
    phrase = re.sub(r"it'll", "it will", phrase)
    phrase = re.sub(r"we'll", "we will", phrase)
    phrase = re.sub(r"wouldn\x89Ûªt", "would not", phrase)
    phrase = re.sub(r"We've", "We have", phrase)
    phrase = re.sub(r"he'll", "he will", phrase)
    phrase = re.sub(r"Y'all", "You all", phrase)
    phrase = re.sub(r"Weren't", "Were not", phrase)
    phrase = re.sub(r"Didn't", "Did not", phrase)
    phrase = re.sub(r"they'll", "they will", phrase)
    phrase = re.sub(r"they'd", "they would", phrase)
    phrase = re.sub(r"DON'T", "DO NOT", phrase)
    phrase = re.sub(r"That\x89Ûªs", "That is", phrase)
    phrase = re.sub(r"they've", "they have", phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"should've", "should have", phrase)
    phrase = re.sub(r"You\x89Ûªre", "You are", phrase)
    phrase = re.sub(r"where's", "where is", phrase)
    phrase = re.sub(r"Don\x89Ûªt", "Do not", phrase)
    phrase = re.sub(r"we'd", "we would", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"weren't", "were not", phrase)
    phrase = re.sub(r"They're", "They are", phrase)
    phrase = re.sub(r"Can\x89Ûªt", "Cannot", phrase)
    phrase = re.sub(r"you\x89Ûªll", "you will", phrase)
    phrase = re.sub(r"I\x89Ûªd", "I would", phrase)
    phrase = re.sub(r"let's", "let us", phrase)
    phrase = re.sub(r"it's", "it is", phrase)
    phrase = re.sub(r"can't", "cannot", phrase)
    phrase = re.sub(r"don't", "do not", phrase)
    phrase = re.sub(r"you're", "you are", phrase)
    phrase = re.sub(r"i've", "I have", phrase)
    phrase = re.sub(r"that's", "that is", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"doesn't", "does not",phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"didn't", "did not", phrase)
    phrase = re.sub(r"ain't", "am not", phrase)
    phrase = re.sub(r"you'll", "you will", phrase)
    phrase = re.sub(r"I've", "I have", phrase)
    phrase = re.sub(r"Don't", "do not", phrase)
    phrase = re.sub(r"I'll", "I will", phrase)
    phrase = re.sub(r"I'd", "I would", phrase)
    phrase = re.sub(r"Let's", "Let us", phrase)
    phrase = re.sub(r"you'd", "You would", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"Ain't", "am not", phrase)
    phrase = re.sub(r"Haven't", "Have not", phrase)
    phrase = re.sub(r"Could've", "Could have", phrase)
    phrase = re.sub(r"youve", "you have", phrase)  
    phrase = re.sub(r"donå«t", "do not", phrase)
    return phrase

In [14]:
def clean_text(text):
    text = decontraction(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    return text

In [15]:
def remove_punctuations(text):
    for punctuation in list(string.punctuation):
        text = text.replace(punctuation, '')
    return text


In [16]:
def clean_number(text):
    text = re.sub(r'(\d+)([a-zA-Z])', '\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', '\g<1>\g<2> ', text)
    text = re.sub(r'(\d+),(\d+)', '\g<1>\g<2>', text)
    return text

In [17]:
def clean_misspell(text):
    for bad_word in misspell_mapping:
        if bad_word in text:
            text = text.replace(bad_word, misspell_mapping[bad_word])
    return text

In [18]:
def pos_count(sent):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    jj_count = 0   #Adjective
    uh_count = 0   #Interjection
    cd_count = 0   #Numerics
    
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)

    for token in sent:
        if token[1] in ['NN','NNP','NNS']:
            nn_count += 1

        if token[1] in ['PRP','PRP$']:
            pr_count += 1

        if token[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']:
            vb_count += 1

        if token[1] in ['JJ','JJR','JJS']:
            jj_count += 1

        if token[1] in ['UH']:
            uh_count += 1

        if token[1] in ['CD']:
            cd_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, jj_count, uh_count, cd_count])

In [19]:
# from autocorrect import Speller
# from spellchecker import SpellChecker
# speller = Speller(lang='en')
# spellchecker = SpellChecker()
# stop_words = set(stopwords.words('english'))

In [20]:
# def add_spelling_dictionary(tokens):
#     spellchecker.word_frequency.load_words(tokens)
#     speller.nlp_data.update({token:1000 for token in tokens})

In [21]:
# def spelling(text):
#     wordlist = text.split()
#     amount_miss = len(list(spellchecker.unknown(wordlist)))
#     return amount_miss

In [22]:
def word_overlap_count(row):
    def check_is_stop_word(word):
        return word in difficult_words
    
    prompt_words = row['prompt_tokens']
    summary_words = row['summary_tokens']
    
    if difficult_words:
        prompt_words = list(filter(check_is_stop_word, prompt_words))
        summary_words = list(filter(check_is_stop_word, summary_words))
    
    return len(set(prompt_words).intersection(set(summary_words)))

def ngrams(token, n):
    ngrams = zip(*[token[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


def ngram_co_occurrence(row, n):
    original_tokens = row['prompt_tokens']
    summary_tokens = row['summary_tokens']

    original_ngrams = set(ngrams(original_tokens, n))
    summary_ngrams = set(ngrams(summary_tokens, n))

    common_ngrams = original_ngrams.intersection(summary_ngrams)
    return len(common_ngrams)


def quotes_count(row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

In [23]:
def text_preprocess(data):
    data["prompt_length"] = data["prompt_text"].progress_apply(lambda x: len(word_tokenize(x)))
    data["prompt_tokens"] = data["prompt_text"].progress_apply(lambda x: word_tokenize(x))
    data["summary_length"] = data["text"].progress_apply(lambda x: len(word_tokenize(x)))
    data["summary_tokens"] = data["text"].progress_apply(lambda x: word_tokenize(x))
#     data["prompt_tokens"].progress_apply(lambda x: add_spelling_dictionary(x))
    #summaries["fixed_summary_text"] = summaries["text"].progress_apply(lambda x: speller(x))
#     data["splling_err_num"] = data["text"].progress_apply(spelling)
    
    df = data
    df['length_ratio'] = df['summary_length'] / df['prompt_length']
    df['word_overlap_count'] = df.progress_apply(word_overlap_count, axis=1)
    df['bigram_overlap_count'] = df.progress_apply(ngram_co_occurrence, args=(2,), axis=1)
    df['bigram_overlap_ratio'] = df['bigram_overlap_count'] / (df['summary_length'] - 1)
    df['trigram_overlap_count'] = df.progress_apply(ngram_co_occurrence, args=(3,), axis=1)
    df['trigram_overlap_ratio'] = df['trigram_overlap_count'] / (df['summary_length'] - 2)
    df['quotes_count'] = df.progress_apply(quotes_count, axis=1)
    return df.drop(columns=["summary_tokens", "prompt_tokens"])

In [24]:
df_train = text_preprocess(df_train)

100%|██████████| 7165/7165 [01:00<00:00, 118.32it/s]
100%|██████████| 7165/7165 [01:00<00:00, 118.18it/s]
100%|██████████| 7165/7165 [00:06<00:00, 1057.22it/s]
100%|██████████| 7165/7165 [00:07<00:00, 979.96it/s] 
100%|██████████| 7165/7165 [00:01<00:00, 6598.61it/s]
100%|██████████| 7165/7165 [00:01<00:00, 3637.42it/s]
100%|██████████| 7165/7165 [00:02<00:00, 3185.74it/s]
100%|██████████| 7165/7165 [00:00<00:00, 45720.31it/s]


In [25]:
df_train["num_words"] = df_train["text"].progress_apply(lambda x: len(str(x).split()))
df_train["num_unique_words"] = df_train["text"].progress_apply(lambda x: len(set(str(x).split())))
df_train["num_chars"] = df_train["text"].progress_apply(lambda x: len(str(x)))
df_train["num_stopwords"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
df_train["num_punctuations"] =df_train['text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
df_train["num_words_upper"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
df_train["num_words_title"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df_train["mean_word_len"] = df_train["text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_train["num_paragraphs"] = df_train["text"].progress_apply(lambda x: len(x.split('\n')))
df_train["num_sentences"] = df_train["text"].progress_apply(lambda x: len(str(x).split('.')))
df_train["polarity"] = df_train['text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
df_train["subjectivity"] = df_train['text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
df_train[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = df_train['text'].progress_apply(pos_count)

# df_train["prompt_num_words"] = df_train["prompt_text"].progress_apply(lambda x: len(str(x).split()))
# df_train["prompt_num_unique_words"] = df_train["prompt_text"].progress_apply(lambda x: len(set(str(x).split())))
# df_train["prompt_num_chars"] = df_train["prompt_text"].progress_apply(lambda x: len(str(x)))
# df_train["prompt_num_stopwords"] = df_train["prompt_text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
# df_train["prompt_num_punctuations"] =df_train['prompt_text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
# df_train["prompt_num_words_upper"] = df_train["prompt_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
# df_train["prompt_num_words_title"] = df_train["prompt_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
# df_train["prompt_mean_word_len"] = df_train["prompt_text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# df_train["prompt_num_paragraphs"] = df_train["prompt_text"].progress_apply(lambda x: len(x.split('\n')))
# df_train["prompt_num_sentences"] = df_train["prompt_text"].progress_apply(lambda x: len(str(x).split('.')))
# df_train["prompt_polarity"] = df_train['prompt_text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
# df_train["prompt_subjectivity"] = df_train['prompt_text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
# df_train[['prompt_nn_count','prompt_pr_count','prompt_vb_count','prompt_jj_count','prompt_uh_count','prompt_cd_count']] = df_train['prompt_text'].progress_apply(pos_count)
df_train.head()

100%|██████████| 7165/7165 [00:00<00:00, 141859.99it/s]
100%|██████████| 7165/7165 [00:00<00:00, 74423.82it/s]
100%|██████████| 7165/7165 [00:00<00:00, 416050.89it/s]
100%|██████████| 7165/7165 [01:06<00:00, 107.18it/s]
100%|██████████| 7165/7165 [00:05<00:00, 1420.60it/s]
100%|██████████| 7165/7165 [00:00<00:00, 85014.95it/s]
100%|██████████| 7165/7165 [00:00<00:00, 80229.67it/s]
100%|██████████| 7165/7165 [00:00<00:00, 27377.39it/s]
100%|██████████| 7165/7165 [00:00<00:00, 362533.18it/s]
100%|██████████| 7165/7165 [00:00<00:00, 310845.05it/s]
100%|██████████| 7165/7165 [00:05<00:00, 1406.09it/s]
100%|██████████| 7165/7165 [00:04<00:00, 1447.86it/s]
100%|██████████| 7165/7165 [00:37<00:00, 189.62it/s]


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,num_words,avg_sentence_length,...,num_paragraphs,num_sentences,polarity,subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,61,16.0,...,1,4,0.170455,0.334848,14,3,17,6,0,1
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,203,16.571429,...,1,14,0.048203,0.355229,59,11,37,7,0,6
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,60,22.333333,...,1,6,0.075,0.31875,16,4,12,3,0,0
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,76,28.666667,...,1,4,-0.666667,0.666667,17,4,15,6,0,0
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,27,14.5,...,1,3,0.088939,0.325909,5,2,4,4,0,0


In [26]:
df_test = text_preprocess(df_test)

100%|██████████| 4/4 [00:00<00:00, 2781.83it/s]
100%|██████████| 4/4 [00:00<00:00, 3117.86it/s]
100%|██████████| 4/4 [00:00<00:00, 1422.40it/s]
100%|██████████| 4/4 [00:00<00:00, 1583.05it/s]
100%|██████████| 4/4 [00:00<00:00, 1205.26it/s]
100%|██████████| 4/4 [00:00<00:00, 1335.34it/s]
100%|██████████| 4/4 [00:00<00:00, 1336.30it/s]
100%|██████████| 4/4 [00:00<00:00, 1334.70it/s]


In [27]:
df_test["num_words"] = df_test["text"].progress_apply(lambda x: len(str(x).split()))
df_test["num_unique_words"] = df_test["text"].progress_apply(lambda x: len(set(str(x).split())))
df_test["num_chars"] = df_test["text"].progress_apply(lambda x: len(str(x)))
df_test["num_stopwords"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
df_test["num_punctuations"] =df_test['text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
df_test["num_words_upper"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
df_test["num_words_title"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df_test["mean_word_len"] = df_test["text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_test["num_paragraphs"] = df_test["text"].progress_apply(lambda x: len(x.split('\n')))
df_test["num_sentences"] = df_test["text"].progress_apply(lambda x: len(str(x).split('.')))
df_test["polarity"] = df_test['text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
df_test["subjectivity"] = df_test['text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
df_test[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = df_test['text'].progress_apply(pos_count)

# df_test["prompt_num_words"] = df_test["prompt_text"].progress_apply(lambda x: len(str(x).split()))
# df_test["prompt_num_unique_words"] = df_test["prompt_text"].progress_apply(lambda x: len(set(str(x).split())))
# df_test["prompt_num_chars"] = df_test["prompt_text"].progress_apply(lambda x: len(str(x)))
# df_test["prompt_num_stopwords"] = df_test["prompt_text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
# df_test["prompt_num_punctuations"] =df_test['prompt_text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
# df_test["prompt_num_words_upper"] = df_test["prompt_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
# df_test["prompt_num_words_title"] = df_test["prompt_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
# df_test["prompt_mean_word_len"] = df_test["prompt_text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# df_test["prompt_num_paragraphs"] = df_test["prompt_text"].progress_apply(lambda x: len(x.split('\n')))
# df_test["prompt_num_sentences"] = df_test["prompt_text"].progress_apply(lambda x: len(str(x).split('.')))
# df_test["prompt_polarity"] = df_test['prompt_text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
# df_test["prompt_subjectivity"] = df_test['prompt_text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
# df_test[['prompt_nn_count','prompt_pr_count','prompt_vb_count','prompt_jj_count','prompt_uh_count','prompt_cd_count']] = df_test['prompt_text'].progress_apply(pos_count)

df_test.head()

100%|██████████| 4/4 [00:00<00:00, 5420.75it/s]
100%|██████████| 4/4 [00:00<00:00, 7016.82it/s]
100%|██████████| 4/4 [00:00<00:00, 7310.33it/s]
100%|██████████| 4/4 [00:00<00:00, 1276.51it/s]
100%|██████████| 4/4 [00:00<00:00, 2493.64it/s]
100%|██████████| 4/4 [00:00<00:00, 7543.71it/s]
100%|██████████| 4/4 [00:00<00:00, 10125.05it/s]
100%|██████████| 4/4 [00:00<00:00, 7019.76it/s]
100%|██████████| 4/4 [00:00<00:00, 9992.39it/s]
100%|██████████| 4/4 [00:00<00:00, 10578.32it/s]
100%|██████████| 4/4 [00:00<00:00, 2338.29it/s]
100%|██████████| 4/4 [00:00<00:00, 2699.47it/s]
100%|██████████| 4/4 [00:00<00:00, 945.51it/s]


Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text,num_words,avg_sentence_length,num_difficult_words,lexical_diversity,...,num_paragraphs,num_sentences,polarity,subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...,3,3.0,3.0,1.0,...,1,1,0.0,0.0,2,0,0,0,0,1
1,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...,3,3.0,3.0,1.0,...,1,1,0.0,0.0,2,0,0,0,0,1
2,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...,3,3.0,3.0,1.0,...,1,1,0.0,0.0,2,0,0,0,0,1
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...,3,3.0,3.0,1.0,...,1,1,0.0,0.0,2,0,0,0,0,1


In [28]:
df_train['text_clean'] = df_train['text'].progress_apply(clean_text)
df_train['text_clean'] = df_train['text_clean'].progress_apply(remove_punctuations)
df_train['text_clean'] = df_train['text_clean'].progress_apply(clean_number)
df_train['text_clean'] = df_train['text_clean'].progress_apply(clean_misspell)

100%|██████████| 7165/7165 [00:01<00:00, 5263.71it/s]
100%|██████████| 7165/7165 [00:00<00:00, 63163.51it/s]
100%|██████████| 7165/7165 [00:00<00:00, 13299.39it/s]
100%|██████████| 7165/7165 [00:00<00:00, 17523.89it/s]


In [29]:
df_test['text_clean'] = df_test['text'].progress_apply(clean_text)
df_test['text_clean'] = df_test['text_clean'].progress_apply(remove_punctuations)
df_test['text_clean'] = df_test['text_clean'].progress_apply(clean_number)
df_test['text_clean'] = df_test['text_clean'].progress_apply(clean_misspell)

100%|██████████| 4/4 [00:00<00:00, 3826.92it/s]
100%|██████████| 4/4 [00:00<00:00, 6347.79it/s]
100%|██████████| 4/4 [00:00<00:00, 5938.84it/s]
100%|██████████| 4/4 [00:00<00:00, 5801.25it/s]


# Dataset Creation
Splitting the data into training and testing sets  
Checking the shape of the data

In [30]:
from transformers import AutoTokenizer

model_name = "/kaggle/input/deberta-v3-base/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
# from torch.utils.data import Dataset

class CommonLitDataset(Dataset):
    def __init__(self, df, has_labels=True):
        self.df = df
        self.prompt_titles = df["text"].values.tolist()
#         self.prompt_titles = df["text_clean"].values.tolist()
        self.texts = (df["prompt_title"] + tokenizer.sep_token + df["prompt_text"]).values.tolist()
        self.encoded_examples = tokenizer(
            text=self.prompt_titles,
            text_pair=self.texts,
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors="pt"
        )
        
    
        self.has_labels = has_labels
        if self.has_labels:
            self.labels_list = df[["content", "wording"]].values.tolist()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = {
            "input_ids": self.encoded_examples["input_ids"][idx],
            "attention_mask": self.encoded_examples["attention_mask"][idx],
            "token_type_ids": self.encoded_examples["token_type_ids"][idx]
        }
        
        if self.has_labels:
            item["labels"] = torch.tensor(self.labels_list[idx])
        
        return item

# Model Training
Training the model and monitoring the progress


In [32]:
class CFG:
    model_name="deberta-v3-base"
    learning_rate=1.5e-5
    weight_decay=0.02 # Regularization 防止过拟合
    hidden_dropout_prob=0.007 # Dropout setting 随机失活的概率
    attention_probs_dropout_prob=0.007
    num_train_epochs= 5
    n_splits=4
    batch_size=12
    random_seed=42
    save_steps=100 
    max_length=512 # Max length of imput 限制输入序列的长度

In [33]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    
    return (content_score + wording_score)/2

In [34]:
class ContentScoreRegressor:
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "text"]#"fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/deberta-v3-base/{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"/kaggle/input/deberta-v3-base/{model_name}")
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """fine-tuning"""
        
        sep = self.tokenizer.sep_token
        train_df[self.input_col] = (
                    train_df["prompt_title"] + sep 
                    + train_df["prompt_question"] + sep 
                    + train_df["text"]
                  )

        valid_df[self.input_col] = (
                    valid_df["prompt_title"] + sep 
                    + valid_df["prompt_question"] + sep 
                    + valid_df["text"]
                  )
        
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/deberta-v3-base/{self.model_name}", 
            config=self.model_config
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score"""
        
        sep = self.tokenizer.sep_token
        in_text = (
                    test_df["prompt_title"] + sep 
                    + test_df["prompt_question"] + sep 
                    + test_df["text"]
                  )
        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 

        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [35]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: int,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):

    # delete old model files
    if os.path.exists(model_name):
        shutil.rmtree(model_name)
    
    os.mkdir(model_name)
        
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data, 
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """predict oof data"""
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"
        
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        
        train_df.loc[valid_data.index, f"{target}_pred"] = pred

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """predict using mean folds"""

    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        test_df[f"{target}_pred_{fold}"] = pred
    
    test_df[f"{target}"] = test_df[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

    return test_df

In [36]:
train = df_train.copy()
gkf = GroupKFold(n_splits=CFG.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,num_words,avg_sentence_length,...,polarity,subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count,text_clean,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,61,16.0,...,0.170455,0.334848,14,3,17,6,0,1,the third wave was an experimentto see how peo...,3.0
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,203,16.571429,...,0.048203,0.355229,59,11,37,7,0,6,the third wave developed rapidly because the ...,3.0
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,60,22.333333,...,0.075,0.31875,16,4,12,3,0,0,the third wave only started as an experiment w...,3.0
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,76,28.666667,...,-0.666667,0.666667,17,4,15,6,0,0,the experimen was orginally about how even whe...,3.0
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,27,14.5,...,0.088939,0.325909,5,2,4,4,0,0,the third wave developed so quickly due to the...,3.0


In [37]:
for target in ["content", "wording"]:
    train_by_fold(
        train,
        model_name=CFG.model_name,
        save_each_model=False,
        target=target,
        learning_rate=CFG.learning_rate,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        weight_decay=CFG.weight_decay,
        num_train_epochs=CFG.num_train_epochs,
        n_splits=CFG.n_splits,
        batch_size=CFG.batch_size,
        save_steps=CFG.save_steps,
        max_length=CFG.max_length
    )
    
    
    train = validate(
        train,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
    print(f"cv {target} rmse: {rmse}")

    test = predict(
        df_test,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.den

  0%|          | 0/5108 [00:00<?, ?ex/s]

  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.322022,0.56747
200,No log,0.581104,0.762302
300,No log,0.363667,0.603048
400,No log,0.370565,0.608741
500,0.277700,0.29605,0.544105
600,0.277700,0.208726,0.456865
700,0.277700,0.340182,0.583251
800,0.277700,0.279891,0.529048
900,0.277700,0.231359,0.480998
1000,0.154600,0.313105,0.559558


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.den

  0%|          | 0/5156 [00:00<?, ?ex/s]

  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.432247,0.657455
200,No log,0.292302,0.540649
300,No log,0.307226,0.554279
400,No log,0.254195,0.504178
500,0.261500,0.276928,0.526239
600,0.261500,0.283401,0.532354
700,0.261500,0.271028,0.520604
800,0.261500,0.334499,0.578359
900,0.261500,0.252113,0.502109
1000,0.146500,0.256369,0.506329


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.den

  0%|          | 0/5169 [00:00<?, ?ex/s]

  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.248646,0.498644
200,No log,0.211791,0.460208
300,No log,0.237323,0.487158
400,No log,0.208074,0.456152
500,0.266000,0.219113,0.468095
600,0.266000,0.216651,0.465458
700,0.266000,0.224326,0.473631
800,0.266000,0.327336,0.572132
900,0.266000,0.284765,0.533634
1000,0.151800,0.255452,0.505422


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.den

  0%|          | 0/6062 [00:00<?, ?ex/s]

  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.381897,0.617979
200,No log,0.623912,0.789881
300,No log,0.302387,0.549897
400,No log,0.487625,0.698302
500,0.263600,0.477493,0.691009
600,0.263600,0.425844,0.652567
700,0.263600,0.582975,0.763528
800,0.263600,0.500077,0.707161
900,0.263600,0.551501,0.742631
1000,0.141700,0.485229,0.696584


fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv content rmse: 0.48490020123966093
fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.den

  0%|          | 0/5108 [00:00<?, ?ex/s]

  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.495413,0.703856
200,No log,0.360614,0.600512
300,No log,0.313119,0.55957
400,No log,0.288785,0.537387
500,0.449100,0.328577,0.573217
600,0.449100,0.28796,0.536619
700,0.449100,0.296489,0.544508
800,0.449100,0.290371,0.53886
900,0.449100,0.296529,0.544545
1000,0.258700,0.289127,0.537706


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.den

  0%|          | 0/5156 [00:00<?, ?ex/s]

  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.73487,0.857246
200,No log,0.747241,0.864431
300,No log,0.6266,0.791581
400,No log,0.678248,0.823558
500,0.388100,0.959753,0.97967
600,0.388100,0.583398,0.763805
700,0.388100,0.849816,0.921855
800,0.388100,0.707743,0.841275
900,0.388100,0.67122,0.81928
1000,0.228400,0.54705,0.739628


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.den

  0%|          | 0/5169 [00:00<?, ?ex/s]

  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.737113,0.858553
200,No log,0.357126,0.5976
300,No log,0.488073,0.698622
400,No log,0.411263,0.641298
500,0.460400,0.399799,0.632297
600,0.460400,0.338406,0.581726
700,0.460400,0.358133,0.598442
800,0.460400,0.380027,0.616463
900,0.460400,0.566574,0.752711
1000,0.266500,0.522411,0.72278


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at /kaggle/input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.den

  0%|          | 0/6062 [00:00<?, ?ex/s]

  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.965287,0.98249
200,No log,0.836798,0.914766
300,No log,0.584085,0.764255
400,No log,0.529104,0.727396
500,0.408900,0.504523,0.710298
600,0.408900,0.480383,0.693097
700,0.408900,0.483517,0.695354
800,0.408900,0.472191,0.687162
900,0.408900,0.514794,0.717492
1000,0.251900,0.47795,0.691339


fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv wording rmse: 0.6229618854176151
fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [38]:
import gc 
torch.cuda.empty_cache()
gc.collect()

381

## Starting of tree model

In [39]:
train.columns

Index(['student_id', 'prompt_id', 'text', 'content', 'wording',
       'prompt_question', 'prompt_title', 'prompt_text', 'num_words',
       'avg_sentence_length', 'num_difficult_words', 'lexical_diversity',
       'num_top_words', 'num_interrogative', 'num_exclamatory',
       'num_misspelled_words', 'prompt_length', 'summary_length',
       'length_ratio', 'word_overlap_count', 'bigram_overlap_count',
       'bigram_overlap_ratio', 'trigram_overlap_count',
       'trigram_overlap_ratio', 'quotes_count', 'num_unique_words',
       'num_chars', 'num_stopwords', 'num_punctuations', 'num_words_upper',
       'num_words_title', 'mean_word_len', 'num_paragraphs', 'num_sentences',
       'polarity', 'subjectivity', 'nn_count', 'pr_count', 'vb_count',
       'jj_count', 'uh_count', 'cd_count', 'text_clean', 'fold',
       'content_pred', 'wording_pred'],
      dtype='object')

In [40]:
# drop_columns = [
#                 #"fold", 
#                 "student_id", "prompt_id", "text", "fixed_summary_text",
#                 "prompt_question", "prompt_title", 
#                 "prompt_text",
#                 "input"
#                ] + [
#                 f"content_pred_{i}" for i in range(CFG.n_splits)
#                 ] + [
#                 f"wording_pred_{i}" for i in range(CFG.n_splits)
#                 ]
targets = ["content", "wording"]
drop_columns = ["fold", "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 'text_clean',
                "prompt_text"
               ] + targets

In [41]:
import lightgbm as lgb

In [42]:
model_dict = {}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        params = {
            'boosting_type': 'gbdt',
            'random_state': 42,
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.05,
                  }

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        models.append(model)
    
    model_dict[target] = models

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4550
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 36
[LightGBM] [Info] Start training from score 0.017606
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[46]	train's rmse: 0.421535
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4397
[LightGBM] [Info] Number of data points in the train set: 5156, number of used features: 33
[LightGBM] [Info] Start training from score -0.039959
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.468097
Early stopping, best iteration is:
[96]	train's rmse: 0.467606
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4463
[LightGBM] [Info] Number of data points in the train set: 5169, number of used features: 36
[LightGBM] [Info] Start training from score 0.013356
Training un

In [43]:
# cv
rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.44436284834166023
wording_rmse : 0.5549322579095844
mcrmse : 0.49964755312562237


without prompt text features, 8 pca:
0.403,0.526,0.465

# Prediction
Making predictions on the test data  

In [44]:
drop_columns = [
                #"fold", 
                "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 
                "prompt_text",'text_clean',
                "input"
               ] + [
                f"content_pred_{i}" for i in range(CFG.n_splits)
                ] + [
                f"wording_pred_{i}" for i in range(CFG.n_splits)
                ]

In [45]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)

        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds

In [46]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

# Creating the Submission File
reating the file for submission


In [47]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)