# CommonLit - Evaluate Student Summaries
# Introduction
Create a quality assessment model for summaries written by students from grade 3 to grade 12. The quality will be evaluated based on the following two criteria:
  - content: How well the summary captures the main ideas and details of the source text
  - wording: The clarity, precision, and fluency of the language used in the summary

#  Data Loading
Loading the data and displaying basic information

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# prompts_train_path = "/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv"
# propmts_test_path = "/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv"

# summaries_train_path = "/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv"
# summaries_test_path = "/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"

prompts_train_path = "prompts_train.csv"
propmts_test_path = "prompts_test.csv"

summaries_train_path = "summaries_train.csv"
summaries_test_path = "summaries_test.csv"

In [3]:
# load files
df_prompts_train = pd.read_csv(prompts_train_path)
df_propmts_test = pd.read_csv(propmts_test_path)

df_summaries_train = pd.read_csv(summaries_train_path)
df_summaries_test = pd.read_csv(summaries_test_path)

# merge files
df_train = df_summaries_train.merge(df_prompts_train, on="prompt_id")
df_test = df_summaries_test.merge(df_propmts_test, on="prompt_id")

In [4]:
df_train.head(3)

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


# Exploratory Data Analysis (EDA)
Checking the distribution, outliers, etc.

In [5]:
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GroupKFold

import re
import string
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import seaborn as sns
from transformers import Trainer
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import shutil

from datasets import Dataset,load_dataset, load_from_disk

from datasets import load_metric, disable_progress_bar

from sklearn.metrics import mean_squared_error

from tqdm import tqdm, tqdm_notebook
# pyspellcheckerのインストール
# https://pyspellchecker.readthedocs.io/en/latest/quickstart.html
import os

In [6]:
# !pip install /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize

In [7]:
tqdm.pandas()

In [8]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)

In [9]:
# nltk.data.path.append('/kaggle/input/nltk-dataset/stopwords')
nltk.download('stopwords')
difficult_words = set(stopwords.words('english'))

# SpellChecker
spell = SpellChecker()



[nltk_data] Downloading package stopwords to /home/yi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
wordfreqfile = open('enwiki-2023-04-13.txt', 'r')
wordfreqlist = [line.split(' ')[0] for line in wordfreqfile.readlines()]
wordfreqlist_500 = set(wordfreqlist[:500])
wordfreqlist_1000 = set(wordfreqlist[:1000])
wordfreqlist_5000 = set(wordfreqlist[:5000])
wordfreqlist_10000 = set(wordfreqlist[:10000])
wordfreqlist_20000 = set(wordfreqlist[:20000])
wordfreqlist_50000 = set(wordfreqlist[:50000])
wordfreqlist_100000 = set(wordfreqlist[:100000])

In [11]:
def extract_features(text):
    
    # 
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    
    # 
    num_difficult_words = sum(1 for word in words if word.lower() not in difficult_words)
    num_unfreq_words_500 = sum(1 for word in words if word.lower() not in wordfreqlist_500)
    num_unfreq_words_1000 = sum(1 for word in words if word.lower() not in wordfreqlist_1000)
    
    num_unfreq_words_5000 = sum(1 for word in words if word.lower() not in wordfreqlist_5000)
    num_unfreq_words_10000 = sum(1 for word in words if word.lower() not in wordfreqlist_10000)
    num_unfreq_words_20000 = sum(1 for word in words if word.lower() not in wordfreqlist_20000)
    num_unfreq_words_50000 = sum(1 for word in words if word.lower() not in wordfreqlist_50000)
    num_unfreq_words_100000 = sum(1 for word in words if word.lower() not in wordfreqlist_100000)
    # 
    lexical_diversity = len(set(words)) / len(words)
    
    # 
    freq_dist = FreqDist(words)
    
    # 
    num_top_words = len([word for word, freq in freq_dist.items() if freq >= len(words) * 0.10])
    
    # 
    num_interrogative = text.count('?')
    num_exclamatory = text.count('!')
    
    # 
    misspelled_words = spell.unknown(words)
    
    # 
    return {
        'num_words': len(words),
        'avg_sentence_length': len(words) / len(sentences),
        'num_difficult_words': num_difficult_words,
        'num_unfreq_words_500':num_unfreq_words_500,
        'num_unfreq_words_1000':num_unfreq_words_1000,
        'num_unfreq_words_5000':num_unfreq_words_5000,
        'num_unfreq_words_10000':num_unfreq_words_10000,
        'num_unfreq_words_20000':num_unfreq_words_20000,
        'num_unfreq_words_50000':num_unfreq_words_50000,
        'num_unfreq_words_100000':num_unfreq_words_100000,
        'lexical_diversity': lexical_diversity,
        'num_top_words': num_top_words,
        'num_interrogative': num_interrogative,
        'num_exclamatory': num_exclamatory,
        'num_misspelled_words': len(misspelled_words)
    }


def extract_features_prompt(text):
    
    # 
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    
    # 
    num_difficult_words = sum(1 for word in words if word.lower() not in difficult_words)
    num_unfreq_words_500 = sum(1 for word in words if word.lower() not in wordfreqlist_500)
    num_unfreq_words_1000 = sum(1 for word in words if word.lower() not in wordfreqlist_1000)
    
    num_unfreq_words_5000 = sum(1 for word in words if word.lower() not in wordfreqlist_5000)
    num_unfreq_words_10000 = sum(1 for word in words if word.lower() not in wordfreqlist_10000)
    num_unfreq_words_20000 = sum(1 for word in words if word.lower() not in wordfreqlist_20000)
    num_unfreq_words_50000 = sum(1 for word in words if word.lower() not in wordfreqlist_50000)
    num_unfreq_words_100000 = sum(1 for word in words if word.lower() not in wordfreqlist_100000)
    # 
    lexical_diversity = len(set(words)) / len(words)
    
    # 
    freq_dist = FreqDist(words)
    
    # 
    num_top_words = len([word for word, freq in freq_dist.items() if freq >= len(words) * 0.10])
    
    # 
    num_interrogative = text.count('?')
    num_exclamatory = text.count('!')
    
    # 
    misspelled_words = spell.unknown(words)
    
    # 
    return {
        'prompt_num_words': len(words),
        'prompt_avg_sentence_length': len(words) / len(sentences),
        'prompt_num_difficult_words': num_difficult_words,
        'prompt_num_unfreq_words_500':num_unfreq_words_500,
        'prompt_num_unfreq_words_1000':num_unfreq_words_1000,
        'prompt_num_unfreq_words_5000':num_unfreq_words_5000,
        'prompt_num_unfreq_words_10000':num_unfreq_words_10000,
        'prompt_num_unfreq_words_20000':num_unfreq_words_20000,
        'prompt_num_unfreq_words_50000':num_unfreq_words_50000,
        'prompt_num_unfreq_words_100000':num_unfreq_words_100000,
        'prompt_lexical_diversity': lexical_diversity,
        'prompt_num_top_words': num_top_words,
        'prompt_num_interrogative': num_interrogative,
        'prompt_num_exclamatory': num_exclamatory,
        'prompt_num_misspelled_words': len(misspelled_words)
    }

In [12]:
#
df_train_features = df_train['text'].apply(extract_features).apply(pd.Series)
df_train_features_prompt = df_train['prompt_text'].apply(extract_features_prompt).apply(pd.Series)
df_train = pd.concat([df_train, df_train_features,df_train_features_prompt], axis=1)

df_test_features = df_test['text'].apply(extract_features).apply(pd.Series)
df_test_features_prompt = df_test['prompt_text'].apply(extract_features_prompt).apply(pd.Series)
df_test = pd.concat([df_test, df_test_features,df_test_features_prompt], axis=1)

In [13]:
misspell_mapping = {
    'studentdesigned': 'student designed',
    'teacherdesigned': 'teacher designed',
    'genericname': 'generic name',
    'winnertakeall': 'winner take all',
    'studentname': 'student name',
    'driveless': 'driverless',
    'teachername': 'teacher name',
    'propername': 'proper name',
    'bestlaid': 'best laid',
    'genericschool': 'generic school',
    'schoolname': 'school name',
    'winnertakesall': 'winner take all',
    'elctoral': 'electoral',
    'eletoral': 'electoral',
    'genericcity': 'generic city',
    'elctors': 'electoral',
    'venuse': 'venue',
    'blimplike': 'blimp like',
    'selfdriving': 'self driving',
    'electorals': 'electoral',
    'nearrecord': 'near record',
    'egyptianstyle': 'egyptian style',
    'oddnumbered': 'odd numbered',
    'carintensive': 'car intensive',
    'elecoral': 'electoral',
    'oction': 'auction',
    'electroal': 'electoral',
    'evennumbered': 'even numbered',
    'mesalandforms': 'mesa landforms',
    'electoralvote': 'electoral vote',
    'relativename': 'relative name',
    '22euro': 'twenty two euro',
    'ellectoral': 'electoral',
    'thirtyplus': 'thirty plus',
    'collegewon': 'college won',
    'hisher': 'higher',
    'teacherbased': 'teacher based',
    'computeranimated': 'computer animated',
    'canadidate': 'candidate',
    'studentbased': 'student based',
    'gorethanks': 'gore thanks',
    'clouddraped': 'cloud draped',
    'edgarsnyder': 'edgar snyder',
    'emotionrecognition': 'emotion recognition',
    'landfrom': 'land form',
    'fivedays': 'five days',
    'electoal': 'electoral',
    'lanform': 'land form',
    'electral': 'electoral',
    'presidentbut': 'president but',
    'teacherassigned': 'teacher assigned',
    'beacuas': 'because',
    'positionestimating': 'position estimating',
    'selfeducation': 'self education',
    'diverless': 'driverless',
    'computerdriven': 'computer driven',
    'outofcontrol': 'out of control',
    'faultthe': 'fault the',
    'unfairoutdated': 'unfair outdated',
    'aviods': 'avoid',
    'momdad': 'mom dad',
    'statesbig': 'states big',
    'presidentswing': 'president swing',
    'inconclusion': 'in conclusion',
    'handsonlearning': 'hands on learning',
    'electroral': 'electoral',
    'carowner': 'car owner',
    'elecotral': 'electoral',
    'studentassigned': 'student assigned',
    'collegefive': 'college five',
    'presidant': 'president',
    'unfairoutdatedand': 'unfair outdated and',
    'nixonjimmy': 'nixon jimmy',
    'canadates': 'candidate',
    'tabletennis': 'table tennis',
    'himher': 'him her',
    'studentsummerpacketdesigners': 'student summer packet designers',
    'studentdesign': 'student designed',
    'limting': 'limiting',
    'electrol': 'electoral',
    'campaignto': 'campaign to',
    'presendent': 'president',
    'thezebra': 'the zebra',
    'landformation': 'land formation',
    'eyetoeye': 'eye to eye',
    'selfreliance': 'self reliance',
    'studentdriven': 'student driven',
    'winnertake': 'winner take',
    'alliens': 'aliens',
    '2000but': '2000 but',
    'electionto': 'election to',
    'candidatesas': 'candidates as',
    'electers': 'electoral',
    'winnertakes': 'winner takes',
    'isfeet': 'is feet',
    'incar': 'incur',
    'wellconstructed': 'well constructed',
    'craftsmenwomen': 'crafts men women',
    'freelunch': 'free lunch',
    'twothousandrevolutions': 'two thousand revolutions',
    'ushistoryorg': 'us history org',
    'pharohs': 'pharaohs',
    'whitehot': 'white hot',
    'vizers': 'visors',
    'mrjones': 'mr jones',
    'aminute': 'a minute',
    'spoiledmeat': 'spoiled meat',
    'farmersgave': 'farmers gave',
    'spolied': 'spoiled',
    'tradgey': 'tragedy',
    'pyrimid': 'pyramid',
    'pyrimad': 'pyramid',
    'egyptiansfrom': 'egyptians from',
    'harvestthats': 'harvest that',
    'expierment': 'experiment',
    'jestthat': 'jest that',
    'twothousandrevolutionsaminute': 'two thousand revolutions a minute',
    'expirament': 'experiment',
    'nonspoiled': 'non spoiled',
    'egyptains': 'egyptians',
    'tragedys': 'tragedy',
    'pyrmaid': 'pyramid',
    'expirment': 'experiment',
    'whiteit': 'grade there',
    'gradethere': 'tragedy',
    'goverement': 'government',
    'godsthe': 'gods the',
    'paraoh': 'pharaoh',
    'classesupper': 'classes upper',
    'pharoes': 'pharaohs',
    'noblespriests': 'noble priests',
    'farmersslaves': 'farmers slaves',
    'harvestâ€”thatâ€™s': 'harvest that',
    'tradedy': 'tragedy',
    'paraohs': 'pharaohs',
    'paragrapgh': 'paragraph',
    'expieriment': 'experiment',
    'tragdey': 'tragedy',
    'pyramaid': 'pyramid',
    'pyrmid': 'pyramid',
    'prists': 'priests',
    'pharoas': 'pharaohs',
    'priets': 'priests',
    'pharoph': 'pharaohs',
    'pharaoah': 'pharaohs',
    'pharahos': 'pharaohs',
    'pharaohthe': 'pharaohs'
}

In [14]:
def decontraction(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"he's", "he is", phrase)
    phrase = re.sub(r"there's", "there is", phrase)
    phrase = re.sub(r"We're", "We are", phrase)
    phrase = re.sub(r"That's", "That is", phrase)
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"they're", "they are", phrase)
    phrase = re.sub(r"Can't", "Cannot", phrase)
    phrase = re.sub(r"wasn't", "was not", phrase)
    phrase = re.sub(r"don\x89Ûªt", "do not", phrase)
    phrase = re.sub(r"donãât", "do not", phrase)
    phrase = re.sub(r"aren't", "are not", phrase)
    phrase = re.sub(r"isn't", "is not", phrase)
    phrase = re.sub(r"What's", "What is", phrase)
    phrase = re.sub(r"haven't", "have not", phrase)
    phrase = re.sub(r"hasn't", "has not", phrase)
    phrase = re.sub(r"There's", "There is", phrase)
    phrase = re.sub(r"He's", "He is", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"You're", "You are", phrase)
    phrase = re.sub(r"I'M", "I am", phrase)
    phrase = re.sub(r"shouldn't", "should not", phrase)
    phrase = re.sub(r"wouldn't", "would not", phrase)
    phrase = re.sub(r"i'm", "I am", phrase)
    phrase = re.sub(r"I\x89Ûªm", "I am", phrase)
    phrase = re.sub(r"I'm", "I am", phrase)
    phrase = re.sub(r"Isn't", "is not", phrase)
    phrase = re.sub(r"Here's", "Here is", phrase)
    phrase = re.sub(r"you've", "you have", phrase)
    phrase = re.sub(r"you\x89Ûªve", "you have", phrase)
    phrase = re.sub(r"we're", "we are", phrase)
    phrase = re.sub(r"what's", "what is", phrase)
    phrase = re.sub(r"couldn't", "could not", phrase)
    phrase = re.sub(r"we've", "we have", phrase)
    phrase = re.sub(r"it\x89Ûªs", "it is", phrase)
    phrase = re.sub(r"doesn\x89Ûªt", "does not", phrase)
    phrase = re.sub(r"It\x89Ûªs", "It is", phrase)
    phrase = re.sub(r"Here\x89Ûªs", "Here is", phrase)
    phrase = re.sub(r"who's", "who is", phrase)
    phrase = re.sub(r"I\x89Ûªve", "I have", phrase)
    phrase = re.sub(r"y'all", "you all", phrase)
    phrase = re.sub(r"can\x89Ûªt", "cannot", phrase)
    phrase = re.sub(r"would've", "would have", phrase)
    phrase = re.sub(r"it'll", "it will", phrase)
    phrase = re.sub(r"we'll", "we will", phrase)
    phrase = re.sub(r"wouldn\x89Ûªt", "would not", phrase)
    phrase = re.sub(r"We've", "We have", phrase)
    phrase = re.sub(r"he'll", "he will", phrase)
    phrase = re.sub(r"Y'all", "You all", phrase)
    phrase = re.sub(r"Weren't", "Were not", phrase)
    phrase = re.sub(r"Didn't", "Did not", phrase)
    phrase = re.sub(r"they'll", "they will", phrase)
    phrase = re.sub(r"they'd", "they would", phrase)
    phrase = re.sub(r"DON'T", "DO NOT", phrase)
    phrase = re.sub(r"That\x89Ûªs", "That is", phrase)
    phrase = re.sub(r"they've", "they have", phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"should've", "should have", phrase)
    phrase = re.sub(r"You\x89Ûªre", "You are", phrase)
    phrase = re.sub(r"where's", "where is", phrase)
    phrase = re.sub(r"Don\x89Ûªt", "Do not", phrase)
    phrase = re.sub(r"we'd", "we would", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"weren't", "were not", phrase)
    phrase = re.sub(r"They're", "They are", phrase)
    phrase = re.sub(r"Can\x89Ûªt", "Cannot", phrase)
    phrase = re.sub(r"you\x89Ûªll", "you will", phrase)
    phrase = re.sub(r"I\x89Ûªd", "I would", phrase)
    phrase = re.sub(r"let's", "let us", phrase)
    phrase = re.sub(r"it's", "it is", phrase)
    phrase = re.sub(r"can't", "cannot", phrase)
    phrase = re.sub(r"don't", "do not", phrase)
    phrase = re.sub(r"you're", "you are", phrase)
    phrase = re.sub(r"i've", "I have", phrase)
    phrase = re.sub(r"that's", "that is", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"doesn't", "does not",phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"didn't", "did not", phrase)
    phrase = re.sub(r"ain't", "am not", phrase)
    phrase = re.sub(r"you'll", "you will", phrase)
    phrase = re.sub(r"I've", "I have", phrase)
    phrase = re.sub(r"Don't", "do not", phrase)
    phrase = re.sub(r"I'll", "I will", phrase)
    phrase = re.sub(r"I'd", "I would", phrase)
    phrase = re.sub(r"Let's", "Let us", phrase)
    phrase = re.sub(r"you'd", "You would", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"Ain't", "am not", phrase)
    phrase = re.sub(r"Haven't", "Have not", phrase)
    phrase = re.sub(r"Could've", "Could have", phrase)
    phrase = re.sub(r"youve", "you have", phrase)  
    phrase = re.sub(r"donå«t", "do not", phrase)
    return phrase

In [15]:
def clean_text(text):
    text = decontraction(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    return text

In [16]:
def remove_punctuations(text):
    for punctuation in list(string.punctuation):
        text = text.replace(punctuation, '')
    return text


In [17]:
def clean_number(text):
    text = re.sub(r'(\d+)([a-zA-Z])', '\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', '\g<1>\g<2> ', text)
    text = re.sub(r'(\d+),(\d+)', '\g<1>\g<2>', text)
    return text

In [18]:
def clean_misspell(text):
    for bad_word in misspell_mapping:
        if bad_word in text:
            text = text.replace(bad_word, misspell_mapping[bad_word])
    return text

In [19]:
def pos_count(sent):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    jj_count = 0   #Adjective
    uh_count = 0   #Interjection
    cd_count = 0   #Numerics
    
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)

    for token in sent:
        if token[1] in ['NN','NNP','NNS']:
            nn_count += 1

        if token[1] in ['PRP','PRP$']:
            pr_count += 1

        if token[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']:
            vb_count += 1

        if token[1] in ['JJ','JJR','JJS']:
            jj_count += 1

        if token[1] in ['UH']:
            uh_count += 1

        if token[1] in ['CD']:
            cd_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, jj_count, uh_count, cd_count])

In [20]:
# from autocorrect import Speller
# from spellchecker import SpellChecker
# speller = Speller(lang='en')
# spellchecker = SpellChecker()
# stop_words = set(stopwords.words('english'))

In [21]:
# def add_spelling_dictionary(tokens):
#     spellchecker.word_frequency.load_words(tokens)
#     speller.nlp_data.update({token:1000 for token in tokens})

In [22]:
# def spelling(text):
#     wordlist = text.split()
#     amount_miss = len(list(spellchecker.unknown(wordlist)))
#     return amount_miss

In [23]:
def word_overlap_count(row):
    def check_is_stop_word(word):
        return word in difficult_words
    
    prompt_words = row['prompt_tokens']
    summary_words = row['summary_tokens']
    
    if difficult_words:
        prompt_words = list(filter(check_is_stop_word, prompt_words))
        summary_words = list(filter(check_is_stop_word, summary_words))
    
    return len(set(prompt_words).intersection(set(summary_words)))

def ngrams(token, n):
    ngrams = zip(*[token[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


def ngram_co_occurrence(row, n):
    original_tokens = row['prompt_tokens']
    summary_tokens = row['summary_tokens']

    original_ngrams = set(ngrams(original_tokens, n))
    summary_ngrams = set(ngrams(summary_tokens, n))

    common_ngrams = original_ngrams.intersection(summary_ngrams)
    return len(common_ngrams)


def quotes_count(row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

In [24]:
def text_preprocess(data):
    data["prompt_length"] = data["prompt_text"].progress_apply(lambda x: len(word_tokenize(x)))
    data["prompt_tokens"] = data["prompt_text"].progress_apply(lambda x: word_tokenize(x))
    data["summary_length"] = data["text"].progress_apply(lambda x: len(word_tokenize(x)))
    data["summary_tokens"] = data["text"].progress_apply(lambda x: word_tokenize(x))
#     data["prompt_tokens"].progress_apply(lambda x: add_spelling_dictionary(x))
    #summaries["fixed_summary_text"] = summaries["text"].progress_apply(lambda x: speller(x))
#     data["splling_err_num"] = data["text"].progress_apply(spelling)
    
    df = data
    df['length_ratio'] = df['summary_length'] / df['prompt_length']
    df['word_overlap_count'] = df.progress_apply(word_overlap_count, axis=1)
    df['bigram_overlap_count'] = df.progress_apply(ngram_co_occurrence, args=(2,), axis=1)
    df['bigram_overlap_ratio'] = df['bigram_overlap_count'] / (df['summary_length'] - 1)
    df['trigram_overlap_count'] = df.progress_apply(ngram_co_occurrence, args=(3,), axis=1)
    df['trigram_overlap_ratio'] = df['trigram_overlap_count'] / (df['summary_length'] - 2)
    df['quotes_count'] = df.progress_apply(quotes_count, axis=1)
    return df.drop(columns=["summary_tokens", "prompt_tokens"])

In [25]:
df_train = text_preprocess(df_train)

100%|██████████| 7165/7165 [00:20<00:00, 346.68it/s]
100%|██████████| 7165/7165 [00:20<00:00, 343.33it/s]
100%|██████████| 7165/7165 [00:02<00:00, 2960.68it/s]
100%|██████████| 7165/7165 [00:02<00:00, 2987.08it/s]
100%|██████████| 7165/7165 [00:00<00:00, 11267.90it/s]
100%|██████████| 7165/7165 [00:00<00:00, 8401.66it/s]
100%|██████████| 7165/7165 [00:00<00:00, 7459.44it/s]
100%|██████████| 7165/7165 [00:00<00:00, 106890.99it/s]


In [26]:
df_train["num_words"] = df_train["text"].progress_apply(lambda x: len(str(x).split()))
df_train["num_unique_words"] = df_train["text"].progress_apply(lambda x: len(set(str(x).split())))
df_train["num_chars"] = df_train["text"].progress_apply(lambda x: len(str(x)))
df_train["num_stopwords"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
df_train["num_punctuations"] =df_train['text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
df_train["num_words_upper"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
df_train["num_words_title"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df_train["mean_word_len"] = df_train["text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_train["num_paragraphs"] = df_train["text"].progress_apply(lambda x: len(x.split('\n')))
df_train["num_sentences"] = df_train["text"].progress_apply(lambda x: len(str(x).split('.')))
df_train["polarity"] = df_train['text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
df_train["subjectivity"] = df_train['text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
df_train[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = df_train['text'].progress_apply(pos_count)

# df_train["prompt_num_words"] = df_train["prompt_text"].progress_apply(lambda x: len(str(x).split()))
# df_train["prompt_num_unique_words"] = df_train["prompt_text"].progress_apply(lambda x: len(set(str(x).split())))
# df_train["prompt_num_chars"] = df_train["prompt_text"].progress_apply(lambda x: len(str(x)))
# df_train["prompt_num_stopwords"] = df_train["prompt_text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
# df_train["prompt_num_punctuations"] =df_train['prompt_text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
# df_train["prompt_num_words_upper"] = df_train["prompt_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
# df_train["prompt_num_words_title"] = df_train["prompt_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
# df_train["prompt_mean_word_len"] = df_train["prompt_text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# df_train["prompt_num_paragraphs"] = df_train["prompt_text"].progress_apply(lambda x: len(x.split('\n')))
# df_train["prompt_num_sentences"] = df_train["prompt_text"].progress_apply(lambda x: len(str(x).split('.')))
# df_train["prompt_polarity"] = df_train['prompt_text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
# df_train["prompt_subjectivity"] = df_train['prompt_text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
# df_train[['prompt_nn_count','prompt_pr_count','prompt_vb_count','prompt_jj_count','prompt_uh_count','prompt_cd_count']] = df_train['prompt_text'].progress_apply(pos_count)
df_train.head()

100%|██████████| 7165/7165 [00:00<00:00, 308965.92it/s]
100%|██████████| 7165/7165 [00:00<00:00, 143932.24it/s]
100%|██████████| 7165/7165 [00:00<00:00, 1196916.85it/s]
100%|██████████| 7165/7165 [00:26<00:00, 268.61it/s]
100%|██████████| 7165/7165 [00:01<00:00, 5127.44it/s]
100%|██████████| 7165/7165 [00:00<00:00, 187105.82it/s]
100%|██████████| 7165/7165 [00:00<00:00, 171055.28it/s]
100%|██████████| 7165/7165 [00:00<00:00, 67077.93it/s]
100%|██████████| 7165/7165 [00:00<00:00, 942252.09it/s]
100%|██████████| 7165/7165 [00:00<00:00, 767557.74it/s]
100%|██████████| 7165/7165 [00:01<00:00, 3629.26it/s]
100%|██████████| 7165/7165 [00:01<00:00, 3626.03it/s]
100%|██████████| 7165/7165 [00:15<00:00, 466.83it/s]


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,num_words,avg_sentence_length,...,num_paragraphs,num_sentences,polarity,subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,61,16.0,...,1,4,0.170455,0.334848,14,3,17,6,0,1
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,203,16.571429,...,1,14,0.048203,0.355229,59,11,37,7,0,6
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,60,22.333333,...,1,6,0.075,0.31875,16,4,12,3,0,0
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,76,28.666667,...,1,4,-0.666667,0.666667,17,4,15,6,0,0
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,27,14.5,...,1,3,0.088939,0.325909,5,2,4,4,0,0


In [27]:
df_test = text_preprocess(df_test)

100%|██████████| 4/4 [00:00<00:00, 8820.83it/s]
100%|██████████| 4/4 [00:00<00:00, 8586.09it/s]
100%|██████████| 4/4 [00:00<00:00, 7869.24it/s]
100%|██████████| 4/4 [00:00<00:00, 7778.03it/s]
100%|██████████| 4/4 [00:00<00:00, 3888.11it/s]
100%|██████████| 4/4 [00:00<00:00, 4394.24it/s]
100%|██████████| 4/4 [00:00<00:00, 4432.55it/s]
100%|██████████| 4/4 [00:00<00:00, 4655.17it/s]


In [28]:
df_test["num_words"] = df_test["text"].progress_apply(lambda x: len(str(x).split()))
df_test["num_unique_words"] = df_test["text"].progress_apply(lambda x: len(set(str(x).split())))
df_test["num_chars"] = df_test["text"].progress_apply(lambda x: len(str(x)))
df_test["num_stopwords"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
df_test["num_punctuations"] =df_test['text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
df_test["num_words_upper"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
df_test["num_words_title"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df_test["mean_word_len"] = df_test["text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_test["num_paragraphs"] = df_test["text"].progress_apply(lambda x: len(x.split('\n')))
df_test["num_sentences"] = df_test["text"].progress_apply(lambda x: len(str(x).split('.')))
df_test["polarity"] = df_test['text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
df_test["subjectivity"] = df_test['text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
df_test[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = df_test['text'].progress_apply(pos_count)

# df_test["prompt_num_words"] = df_test["prompt_text"].progress_apply(lambda x: len(str(x).split()))
# df_test["prompt_num_unique_words"] = df_test["prompt_text"].progress_apply(lambda x: len(set(str(x).split())))
# df_test["prompt_num_chars"] = df_test["prompt_text"].progress_apply(lambda x: len(str(x)))
# df_test["prompt_num_stopwords"] = df_test["prompt_text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
# df_test["prompt_num_punctuations"] =df_test['prompt_text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
# df_test["prompt_num_words_upper"] = df_test["prompt_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
# df_test["prompt_num_words_title"] = df_test["prompt_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
# df_test["prompt_mean_word_len"] = df_test["prompt_text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
# df_test["prompt_num_paragraphs"] = df_test["prompt_text"].progress_apply(lambda x: len(x.split('\n')))
# df_test["prompt_num_sentences"] = df_test["prompt_text"].progress_apply(lambda x: len(str(x).split('.')))
# df_test["prompt_polarity"] = df_test['prompt_text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
# df_test["prompt_subjectivity"] = df_test['prompt_text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
# df_test[['prompt_nn_count','prompt_pr_count','prompt_vb_count','prompt_jj_count','prompt_uh_count','prompt_cd_count']] = df_test['prompt_text'].progress_apply(pos_count)

df_test.head()

100%|██████████| 4/4 [00:00<00:00, 24600.02it/s]
100%|██████████| 4/4 [00:00<00:00, 26886.56it/s]
100%|██████████| 4/4 [00:00<00:00, 16304.39it/s]
100%|██████████| 4/4 [00:00<00:00, 2934.11it/s]
100%|██████████| 4/4 [00:00<00:00, 15577.73it/s]
100%|██████████| 4/4 [00:00<00:00, 11169.92it/s]
100%|██████████| 4/4 [00:00<00:00, 18703.70it/s]
100%|██████████| 4/4 [00:00<00:00, 14807.78it/s]
100%|██████████| 4/4 [00:00<00:00, 18872.01it/s]
100%|██████████| 4/4 [00:00<00:00, 19152.07it/s]
100%|██████████| 4/4 [00:00<00:00, 5090.17it/s]
100%|██████████| 4/4 [00:00<00:00, 4950.49it/s]
100%|██████████| 4/4 [00:00<00:00, 1467.31it/s]


Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text,num_words,avg_sentence_length,num_difficult_words,num_unfreq_words_500,...,num_paragraphs,num_sentences,polarity,subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...,3,3.0,3.0,2.0,...,1,1,0.0,0.0,2,0,0,0,0,1
1,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...,3,3.0,3.0,2.0,...,1,1,0.0,0.0,2,0,0,0,0,1
2,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...,3,3.0,3.0,2.0,...,1,1,0.0,0.0,2,0,0,0,0,1
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...,3,3.0,3.0,2.0,...,1,1,0.0,0.0,2,0,0,0,0,1


In [29]:
df_train['text_clean'] = df_train['text'].progress_apply(clean_text)
df_train['text_clean'] = df_train['text_clean'].progress_apply(remove_punctuations)
df_train['text_clean'] = df_train['text_clean'].progress_apply(clean_number)
df_train['text_clean'] = df_train['text_clean'].progress_apply(clean_misspell)

100%|██████████| 7165/7165 [00:00<00:00, 13759.62it/s]
100%|██████████| 7165/7165 [00:00<00:00, 180375.54it/s]
100%|██████████| 7165/7165 [00:00<00:00, 54614.13it/s]
100%|██████████| 7165/7165 [00:00<00:00, 24985.32it/s]


In [30]:
df_test['text_clean'] = df_test['text'].progress_apply(clean_text)
df_test['text_clean'] = df_test['text_clean'].progress_apply(remove_punctuations)
df_test['text_clean'] = df_test['text_clean'].progress_apply(clean_number)
df_test['text_clean'] = df_test['text_clean'].progress_apply(clean_misspell)

100%|██████████| 4/4 [00:00<00:00, 10180.35it/s]
100%|██████████| 4/4 [00:00<00:00, 12018.06it/s]
100%|██████████| 4/4 [00:00<00:00, 11259.88it/s]
100%|██████████| 4/4 [00:00<00:00, 11008.67it/s]


# Dataset Creation
Splitting the data into training and testing sets  
Checking the shape of the data

In [30]:
from transformers import AutoTokenizer

model_name = "microsoft/deberta-v3-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Model Training
Training the model and monitoring the progress


In [39]:
class CFG:
    model_name="deberta-v3-base"
    learning_rate=1.5e-5
    weight_decay=0.02 # Regularization 防止过拟合
    hidden_dropout_prob=0.007 # Dropout setting 随机失活的概率
    attention_probs_dropout_prob=0.007
    num_train_epochs= 1
    n_splits=4
    batch_size=4
    random_seed=42
    save_steps=100 
    max_length=512 # Max length of imput 限制输入序列的长度

In [40]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    
    return (content_score + wording_score)/2

In [41]:
class ContentScoreRegressor: 
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "text"]#"fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(f"microsoft/deberta-v3-base")
        self.model_config = AutoConfig.from_pretrained(f"microsoft/deberta-v3-base")
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """fine-tuning"""
        
        sep = self.tokenizer.sep_token
        train_df[self.input_col] = (
                    train_df["prompt_title"] + sep 
                    + train_df["prompt_question"] + sep 
                    + train_df["text"]
                  )

        valid_df[self.input_col] = (
                    valid_df["prompt_title"] + sep 
                    + valid_df["prompt_question"] + sep 
                    + valid_df["text"]
                  )
        
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"microsoft/deberta-v3-base", 
            config=self.model_config
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score"""
        
        sep = self.tokenizer.sep_token
        in_text = (
                    test_df["prompt_title"] + sep 
                    + test_df["prompt_question"] + sep 
                    + test_df["text"]
                  )
        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 

        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [42]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: int,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):

    # delete old model files
    if os.path.exists(model_name):
        shutil.rmtree(model_name)
    
    os.mkdir(model_name)
        
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data, 
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """predict oof data"""
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"
        
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        
        train_df.loc[valid_data.index, f"{target}_pred"] = pred

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """predict using mean folds"""

    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        test_df[f"{target}_pred_{fold}"] = pred
    
    test_df[f"{target}"] = test_df[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

    return test_df

In [43]:
train = df_train.copy()
gkf = GroupKFold(n_splits=CFG.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    train.loc[val_index, "fold"] = i

train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,num_words,avg_sentence_length,...,polarity,subjectivity,nn_count,pr_count,vb_count,jj_count,uh_count,cd_count,text_clean,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,61,16.0,...,0.170455,0.334848,14,3,17,6,0,1,the third wave was an experimentto see how peo...,3.0
1,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,203,16.571429,...,0.048203,0.355229,59,11,37,7,0,6,the third wave developed rapidly because the ...,3.0
2,0095993991fe,814d6b,The third wave only started as an experiment w...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,60,22.333333,...,0.075,0.31875,16,4,12,3,0,0,the third wave only started as an experiment w...,3.0
3,00c20c6ddd23,814d6b,The experimen was orginally about how even whe...,0.567975,0.969062,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,76,28.666667,...,-0.666667,0.666667,17,4,15,6,0,0,the experimen was orginally about how even whe...,3.0
4,00d40ad10dc9,814d6b,The third wave developed so quickly due to the...,-0.910596,-0.081769,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,27,14.5,...,0.088939,0.325909,5,2,4,4,0,0,the third wave developed so quickly due to the...,3.0


In [46]:
for target in ["content", "wording"]:
    train_by_fold(
        train,
        model_name=CFG.model_name,
        save_each_model=False,
        target=target,
        learning_rate=CFG.learning_rate,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        weight_decay=CFG.weight_decay,
        num_train_epochs=CFG.num_train_epochs,
        n_splits=CFG.n_splits,
        batch_size=CFG.batch_size,
        save_steps=CFG.save_steps,
        max_length=CFG.max_length
    )
    
    
    train = validate(
        train,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    rmse = mean_squared_error(train[target], train[f"{target}_pred"], squared=False)
    print(f"cv {target} rmse: {rmse}")

    test = predict(
        df_test,
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_

Map:   0%|          | 0/5108 [00:00<?, ? examples/s]

Map:   0%|          | 0/2057 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.401398,0.633559
200,No log,0.319006,0.564806
300,No log,0.426074,0.652743
400,No log,0.23917,0.48905
500,0.347800,0.464791,0.681756
600,0.347800,0.317144,0.563156
700,0.347800,0.214987,0.463667
800,0.347800,0.236764,0.486584
900,0.347800,0.32954,0.574055
1000,0.223200,0.270699,0.520287


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_

Map:   0%|          | 0/5156 [00:00<?, ? examples/s]

Map:   0%|          | 0/2009 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.385737,0.621077
200,No log,0.580042,0.761605
300,No log,0.319427,0.565179
400,No log,0.34918,0.590914
500,0.337400,0.293957,0.542178
600,0.337400,0.319976,0.565664
700,0.337400,0.456921,0.675959
800,0.337400,0.363039,0.602527
900,0.337400,0.336576,0.580152
1000,0.185100,0.284807,0.533673


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

Map:   0%|          | 0/1996 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.331818,0.576037
200,No log,0.243977,0.49394
300,No log,0.323284,0.568581
400,No log,0.228092,0.47759
500,0.336600,0.221806,0.470963
600,0.336600,0.211604,0.460005
700,0.336600,0.245315,0.495293
800,0.336600,0.22913,0.478676
900,0.336600,0.244342,0.49431
1000,0.211600,0.263096,0.512929


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_

Map:   0%|          | 0/6062 [00:00<?, ? examples/s]

Map:   0%|          | 0/1103 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.419256,0.647499
200,No log,0.398849,0.631545
300,No log,0.547738,0.740093
400,No log,0.538486,0.733816
500,0.341000,0.471162,0.686412
600,0.341000,0.557215,0.746468
700,0.341000,0.351471,0.59285
800,0.341000,0.387574,0.622554
900,0.341000,0.343305,0.585923
1000,0.187500,0.383445,0.61923


fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


Map:   0%|          | 0/2057 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


Map:   0%|          | 0/2009 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


Map:   0%|          | 0/1996 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


Map:   0%|          | 0/1103 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv content rmse: 0.501216233911963
fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_

Map:   0%|          | 0/5108 [00:00<?, ? examples/s]

Map:   0%|          | 0/2057 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.495753,0.704097
200,No log,0.492575,0.701837
300,No log,0.374369,0.611857
400,No log,0.342094,0.584888
500,0.553100,0.315342,0.561553
600,0.553100,0.386581,0.621757
700,0.553100,0.326614,0.571502
800,0.553100,0.346566,0.588698
900,0.553100,0.293005,0.5413
1000,0.373700,0.344916,0.587295


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_

Map:   0%|          | 0/5156 [00:00<?, ? examples/s]

Map:   0%|          | 0/2009 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.645586,0.803484
200,No log,0.603143,0.776623
300,No log,0.818556,0.904741
400,No log,0.813777,0.902096
500,0.502700,0.889608,0.94319
600,0.502700,0.75799,0.870626
700,0.502700,0.921968,0.960191
800,0.502700,0.70323,0.838588
900,0.502700,0.669797,0.818411
1000,0.317500,0.6565,0.810247


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

Map:   0%|          | 0/1996 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,0.771314,0.878245
200,No log,0.462933,0.680392
300,No log,0.417501,0.646143
400,No log,0.446097,0.667905
500,0.566900,0.422573,0.650056
600,0.566900,0.313759,0.560142
700,0.566900,0.36897,0.607429
800,0.566900,0.442364,0.665104
900,0.566900,0.378227,0.615001
1000,0.405300,0.326535,0.571433


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[self.input_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df[self.input_col] = (
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_

Map:   0%|          | 0/6062 [00:00<?, ? examples/s]

Map:   0%|          | 0/1103 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rmse
100,No log,1.127853,1.062004
200,No log,0.868088,0.931713
300,No log,1.371032,1.170911
400,No log,0.685915,0.8282
500,0.502700,0.508128,0.712831
600,0.502700,0.61811,0.7862
700,0.502700,0.519488,0.720755
800,0.502700,0.470154,0.685678
900,0.502700,0.505365,0.71089
1000,0.326600,0.517265,0.719212


fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


Map:   0%|          | 0/2057 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


Map:   0%|          | 0/2009 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


Map:   0%|          | 0/1996 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[self.input_col] = in_text


Map:   0%|          | 0/1103 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv wording rmse: 0.6367536772260763
fold 0:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [45]:
import gc 
torch.cuda.empty_cache()
gc.collect()

7226

## Starting of tree model

In [57]:
X_train_cv.columns

Index(['num_words', 'avg_sentence_length', 'num_difficult_words',
       'num_unfreq_words_500', 'num_unfreq_words_1000',
       'num_unfreq_words_5000', 'num_unfreq_words_10000',
       'num_unfreq_words_20000', 'num_unfreq_words_50000',
       'num_unfreq_words_100000', 'lexical_diversity', 'num_top_words',
       'num_interrogative', 'num_exclamatory', 'num_misspelled_words',
       'prompt_num_words', 'prompt_avg_sentence_length',
       'prompt_num_difficult_words', 'prompt_num_unfreq_words_500',
       'prompt_num_unfreq_words_1000', 'prompt_num_unfreq_words_5000',
       'prompt_num_unfreq_words_10000', 'prompt_num_unfreq_words_20000',
       'prompt_num_unfreq_words_50000', 'prompt_num_unfreq_words_100000',
       'prompt_lexical_diversity', 'prompt_num_top_words',
       'prompt_num_interrogative', 'prompt_num_exclamatory',
       'prompt_num_misspelled_words', 'prompt_length', 'summary_length',
       'length_ratio', 'word_overlap_count', 'bigram_overlap_count',
       'bigra

In [59]:
test_copy.drop(columns=drop_columns).columns

Index(['num_words', 'avg_sentence_length', 'num_difficult_words',
       'num_unfreq_words_500', 'num_unfreq_words_1000',
       'num_unfreq_words_5000', 'num_unfreq_words_10000',
       'num_unfreq_words_20000', 'num_unfreq_words_50000',
       'num_unfreq_words_100000', 'lexical_diversity', 'num_top_words',
       'num_interrogative', 'num_exclamatory', 'num_misspelled_words',
       'prompt_num_words', 'prompt_avg_sentence_length',
       'prompt_num_difficult_words', 'prompt_num_unfreq_words_500',
       'prompt_num_unfreq_words_1000', 'prompt_num_unfreq_words_5000',
       'prompt_num_unfreq_words_10000', 'prompt_num_unfreq_words_20000',
       'prompt_num_unfreq_words_50000', 'prompt_num_unfreq_words_100000',
       'prompt_lexical_diversity', 'prompt_num_top_words',
       'prompt_num_interrogative', 'prompt_num_exclamatory',
       'prompt_num_misspelled_words', 'prompt_length', 'summary_length',
       'length_ratio', 'word_overlap_count', 'bigram_overlap_count',
       'bigra

In [60]:
df_test

Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text,num_words,avg_sentence_length,num_difficult_words,num_unfreq_words_500,...,content_pred_0,content_pred_1,content_pred_2,content_pred_3,content,wording_pred_0,wording_pred_1,wording_pred_2,wording_pred_3,wording
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...,3,3.0,3.0,2.0,...,-1.37364,-1.669059,-1.440567,-1.526402,-1.502417,-1.402608,-1.129992,-1.285518,-1.358818,-1.294234
1,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...,3,3.0,3.0,2.0,...,-1.376333,-1.676393,-1.441463,-1.535597,-1.507447,-1.420122,-1.147928,-1.289689,-1.372256,-1.307498
2,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...,3,3.0,3.0,2.0,...,-1.372934,-1.668015,-1.445778,-1.526391,-1.503279,-1.405735,-1.130658,-1.289774,-1.358575,-1.296185
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...,3,3.0,3.0,2.0,...,-1.381322,-1.685353,-1.449115,-1.543126,-1.514729,-1.429919,-1.14466,-1.30005,-1.379801,-1.313608


In [64]:
train_copy = pd.concat([df_train,train[['fold', 'content_pred', 'wording_pred']]],axis = 1)
test_copy = test

In [65]:
# drop_columns = [
#                 #"fold", 
#                 "student_id", "prompt_id", "text", "fixed_summary_text",
#                 "prompt_question", "prompt_title", 
#                 "prompt_text",
#                 "input"
#                ] + [
#                 f"content_pred_{i}" for i in range(CFG.n_splits)
#                 ] + [
#                 f"wording_pred_{i}" for i in range(CFG.n_splits)
#                 ]
targets = ["content", "wording"]
drop_columns = ["fold", "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 'text_clean',
                "prompt_text"
               ] + targets

In [66]:
import lightgbm as lgb

In [67]:
model_dict = {}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train_copy[train_copy["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train_copy[train_copy["fold"] != fold][target]

        X_eval_cv = train_copy[train_copy["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train_copy[train_copy["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        params = {
            'boosting_type': 'gbdt',
            'random_state': 42,
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.05,
                  }

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        models.append(model)
    
    model_dict[target] = models

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001383 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5307
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 56
[LightGBM] [Info] Start training from score 0.017606
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.395511
Early stopping, best iteration is:
[109]	train's rmse: 0.394583
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5082
[LightGBM] [Info] Number of data points in the train set: 5156, number of used features: 53
[LightGBM] [Info] Start training from score -0.039959
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.505873
Early stopping, best iteration is:
[100]	train's rmse: 0.505873
[LightGBM

In [68]:
# cv
rmses = []

for target in targets:
    models = model_dict[target]

    preds = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train_copy[train_copy["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train_copy[train_copy["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.45948373619657434
wording_rmse : 0.5627812349099749
mcrmse : 0.5111324855532746


# Prediction
Making predictions on the test data  

In [69]:
drop_columns = [
                #"fold", 
                "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 
                "prompt_text",'text_clean',
                "input"
               ] + [
                f"content_pred_{i}" for i in range(CFG.n_splits)
                ] + [
                f"wording_pred_{i}" for i in range(CFG.n_splits)
                ]

In [70]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test_copy.drop(columns=drop_columns)

        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds

In [71]:
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

# Creating the Submission File
reating the file for submission


In [72]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)