# CommonLit - Evaluate Student Summaries
# Introduction
Create a quality assessment model for summaries written by students from grade 3 to grade 12. The quality will be evaluated based on the following two criteria:
  - content: How well the summary captures the main ideas and details of the source text
  - wording: The clarity, precision, and fluency of the language used in the summary

#  Data Loading
Loading the data and displaying basic information

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
prompts_train_path = "/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv"
propmts_test_path = "/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv"

summaries_train_path = "/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv"
summaries_test_path = "/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv"

In [3]:
# load files
df_prompts_train = pd.read_csv(prompts_train_path)
df_propmts_test = pd.read_csv(propmts_test_path)

df_summaries_train = pd.read_csv(summaries_train_path)
df_summaries_test = pd.read_csv(summaries_test_path)

# merge files
df_train = df_summaries_train.merge(df_prompts_train, on="prompt_id")
df_test = df_summaries_test.merge(df_propmts_test, on="prompt_id")

In [4]:
# df_train = df_train.groupby('prompt_id', as_index=False).head(50).reset_index(drop=True)
# df_train

# Exploratory Data Analysis (EDA)
Checking the distribution, outliers, etc.

In [5]:
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GroupKFold

import re
import string
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import seaborn as sns
from transformers import Trainer
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import shutil

from datasets import Dataset,load_dataset, load_from_disk

from datasets import load_metric, disable_progress_bar

from sklearn.metrics import mean_squared_error

from tqdm import tqdm, tqdm_notebook
import os

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [6]:
!pip install /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
from spellchecker import SpellChecker
from nltk.tokenize import word_tokenize

tqdm.pandas()

Processing /kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.7.2


In [7]:
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=42)

In [8]:
nltk.data.path.append('/kaggle/input/nltk-dataset/stopwords')
nltk.download('stopwords')
difficult_words = set(stopwords.words('english'))

# SpellChecker
spell = SpellChecker()

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [9]:
wordfreqfile = open('/kaggle/input/wordfreq-for-nlp-en/enwiki-2023-04-13.txt', 'r')
wordfreqlist = [line.split(' ')[0] for line in wordfreqfile.readlines()]
wordfreqlist_500 = set(wordfreqlist[:500])
wordfreqlist_1000 = set(wordfreqlist[:1000])
wordfreqlist_5000 = set(wordfreqlist[:5000])
wordfreqlist_10000 = set(wordfreqlist[:10000])
wordfreqlist_20000 = set(wordfreqlist[:20000])
wordfreqlist_50000 = set(wordfreqlist[:50000])
wordfreqlist_100000 = set(wordfreqlist[:100000])

In [10]:
def extract_features(text):
    
    # 
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    
    # 
    num_difficult_words = len(set(word for word in words if word.lower() not in difficult_words))
    num_unfreq_words_500 = len(set(word for word in words if word.lower() not in wordfreqlist_500))
    num_unfreq_words_1000 = len(set(word for word in words if word.lower() not in wordfreqlist_1000))
    
    num_unfreq_words_5000 = len(set(word for word in words if word.lower() not in wordfreqlist_5000))
    num_unfreq_words_10000 = len(set(word for word in words if word.lower() not in wordfreqlist_10000))
    num_unfreq_words_20000 = len(set(word for word in words if word.lower() not in wordfreqlist_20000))
    num_unfreq_words_50000 = len(set(word for word in words if word.lower() not in wordfreqlist_50000))
    num_unfreq_words_100000 = len(set(word for word in words if word.lower() not in wordfreqlist_100000))
    # 
    lexical_diversity = len(set(words)) / len(words)
    
    # 
    freq_dist = FreqDist(words)
    
    # 
    num_top_words = len([word for word, freq in freq_dist.items() if freq >= len(words) * 0.10])
    
    # 
    num_interrogative = text.count('?')
    num_exclamatory = text.count('!')
    
    # 
    misspelled_words = spell.unknown(words)
    
    # 
    return {
        'num_words': len(words),
        'avg_sentence_length': len(words) / len(sentences),
        'num_difficult_words': num_difficult_words,
        'num_unfreq_words_500':num_unfreq_words_500,
        'num_unfreq_words_1000':num_unfreq_words_1000,
        'num_unfreq_words_5000':num_unfreq_words_5000,
        'num_unfreq_words_10000':num_unfreq_words_10000,
        'num_unfreq_words_20000':num_unfreq_words_20000,
        'num_unfreq_words_50000':num_unfreq_words_50000,
        'num_unfreq_words_100000':num_unfreq_words_100000,
        'num_unfreq_words_500_ratio':num_unfreq_words_500 / len(set(words)),
        'num_unfreq_words_1000_ratio':num_unfreq_words_1000 / len(set(words)),
        'num_unfreq_words_5000_ratio':num_unfreq_words_5000 / len(set(words)),
        'num_unfreq_words_10000_ratio':num_unfreq_words_10000 / len(set(words)),
        'num_unfreq_words_20000_ratio':num_unfreq_words_20000 / len(set(words)),
        'num_unfreq_words_50000_ratio':num_unfreq_words_50000 / len(set(words)),
        'num_unfreq_words_100000_ratio':num_unfreq_words_100000 / len(set(words)),



        'lexical_diversity': lexical_diversity,
        'num_top_words': num_top_words,
        'num_interrogative': num_interrogative,
        'num_exclamatory': num_exclamatory,
        'num_misspelled_words': len(misspelled_words)
    }


def extract_features_prompt(text):
    
    # 
    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text)
    
    # 
    num_difficult_words = len(set(word for word in words if word.lower() not in difficult_words))
    num_unfreq_words_500 = len(set(word for word in words if word.lower() not in wordfreqlist_500))
    num_unfreq_words_1000 = len(set(word for word in words if word.lower() not in wordfreqlist_1000))
    
    num_unfreq_words_5000 = len(set(word for word in words if word.lower() not in wordfreqlist_5000))
    num_unfreq_words_10000 = len(set(word for word in words if word.lower() not in wordfreqlist_10000))
    num_unfreq_words_20000 = len(set(word for word in words if word.lower() not in wordfreqlist_20000))
    num_unfreq_words_50000 = len(set(word for word in words if word.lower() not in wordfreqlist_50000))
    num_unfreq_words_100000 = len(set(word for word in words if word.lower() not in wordfreqlist_100000))
    # 
    lexical_diversity = len(set(words)) / len(words)
    
    # 
    freq_dist = FreqDist(words)
    
    # 
    num_top_words = len([word for word, freq in freq_dist.items() if freq >= len(words) * 0.10])
    
    # 
    num_interrogative = text.count('?')
    num_exclamatory = text.count('!')
    
    # 
    misspelled_words = spell.unknown(words)
    
    # 
    return {
        'prompt_num_words': len(words),
        'prompt_avg_sentence_length': len(words) / len(sentences),
        'prompt_num_difficult_words': num_difficult_words,
        'prompt_num_unfreq_words_500':num_unfreq_words_500,
        'prompt_num_unfreq_words_1000':num_unfreq_words_1000,
        'prompt_num_unfreq_words_5000':num_unfreq_words_5000,
        'prompt_num_unfreq_words_10000':num_unfreq_words_10000,
        'prompt_num_unfreq_words_20000':num_unfreq_words_20000,
        'prompt_num_unfreq_words_50000':num_unfreq_words_50000,
        'prompt_num_unfreq_words_100000':num_unfreq_words_100000,
        'prompt_lexical_diversity': lexical_diversity,
        'prompt_num_top_words': num_top_words,
        'prompt_num_interrogative': num_interrogative,
        'prompt_num_exclamatory': num_exclamatory,
        'prompt_num_misspelled_words': len(misspelled_words)
    }

In [11]:
#
df_train_features = df_train['text'].apply(extract_features).apply(pd.Series)
df_train_features_prompt = df_train['prompt_text'].apply(extract_features_prompt).apply(pd.Series)
df_train = pd.concat([df_train, df_train_features,df_train_features_prompt], axis=1)

df_test_features = df_test['text'].apply(extract_features).apply(pd.Series)
df_test_features_prompt = df_test['prompt_text'].apply(extract_features_prompt).apply(pd.Series)
df_test = pd.concat([df_test, df_test_features,df_test_features_prompt], axis=1)

In [12]:
misspell_mapping = {
    'studentdesigned': 'student designed',
    'teacherdesigned': 'teacher designed',
    'genericname': 'generic name',
    'winnertakeall': 'winner take all',
    'studentname': 'student name',
    'driveless': 'driverless',
    'teachername': 'teacher name',
    'propername': 'proper name',
    'bestlaid': 'best laid',
    'genericschool': 'generic school',
    'schoolname': 'school name',
    'winnertakesall': 'winner take all',
    'elctoral': 'electoral',
    'eletoral': 'electoral',
    'genericcity': 'generic city',
    'elctors': 'electoral',
    'venuse': 'venue',
    'blimplike': 'blimp like',
    'selfdriving': 'self driving',
    'electorals': 'electoral',
    'nearrecord': 'near record',
    'egyptianstyle': 'egyptian style',
    'oddnumbered': 'odd numbered',
    'carintensive': 'car intensive',
    'elecoral': 'electoral',
    'oction': 'auction',
    'electroal': 'electoral',
    'evennumbered': 'even numbered',
    'mesalandforms': 'mesa landforms',
    'electoralvote': 'electoral vote',
    'relativename': 'relative name',
    '22euro': 'twenty two euro',
    'ellectoral': 'electoral',
    'thirtyplus': 'thirty plus',
    'collegewon': 'college won',
    'hisher': 'higher',
    'teacherbased': 'teacher based',
    'computeranimated': 'computer animated',
    'canadidate': 'candidate',
    'studentbased': 'student based',
    'gorethanks': 'gore thanks',
    'clouddraped': 'cloud draped',
    'edgarsnyder': 'edgar snyder',
    'emotionrecognition': 'emotion recognition',
    'landfrom': 'land form',
    'fivedays': 'five days',
    'electoal': 'electoral',
    'lanform': 'land form',
    'electral': 'electoral',
    'presidentbut': 'president but',
    'teacherassigned': 'teacher assigned',
    'beacuas': 'because',
    'positionestimating': 'position estimating',
    'selfeducation': 'self education',
    'diverless': 'driverless',
    'computerdriven': 'computer driven',
    'outofcontrol': 'out of control',
    'faultthe': 'fault the',
    'unfairoutdated': 'unfair outdated',
    'aviods': 'avoid',
    'momdad': 'mom dad',
    'statesbig': 'states big',
    'presidentswing': 'president swing',
    'inconclusion': 'in conclusion',
    'handsonlearning': 'hands on learning',
    'electroral': 'electoral',
    'carowner': 'car owner',
    'elecotral': 'electoral',
    'studentassigned': 'student assigned',
    'collegefive': 'college five',
    'presidant': 'president',
    'unfairoutdatedand': 'unfair outdated and',
    'nixonjimmy': 'nixon jimmy',
    'canadates': 'candidate',
    'tabletennis': 'table tennis',
    'himher': 'him her',
    'studentsummerpacketdesigners': 'student summer packet designers',
    'studentdesign': 'student designed',
    'limting': 'limiting',
    'electrol': 'electoral',
    'campaignto': 'campaign to',
    'presendent': 'president',
    'thezebra': 'the zebra',
    'landformation': 'land formation',
    'eyetoeye': 'eye to eye',
    'selfreliance': 'self reliance',
    'studentdriven': 'student driven',
    'winnertake': 'winner take',
    'alliens': 'aliens',
    '2000but': '2000 but',
    'electionto': 'election to',
    'candidatesas': 'candidates as',
    'electers': 'electoral',
    'winnertakes': 'winner takes',
    'isfeet': 'is feet',
    'incar': 'incur',
    'wellconstructed': 'well constructed',
    'craftsmenwomen': 'crafts men women',
    'freelunch': 'free lunch',
    'twothousandrevolutions': 'two thousand revolutions',
    'ushistoryorg': 'us history org',
    'pharohs': 'pharaohs',
    'whitehot': 'white hot',
    'vizers': 'visors',
    'mrjones': 'mr jones',
    'aminute': 'a minute',
    'spoiledmeat': 'spoiled meat',
    'farmersgave': 'farmers gave',
    'spolied': 'spoiled',
    'tradgey': 'tragedy',
    'pyrimid': 'pyramid',
    'pyrimad': 'pyramid',
    'egyptiansfrom': 'egyptians from',
    'harvestthats': 'harvest that',
    'expierment': 'experiment',
    'jestthat': 'jest that',
    'twothousandrevolutionsaminute': 'two thousand revolutions a minute',
    'expirament': 'experiment',
    'nonspoiled': 'non spoiled',
    'egyptains': 'egyptians',
    'tragedys': 'tragedy',
    'pyrmaid': 'pyramid',
    'expirment': 'experiment',
    'whiteit': 'grade there',
    'gradethere': 'tragedy',
    'goverement': 'government',
    'godsthe': 'gods the',
    'paraoh': 'pharaoh',
    'classesupper': 'classes upper',
    'pharoes': 'pharaohs',
    'noblespriests': 'noble priests',
    'farmersslaves': 'farmers slaves',
    'harvestâ€”thatâ€™s': 'harvest that',
    'tradedy': 'tragedy',
    'paraohs': 'pharaohs',
    'paragrapgh': 'paragraph',
    'expieriment': 'experiment',
    'tragdey': 'tragedy',
    'pyramaid': 'pyramid',
    'pyrmid': 'pyramid',
    'prists': 'priests',
    'pharoas': 'pharaohs',
    'priets': 'priests',
    'pharoph': 'pharaohs',
    'pharaoah': 'pharaohs',
    'pharahos': 'pharaohs',
    'pharaohthe': 'pharaohs'
}

In [13]:
def decontraction(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"he's", "he is", phrase)
    phrase = re.sub(r"there's", "there is", phrase)
    phrase = re.sub(r"We're", "We are", phrase)
    phrase = re.sub(r"That's", "That is", phrase)
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"they're", "they are", phrase)
    phrase = re.sub(r"Can't", "Cannot", phrase)
    phrase = re.sub(r"wasn't", "was not", phrase)
    phrase = re.sub(r"don\x89Ûªt", "do not", phrase)
    phrase = re.sub(r"donãât", "do not", phrase)
    phrase = re.sub(r"aren't", "are not", phrase)
    phrase = re.sub(r"isn't", "is not", phrase)
    phrase = re.sub(r"What's", "What is", phrase)
    phrase = re.sub(r"haven't", "have not", phrase)
    phrase = re.sub(r"hasn't", "has not", phrase)
    phrase = re.sub(r"There's", "There is", phrase)
    phrase = re.sub(r"He's", "He is", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"You're", "You are", phrase)
    phrase = re.sub(r"I'M", "I am", phrase)
    phrase = re.sub(r"shouldn't", "should not", phrase)
    phrase = re.sub(r"wouldn't", "would not", phrase)
    phrase = re.sub(r"i'm", "I am", phrase)
    phrase = re.sub(r"I\x89Ûªm", "I am", phrase)
    phrase = re.sub(r"I'm", "I am", phrase)
    phrase = re.sub(r"Isn't", "is not", phrase)
    phrase = re.sub(r"Here's", "Here is", phrase)
    phrase = re.sub(r"you've", "you have", phrase)
    phrase = re.sub(r"you\x89Ûªve", "you have", phrase)
    phrase = re.sub(r"we're", "we are", phrase)
    phrase = re.sub(r"what's", "what is", phrase)
    phrase = re.sub(r"couldn't", "could not", phrase)
    phrase = re.sub(r"we've", "we have", phrase)
    phrase = re.sub(r"it\x89Ûªs", "it is", phrase)
    phrase = re.sub(r"doesn\x89Ûªt", "does not", phrase)
    phrase = re.sub(r"It\x89Ûªs", "It is", phrase)
    phrase = re.sub(r"Here\x89Ûªs", "Here is", phrase)
    phrase = re.sub(r"who's", "who is", phrase)
    phrase = re.sub(r"I\x89Ûªve", "I have", phrase)
    phrase = re.sub(r"y'all", "you all", phrase)
    phrase = re.sub(r"can\x89Ûªt", "cannot", phrase)
    phrase = re.sub(r"would've", "would have", phrase)
    phrase = re.sub(r"it'll", "it will", phrase)
    phrase = re.sub(r"we'll", "we will", phrase)
    phrase = re.sub(r"wouldn\x89Ûªt", "would not", phrase)
    phrase = re.sub(r"We've", "We have", phrase)
    phrase = re.sub(r"he'll", "he will", phrase)
    phrase = re.sub(r"Y'all", "You all", phrase)
    phrase = re.sub(r"Weren't", "Were not", phrase)
    phrase = re.sub(r"Didn't", "Did not", phrase)
    phrase = re.sub(r"they'll", "they will", phrase)
    phrase = re.sub(r"they'd", "they would", phrase)
    phrase = re.sub(r"DON'T", "DO NOT", phrase)
    phrase = re.sub(r"That\x89Ûªs", "That is", phrase)
    phrase = re.sub(r"they've", "they have", phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"should've", "should have", phrase)
    phrase = re.sub(r"You\x89Ûªre", "You are", phrase)
    phrase = re.sub(r"where's", "where is", phrase)
    phrase = re.sub(r"Don\x89Ûªt", "Do not", phrase)
    phrase = re.sub(r"we'd", "we would", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"weren't", "were not", phrase)
    phrase = re.sub(r"They're", "They are", phrase)
    phrase = re.sub(r"Can\x89Ûªt", "Cannot", phrase)
    phrase = re.sub(r"you\x89Ûªll", "you will", phrase)
    phrase = re.sub(r"I\x89Ûªd", "I would", phrase)
    phrase = re.sub(r"let's", "let us", phrase)
    phrase = re.sub(r"it's", "it is", phrase)
    phrase = re.sub(r"can't", "cannot", phrase)
    phrase = re.sub(r"don't", "do not", phrase)
    phrase = re.sub(r"you're", "you are", phrase)
    phrase = re.sub(r"i've", "I have", phrase)
    phrase = re.sub(r"that's", "that is", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"doesn't", "does not",phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"didn't", "did not", phrase)
    phrase = re.sub(r"ain't", "am not", phrase)
    phrase = re.sub(r"you'll", "you will", phrase)
    phrase = re.sub(r"I've", "I have", phrase)
    phrase = re.sub(r"Don't", "do not", phrase)
    phrase = re.sub(r"I'll", "I will", phrase)
    phrase = re.sub(r"I'd", "I would", phrase)
    phrase = re.sub(r"Let's", "Let us", phrase)
    phrase = re.sub(r"you'd", "You would", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"Ain't", "am not", phrase)
    phrase = re.sub(r"Haven't", "Have not", phrase)
    phrase = re.sub(r"Could've", "Could have", phrase)
    phrase = re.sub(r"youve", "you have", phrase)  
    phrase = re.sub(r"donå«t", "do not", phrase)
    return phrase

In [14]:
def clean_text(text):
    text = decontraction(text)
    text = text.lower()
    text = re.sub(r'[^\w\s]','',text, re.UNICODE)
    return text

def remove_punctuations(text):
    for punctuation in list(string.punctuation):
        text = text.replace(punctuation, '')
    return text

def clean_number(text):
    text = re.sub(r'(\d+)([a-zA-Z])', '\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', '\g<1>\g<2> ', text)
    text = re.sub(r'(\d+),(\d+)', '\g<1>\g<2>', text)
    return text

def clean_misspell(text):
    for bad_word in misspell_mapping:
        if bad_word in text:
            text = text.replace(bad_word, misspell_mapping[bad_word])
    return text

In [15]:
def pos_count(sent):
    nn_count = 0   #Noun
    pr_count = 0   #Pronoun
    vb_count = 0   #Verb
    jj_count = 0   #Adjective
    uh_count = 0   #Interjection
    cd_count = 0   #Numerics
    
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)

    for token in sent:
        if token[1] in ['NN','NNP','NNS']:
            nn_count += 1

        if token[1] in ['PRP','PRP$']:
            pr_count += 1

        if token[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']:
            vb_count += 1

        if token[1] in ['JJ','JJR','JJS']:
            jj_count += 1

        if token[1] in ['UH']:
            uh_count += 1

        if token[1] in ['CD']:
            cd_count += 1
    
    return pd.Series([nn_count, pr_count, vb_count, jj_count, uh_count, cd_count])

In [16]:
def word_overlap_count(row):
    def check_is_stop_word(word):
        return word in difficult_words
    
    prompt_words = row['prompt_tokens']
    summary_words = row['summary_tokens']
    
    if difficult_words:
        prompt_words = list(filter(check_is_stop_word, prompt_words))
        summary_words = list(filter(check_is_stop_word, summary_words))
    
    return len(set(prompt_words).intersection(set(summary_words)))

def ngrams(token, n):
    ngrams = zip(*[token[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


def ngram_co_occurrence(row, n):
    original_tokens = row['prompt_tokens']
    summary_tokens = row['summary_tokens']

    original_ngrams = set(ngrams(original_tokens, n))
    summary_ngrams = set(ngrams(summary_tokens, n))

    common_ngrams = original_ngrams.intersection(summary_ngrams)
    return len(common_ngrams)


def quotes_count(row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

In [17]:
def text_preprocess(data):
    data["prompt_length"] = data["prompt_text"].progress_apply(lambda x: len(word_tokenize(x)))
    data["prompt_tokens"] = data["prompt_text"].progress_apply(lambda x: word_tokenize(x))
    data["summary_length"] = data["text"].progress_apply(lambda x: len(word_tokenize(x)))
    data["summary_tokens"] = data["text"].progress_apply(lambda x: word_tokenize(x))
    
    df = data
    df['length_ratio'] = df['summary_length'] / df['prompt_length']
    df['word_overlap_count'] = df.progress_apply(word_overlap_count, axis=1)
    df['bigram_overlap_count'] = df.progress_apply(ngram_co_occurrence, args=(2,), axis=1)
    df['bigram_overlap_ratio'] = df['bigram_overlap_count'] / (df['summary_length'] - 1)
    df['trigram_overlap_count'] = df.progress_apply(ngram_co_occurrence, args=(3,), axis=1)
    df['trigram_overlap_ratio'] = df['trigram_overlap_count'] / (df['summary_length'] - 2)
    df['quotes_count'] = df.progress_apply(quotes_count, axis=1)
    return df.drop(columns=["summary_tokens", "prompt_tokens"])

In [18]:
df_train = text_preprocess(df_train)

100%|██████████| 7165/7165 [00:29<00:00, 239.37it/s]
100%|██████████| 7165/7165 [00:30<00:00, 234.46it/s]
100%|██████████| 7165/7165 [00:03<00:00, 2032.43it/s]
100%|██████████| 7165/7165 [00:03<00:00, 2017.39it/s]
100%|██████████| 7165/7165 [00:00<00:00, 8886.10it/s]
100%|██████████| 7165/7165 [00:01<00:00, 4961.55it/s]
100%|██████████| 7165/7165 [00:01<00:00, 4401.17it/s]
100%|██████████| 7165/7165 [00:00<00:00, 78257.03it/s]


In [19]:
df_train["num_words"] = df_train["text"].progress_apply(lambda x: len(str(x).split()))
df_train["num_unique_words"] = df_train["text"].progress_apply(lambda x: len(set(str(x).split())))
df_train["num_chars"] = df_train["text"].progress_apply(lambda x: len(str(x)))
df_train["num_stopwords"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
df_train["num_punctuations"] =df_train['text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
df_train["num_words_upper"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
df_train["num_words_title"] = df_train["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df_train["mean_word_len"] = df_train["text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_train["num_paragraphs"] = df_train["text"].progress_apply(lambda x: len(x.split('\n')))
df_train["num_sentences"] = df_train["text"].progress_apply(lambda x: len(str(x).split('.')))
df_train["polarity"] = df_train['text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
df_train["subjectivity"] = df_train['text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
df_train[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = df_train['text'].progress_apply(pos_count)

100%|██████████| 7165/7165 [00:00<00:00, 200115.79it/s]
100%|██████████| 7165/7165 [00:00<00:00, 96596.66it/s]
100%|██████████| 7165/7165 [00:00<00:00, 749188.25it/s]
100%|██████████| 7165/7165 [00:52<00:00, 136.94it/s]
100%|██████████| 7165/7165 [00:02<00:00, 2886.62it/s]
100%|██████████| 7165/7165 [00:00<00:00, 117967.84it/s]
100%|██████████| 7165/7165 [00:00<00:00, 111030.94it/s]
100%|██████████| 7165/7165 [00:00<00:00, 45327.65it/s]
100%|██████████| 7165/7165 [00:00<00:00, 707260.08it/s]
100%|██████████| 7165/7165 [00:00<00:00, 589374.15it/s]
100%|██████████| 7165/7165 [00:03<00:00, 2195.47it/s]
100%|██████████| 7165/7165 [00:03<00:00, 2175.74it/s]
100%|██████████| 7165/7165 [00:24<00:00, 291.13it/s]


In [20]:
df_test = text_preprocess(df_test)

100%|██████████| 4/4 [00:00<00:00, 2785.99it/s]
100%|██████████| 4/4 [00:00<00:00, 2807.43it/s]
100%|██████████| 4/4 [00:00<00:00, 3723.31it/s]
100%|██████████| 4/4 [00:00<00:00, 4198.50it/s]
100%|██████████| 4/4 [00:00<00:00, 2259.86it/s]
100%|██████████| 4/4 [00:00<00:00, 2495.50it/s]
100%|██████████| 4/4 [00:00<00:00, 2515.70it/s]
100%|██████████| 4/4 [00:00<00:00, 2685.21it/s]


In [21]:
df_test["num_words"] = df_test["text"].progress_apply(lambda x: len(str(x).split()))
df_test["num_unique_words"] = df_test["text"].progress_apply(lambda x: len(set(str(x).split())))
df_test["num_chars"] = df_test["text"].progress_apply(lambda x: len(str(x)))
df_test["num_stopwords"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
df_test["num_punctuations"] =df_test['text'].progress_apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
df_test["num_words_upper"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
df_test["num_words_title"] = df_test["text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
df_test["mean_word_len"] = df_test["text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
df_test["num_paragraphs"] = df_test["text"].progress_apply(lambda x: len(x.split('\n')))
df_test["num_sentences"] = df_test["text"].progress_apply(lambda x: len(str(x).split('.')))
df_test["polarity"] = df_test['text'].progress_apply(lambda x: TextBlob(x).sentiment[0])
df_test["subjectivity"] = df_test['text'].progress_apply(lambda x: TextBlob(x).sentiment[1])
df_test[['nn_count','pr_count','vb_count','jj_count','uh_count','cd_count']] = df_test['text'].progress_apply(pos_count)

100%|██████████| 4/4 [00:00<00:00, 7570.95it/s]
100%|██████████| 4/4 [00:00<00:00, 9177.91it/s]
100%|██████████| 4/4 [00:00<00:00, 10414.16it/s]
100%|██████████| 4/4 [00:00<00:00, 1401.49it/s]
100%|██████████| 4/4 [00:00<00:00, 6705.52it/s]
100%|██████████| 4/4 [00:00<00:00, 10186.53it/s]
100%|██████████| 4/4 [00:00<00:00, 10305.42it/s]
100%|██████████| 4/4 [00:00<00:00, 7608.71it/s]
100%|██████████| 4/4 [00:00<00:00, 9393.74it/s]
100%|██████████| 4/4 [00:00<00:00, 10625.22it/s]
100%|██████████| 4/4 [00:00<00:00, 2564.15it/s]
100%|██████████| 4/4 [00:00<00:00, 4736.65it/s]
100%|██████████| 4/4 [00:00<00:00, 856.29it/s]


In [22]:
df_train['text_clean'] = df_train['text'].progress_apply(clean_text)
df_train['text_clean'] = df_train['text_clean'].progress_apply(remove_punctuations)
df_train['text_clean'] = df_train['text_clean'].progress_apply(clean_number)
df_train['text_clean'] = df_train['text_clean'].progress_apply(clean_misspell)

df_test['text_clean'] = df_test['text'].progress_apply(clean_text)
df_test['text_clean'] = df_test['text_clean'].progress_apply(remove_punctuations)
df_test['text_clean'] = df_test['text_clean'].progress_apply(clean_number)
df_test['text_clean'] = df_test['text_clean'].progress_apply(clean_misspell)

100%|██████████| 7165/7165 [00:00<00:00, 9143.16it/s]
100%|██████████| 7165/7165 [00:00<00:00, 82911.28it/s]
100%|██████████| 7165/7165 [00:00<00:00, 26218.27it/s]
100%|██████████| 7165/7165 [00:00<00:00, 18805.07it/s]
100%|██████████| 4/4 [00:00<00:00, 3095.43it/s]
100%|██████████| 4/4 [00:00<00:00, 6649.71it/s]
100%|██████████| 4/4 [00:00<00:00, 5504.34it/s]
100%|██████████| 4/4 [00:00<00:00, 8634.70it/s]


# Model Training
Training the model and monitoring the progress


In [23]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    content_score = mean_squared_error(content_true, content_pred)**(1/2)
    wording_score = mean_squared_error(wording_true, wording_pred)**(1/2)
    
    return (content_score + wording_score)/2

In [24]:
class CFG:
    model_name="mdeberta-v3-base"
    learning_rate=1.5e-5
    weight_decay=0.05 # Regularization 防止过拟合
    hidden_dropout_prob=0.007 # Dropout setting 随机失活的概率
    attention_probs_dropout_prob=0.007
    num_train_epochs= 5
    n_splits=4
    batch_size= 6
    random_seed=42
    save_steps=100 
    max_length=512 # Max length of imput 限制输入序列的长度

## Pretrained deberta base

In [25]:
class ContentScoreRegressor_pretrained: 
    def __init__(self, 
                model_dir: str,
                folder_dir: str, # deberta-large-pretrained/deberta-v3-large
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                model_name:str
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "text"]#"fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.folder_dir = folder_dir
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{folder_dir}/content/fold_0")
        self.model_config = AutoConfig.from_pretrained(f"/kaggle/input/{folder_dir}/content/fold_0")
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score"""
        
        sep = self.tokenizer.sep_token
        in_text = (
                    test_df["prompt_title"] + sep 
                    + test_df["prompt_question"] + sep 
                    + test_df["text"]
                  )
        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"/kaggle/input/{self.model_dir}")
        model_content.eval()
        
        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(f'/kaggle/input/deberta-v3-large/{self.target}/fold_', str(fold)) 

        test_args = TrainingArguments(
            output_dir='output123',
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [26]:
def validate_pretrained(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """predict oof data"""
    
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{model_name}/{target}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/{target}/fold_{fold}"
        
        csr = ContentScoreRegressor_pretrained(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            folder_dir = model_name,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        
        train_df.loc[valid_data.index, f"{target}_pred"] = pred

    return train_df
    
def predict_pretrained(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """predict using mean folds"""

    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        if save_each_model == True:
            model_dir =  f"{model_name}/{target}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/{target}/fold_{fold}"

        csr = ContentScoreRegressor_pretrained(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            folder_dir = model_name,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        test_df[f"{target}_pred_{fold}"] = pred
    
    test_df[f"{target}"] = test_df[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

    return test_df

In [27]:
import warnings
warnings.filterwarnings('ignore')

In [28]:
predict_res = {}
model_names = [
               'deberta-v3-base-tuned-v1/deberta-v3-base',
               'roberta-base-pretrained/roberta-base-model',
              'deberta-v3-large-8epoch-tuned/deberta-v3-large-8epoch-tuned'
#                 'deberta-v3-large-5epoch/deberta-v3-large-5epoch',
                ]

keep_fold = True
for model_name in model_names:
    model_name_short = model_name.split('/')[0]
    predict_res[model_name_short] = {}
    
    # init train_pretrained
    train_pretrained = df_train.copy()
    gkf = GroupKFold(n_splits=CFG.n_splits)

    for i, (_, val_index) in enumerate(gkf.split(train_pretrained, groups=train_pretrained["prompt_id"])):
        train_pretrained.loc[val_index, "fold"] = i

    # pred
    for target in ["content", "wording"]:
        print(model_name_short, target)
        train_pretrained = validate_pretrained(
            train_pretrained,
            target=target,
            model_name = model_name,
            save_each_model=False,
            hidden_dropout_prob=CFG.hidden_dropout_prob,
            attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
            max_length=CFG.max_length
        )

        rmse = mean_squared_error(train_pretrained[target], train_pretrained[f"{target}_pred"], squared=False)
        print(f"cv {target} rmse: {rmse}")

        test_pretrained = predict_pretrained(
            df_test,
            target=target,
            model_name = model_name,
            save_each_model=False,
            hidden_dropout_prob=CFG.hidden_dropout_prob,
            attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
            max_length=CFG.max_length
        )

    if keep_fold:
        train_pretrained = train_pretrained[['fold','content_pred', 'wording_pred']]
        keep_fold = False
    else:
        train_pretrained = train_pretrained[['content_pred', 'wording_pred']]
        
    train_pretrained = train_pretrained.rename(columns = {'content_pred':f'content_pred_{model_name_short}',
                                                          'wording_pred':f'wording_pred_{model_name_short}'})

    test_pretrained = test_pretrained[['content', 'wording']]
    test_pretrained = test_pretrained.rename(columns = {'content':f'content_pred_{model_name_short}',
                                                          'wording':f'wording_pred_{model_name_short}'})                                                          
    
    predict_res[model_name_short] = {'train': train_pretrained,
                            'test': test_pretrained}

deberta-v3-base-tuned-v1 content
fold 0:


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv content rmse: 0.4764878046668226
fold 0:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


deberta-v3-base-tuned-v1 wording
fold 0:


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv wording rmse: 0.6154249658446408
fold 0:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


roberta-base-pretrained content
fold 0:


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv content rmse: 0.484327347638863
fold 0:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


roberta-base-pretrained wording
fold 0:


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv wording rmse: 0.6335862205278374
fold 0:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


deberta-v3-large-8epoch-tuned content
fold 0:


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv content rmse: 0.4646820430211126
fold 0:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


deberta-v3-large-8epoch-tuned wording
fold 0:


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


cv wording rmse: 0.6409746540896686
fold 0:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


# add 2 output

In [29]:
class ContentScoreRegressor: 
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "text"]#"fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = target

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        model_fold_dir = os.path.join(f'/kaggle/input/{model_name}/fold_0') 
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_fold_dir)
        self.model_config = AutoConfig.from_pretrained(model_fold_dir)
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 2,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )


    def tokenize_function(self, examples: pd.DataFrame):
        labels = [examples['content'],examples['wording']]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
#     def train(self, 
#             fold: int,
#             train_df: pd.DataFrame,
#             valid_df: pd.DataFrame,
#             batch_size: int,
#             learning_rate: float,
#             weight_decay: float,
#             num_train_epochs: float,
#             save_steps: int,
#         ) -> None:
#         """fine-tuning"""
        
#         sep = self.tokenizer.sep_token
#         train_df[self.input_col] = (
#                     train_df["prompt_title"] + sep 
#                     + train_df["prompt_question"] + sep 
#                     + train_df["text"]
#                   )

#         valid_df[self.input_col] = (
#                     valid_df["prompt_title"] + sep 
#                     + valid_df["prompt_question"] + sep 
#                     + valid_df["text"]
#                   )
        
#         train_df = train_df[[self.input_col] + self.target_cols]
#         valid_df = valid_df[[self.input_col] + self.target_cols]
        
#         model_content = AutoModelForSequenceClassification.from_pretrained(
#             f"microsoft/deberta-v3-large", 
#             config=self.model_config
#         )

#         train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
#         val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
#         train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
#         val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

#         # eg. "bert/fold_0/"
#         model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
#         training_args = TrainingArguments(
#             output_dir=model_fold_dir,
#             load_best_model_at_end=True, # select best model
#             learning_rate=learning_rate,
#             per_device_train_batch_size=batch_size,
#             per_device_eval_batch_size=4,
#             num_train_epochs=num_train_epochs,
#             weight_decay=weight_decay,
#             report_to='none',
#             greater_is_better=False,
#             save_strategy="steps",
#             evaluation_strategy="steps",
#             eval_steps=save_steps,
#             save_steps=save_steps,
#             metric_for_best_model="rmse",
#             save_total_limit=1,
#         )

#         trainer = Trainer(
#             model=model_content,
#             args=training_args,
#             train_dataset=train_tokenized_datasets,
#             eval_dataset=val_tokenized_datasets,
#             tokenizer=self.tokenizer,
#             compute_metrics=compute_metrics,
#             data_collator=self.data_collator
#         )

#         trainer.train()
        
#         model_content.save_pretrained(self.model_dir)
#         self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score"""
        
        sep = self.tokenizer.sep_token
        in_text = (
                    test_df["prompt_title"] + sep 
                    + test_df["prompt_question"] + sep 
                    + test_df["text"]
                  )
        test_df[self.input_col] = in_text

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 

        test_args = TrainingArguments(
            output_dir='output',
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)#[0]

        return preds
    
def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """predict oof data"""
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"/kaggle/input/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"/kaggle/input/{model_name}/fold_{fold}"
        
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        
        train_df.loc[valid_data.index, f"content_pred"] = [x[0] for x in pred.predictions]
        train_df.loc[valid_data.index, f"wording_pred"] = [x[1] for x in pred.predictions]

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """predict using mean folds"""

    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")
        
        model_dir =  f"/kaggle/input/{model_name}/fold_{fold}"


        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        test_df[f"content_pred_{fold}"] = [x[0] for x in pred.predictions]
        test_df[f"wording_pred_{fold}"] = [x[1] for x in pred.predictions]
    
    test_df[f"content"] = test_df[[f"content_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)
    test_df[f"wording"] = test_df[[f"wording_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

    return test_df

In [30]:
model_names = [
               'deberta-v3-large-2output-v1/deberta-v3-large-2output-v1'
                ]

keep_fold = True
target = ["content", "wording"]

    
for model_name in model_names:
    model_name_short = model_name.split('/')[0]
    predict_res[model_name_short] = {}
    
    # init train_pretrained
    train_pretrained = df_train.copy()
    gkf = GroupKFold(n_splits=CFG.n_splits)

    for i, (_, val_index) in enumerate(gkf.split(train_pretrained, groups=train_pretrained["prompt_id"])):
        train_pretrained.loc[val_index, "fold"] = i

    # pred
    target = ["content", "wording"]
    train_pretrained = validate(
        train_pretrained,
        target=target,
        model_name = model_name,
        save_each_model=False,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )


    test_pretrained = predict(
        df_test,
        target=target,
        model_name = model_name,
        save_each_model=False,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    train_pretrained = train_pretrained[['content_pred', 'wording_pred']]
        
    train_pretrained = train_pretrained.rename(columns = {'content_pred':f'content_pred_{model_name_short}',
                                                          'wording_pred':f'wording_pred_{model_name_short}'})

    test_pretrained = test_pretrained[['content', 'wording']]
    test_pretrained = test_pretrained.rename(columns = {'content':f'content_pred_{model_name_short}',
                                                          'wording':f'wording_pred_{model_name_short}'})                                                          
    
    predict_res[model_name_short] = {'train': train_pretrained,
                            'test': test_pretrained}

fold 0:


  0%|          | 0/2057 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/2009 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/1996 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/1103 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 0:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 1:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 2:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


fold 3:


  0%|          | 0/4 [00:00<?, ?ex/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Starting of tree model

In [31]:
predict_res.keys()

dict_keys(['deberta-v3-base-tuned-v1', 'roberta-base-pretrained', 'deberta-v3-large-8epoch-tuned', 'deberta-v3-large-2output-v1'])

In [32]:
df_concat_ls = [df_train] + [predict_res[model_name_short]['train'] for model_name_short in predict_res.keys()]
train_copy = pd.concat(df_concat_ls, axis = 1)


targets = ["content", "wording"]
drop_columns = ["fold", "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 'text_clean',
                "prompt_text"
               ] + targets


## LightGBM

In [33]:
import lightgbm as lgb

In [34]:
model_dict = {}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train_copy[train_copy["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train_copy[train_copy["fold"] != fold][target]

        X_eval_cv = train_copy[train_copy["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train_copy[train_copy["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        params = {
            'boosting_type': 'gbdt',
            'random_state': 42,
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.01,
                  }

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        models.append(model)
    
    model_dict[target] = models

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8260
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 69
[LightGBM] [Info] Start training from score 0.017606
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.48083
[200]	train's rmse: 0.399366
Early stopping, best iteration is:
[265]	train's rmse: 0.395497
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8080
[LightGBM] [Info] Number of data points in the train set: 5156, number of used features: 66
[LightGBM] [Info] Start training from score -0.039959
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.631178
[200]	train's rmse: 0.522546
[300]	train's rmse: 0.496312
[400]	train's rmse: 0.489166
[500]	train's rmse: 0.483401
[600]	tra

In [35]:
# cv
rmses = []
pred_dict_lgb = {}
for target in targets:
    models = model_dict[target]

    preds_lgb = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train_copy[train_copy["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train_copy[train_copy["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds_lgb.extend(pred)
    pred_dict_lgb[target] = preds_lgb
    rmse = np.sqrt(mean_squared_error(trues, preds_lgb))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"LightGBM mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.44162148944922036
wording_rmse : 0.5619541597902915
LightGBM mcrmse : 0.5017878246197559


## XGBoost

In [36]:
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

In [37]:
model_dict_xgboost = {}
for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train_copy[train_copy["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train_copy[train_copy["fold"] != fold][target]

        X_eval_cv = train_copy[train_copy["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train_copy[train_copy["fold"] == fold][target]


        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'learning_rate': 0.01,
            'max_depth': 4,
            'n_estimators': 10000,
            'early_stopping_rounds': 30,
            'random_state': 42,
        }
        
        model = XGBRegressor(**params)
        model.fit(X_train_cv, 
                  y_train_cv, 
                  eval_set=[(X_eval_cv, y_eval_cv)],
                  verbose=False)
        models.append(model)
    
    model_dict_xgboost[target] = models

In [38]:
# cv
rmses = []
pred_dict_xgb = {}
for target in targets:
    models = model_dict_xgboost[target]

    preds_xgb = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train_copy[train_copy["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train_copy[train_copy["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds_xgb.extend(pred)
    pred_dict_xgb[target] = preds_xgb
    rmse = np.sqrt(mean_squared_error(trues, preds_xgb))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"XGBoost mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.4387371706861626
wording_rmse : 0.5580276758478343
XGBoost mcrmse : 0.49838242326699844


## CatBoost

In [39]:
from catboost import CatBoostRegressor

In [40]:
model_dict_catboost = {}

content_hp = {
                "n_estimators": 10000,  

                "learning_rate": 0.01,
                "subsample": 0.7,
                "max_depth": 4,
                "min_data_in_leaf": 30,
                "rsm": 0.7,

                "early_stopping_rounds": 200,  
                "verbose": 200,
                "random_state": 42}


wording_hp = {
                "n_estimators": 10000,  

                "learning_rate": 0.01,
                "subsample": 0.6,
                "max_depth": 3,
                "min_data_in_leaf": 60,

                "early_stopping_rounds": 400, 
                "verbose": 300,
                "random_state": 42}

hps = [content_hp, wording_hp]

for target, hp in zip(targets, hps):
    models = []
    
    for fold in range(CFG.n_splits):

        X_train_cv = train_copy[train_copy["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train_copy[train_copy["fold"] != fold][target]

        X_eval_cv = train_copy[train_copy["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train_copy[train_copy["fold"] == fold][target]

        model = CatBoostRegressor(**hp)
        model.fit(X_train_cv, 
                  y_train_cv, 
                  eval_set=[(X_eval_cv, y_eval_cv)],
                  verbose=False)
        models.append(model)
    
    model_dict_catboost[target] = models

In [41]:
# cv
rmses = []
pred_dict_cat = {}
tues_dict = {}
for target in targets:
    models = model_dict_catboost[target]

    preds_cat = []
    trues = []
    
    for fold, model in enumerate(models):
        X_eval_cv = train_copy[train_copy["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train_copy[train_copy["fold"] == fold][target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds_cat.extend(pred)
    pred_dict_cat[target] = preds_cat
    tues_dict[target] = trues
    rmse = np.sqrt(mean_squared_error(trues, preds_cat))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"CatBoost mcrmse : {sum(rmses) / len(rmses)}")

content_rmse : 0.43190753613172994
wording_rmse : 0.5556929405113765
CatBoost mcrmse : 0.49380023832155323


## Weight searching

In [42]:
from scipy.optimize import minimize

In [43]:
predictions = {'content':[pred_dict_lgb['content'],pred_dict_xgb['content'],pred_dict_cat['content']],
               'wording':[pred_dict_lgb['wording'],pred_dict_xgb['wording'],pred_dict_cat['wording']]}

def ensemble_loss_func(weights):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = [0] * len(tues_dict['content'])
    rmses = []
    for target in targets:   
        final_prediction = [0] * len(tues_dict['content'])
        for weight, prediction in zip(weights, predictions[target]):
                tmp_prediction = [weight * x for x in prediction]
                final_prediction =  [ final_prediction[x] + tmp_prediction[x] for x in range (len (final_prediction))]  
        rmse = np.sqrt(mean_squared_error(tues_dict[target], final_prediction))
#         print(rmse)
        rmses = rmses + [rmse]
    return sum(rmses) / len(rmses)

In [44]:
starting_values = [1/3]*len(predictions['content'])

cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions['content'])

res = minimize(ensemble_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

In [45]:
print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))

Ensamble Score: 0.4924953645481993
Best Weights: [0.06980423 0.2453128  0.68488297]


# Prediction
Making predictions on the test data  

In [46]:

df_concat_ls = [df_test] + [predict_res[model_name_short]['test'] for model_name_short in predict_res.keys()]
test = pd.concat(df_concat_ls, axis = 1)


In [47]:
drop_columns = [
                "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 
                "prompt_text",'text_clean',
                "input", 'content', 'wording'
               ] + [
                f"content_pred_{i}" for i in range(CFG.n_splits)
                ] + [
                f"wording_pred_{i}" for i in range(CFG.n_splits)
                ]

In [48]:
test.drop(columns=drop_columns)

Unnamed: 0,num_words,avg_sentence_length,num_difficult_words,num_unfreq_words_500,num_unfreq_words_1000,num_unfreq_words_5000,num_unfreq_words_10000,num_unfreq_words_20000,num_unfreq_words_50000,num_unfreq_words_100000,...,uh_count,cd_count,content_pred_deberta-v3-base-tuned-v1,wording_pred_deberta-v3-base-tuned-v1,content_pred_roberta-base-pretrained,wording_pred_roberta-base-pretrained,content_pred_deberta-v3-large-8epoch-tuned,wording_pred_deberta-v3-large-8epoch-tuned,content_pred_deberta-v3-large-2output-v1,wording_pred_deberta-v3-large-2output-v1
0,3,3.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0,1,-1.521705,-1.54632,-1.167308,-1.124073,-1.556885,-1.436425,-1.576154,-1.34045
1,3,3.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0,1,-1.529874,-1.549486,-1.148956,-1.122546,-1.501076,-1.393221,-1.593781,-1.344372
2,3,3.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0,1,-1.522412,-1.550916,-1.153747,-1.115592,-1.558781,-1.431591,-1.582225,-1.34049
3,3,3.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,...,0,1,-1.540222,-1.560823,-1.155313,-1.122723,-1.522045,-1.406734,-1.587772,-1.346653


In [49]:
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)
        X_eval_cv = X_eval_cv.rename({'content': "content_pred", 'wording': "wording_pred"}, axis='columns')
        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict[target] = preds

In [50]:
pred_dict_xgb = {}
for target in targets:
    models = model_dict_xgboost[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)
        X_eval_cv = X_eval_cv.rename({'content': "content_pred", 'wording': "wording_pred"}, axis='columns')
        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict_xgb[target] = preds
    
pred_dict_cat = {}
for target in targets:
    models = model_dict_catboost[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)
        X_eval_cv = X_eval_cv.rename({'content': "content_pred", 'wording': "wording_pred"}, axis='columns')
        pred = model.predict(X_eval_cv)
        preds.append(pred)
    
    pred_dict_cat[target] = preds

In [51]:
pred_dict_final = {}
for target in targets:
    lgb_out = pred_dict[target]
    xgb_out = pred_dict_xgb[target]
    cat_out = pred_dict_cat[target]
    out = []
    for i in range(len(lgb_out)):
        out.append(lgb_out[i]*res['x'][0] + xgb_out[i]*res['x'][1]+ cat_out[i]*res['x'][2])
    pred_dict_final[target] = out

In [52]:
for target in targets:
    preds = pred_dict_final[target]
    for i, pred in enumerate(preds):
        test[f"{target}_pred_{i}"] = pred

    test[target] = test[[f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]].mean(axis=1)

# Creating the Submission File
reating the file for submission


In [53]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)