# <div style="color:white;display:fill;border-radius:5px;background-color:steelBlue;text-align:center;letter-spacing:0.1px;overflow:hidden;padding:20px;color:white;overflow:hidden;margin:0;font-size:100%">Submission 1 (DeBERTa baseline)</div>

- Ref: https://www.kaggle.com/code/idv2005/deberta-baseline-inference

In [None]:
  import numpy as np 
import pandas as pd 
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import gc
import torch
import re
import copy
import polars as pl
import lightgbm as lgb
from tqdm.auto import tqdm, trange
from lightgbm import log_evaluation, early_stopping
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import cohen_kappa_score, accuracy_score

In [None]:
# Configuration for the transformer inference stage.
# CSV with the essays to score; read below with pandas.
TEST_DATA_PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"
# Upper bound on tokens per essay; longer inputs are truncated in `tokenize`.
MAX_LENGTH = 1024
# Directory holding the fine-tuned checkpoint; both tokenizer and model are loaded from it.
MODEL_PATH = '/kaggle/input/es-deberta-large-fold0'
# Per-device batch size handed to TrainingArguments for prediction.
EVAL_BATCH_SIZE = 1
# Module-level tokenizer shared by `tokenize` and the data collator.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

def tokenize(sample):
    """Encode one essay record with the module-level tokenizer.

    Args:
        sample: mapping with a 'full_text' entry holding the essay text.

    Returns:
        The tokenizer output for that text, truncated to at most
        MAX_LENGTH tokens.
    """
    essay_text = sample['full_text']
    encoded = tokenizer(essay_text, truncation=True, max_length=MAX_LENGTH)
    return encoded

# Read the test essays, tokenize each row, and keep only the tokenizer outputs.
df_test = pd.read_csv(TEST_DATA_PATH)
tokenized_test = Dataset.from_pandas(df_test).map(tokenize)
# The raw id/text columns are dropped before prediction; the ids stay in df_test.
ds = tokenized_test.remove_columns(['essay_id', 'full_text'])

In [None]:
class DataCollator:
    """Callable that turns a list of tokenized essays into one padded batch.

    Relies on the module-level `tokenizer` and `MAX_LENGTH`.
    """

    def __call__(self, features):
        """Pad `features` into a batch of PyTorch tensors.

        Args:
            features: iterable of mappings, each providing 'input_ids' and
                'attention_mask'. Any other keys are ignored.

        Returns:
            The result of `tokenizer.pad` on the selected fields
            (return_tensors='pt', pad_to_multiple_of=16).
        """
        wanted_keys = ('input_ids', 'attention_mask')
        model_inputs = []
        for feature in features:
            # Forward only the two fields the model needs.
            model_inputs.append({key: feature[key] for key in wanted_keys})

        return tokenizer.pad(
            model_inputs,
            padding=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
            pad_to_multiple_of=16,
        )

In [None]:
# Run the fine-tuned sequence-classification model over the tokenized test set.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
collator = DataCollator()
args = TrainingArguments(".", per_device_eval_batch_size=EVAL_BATCH_SIZE, report_to="none")
trainer = Trainer(model=model, args=args, data_collator=collator, tokenizer=tokenizer)
# Raw model outputs; the next cell takes the argmax over the last axis.
predictions = trainer.predict(ds).predictions

# Release the model before the LightGBM stage. Collect garbage first so that
# objects kept alive only by reference cycles are actually freed, and only then
# ask the CUDA caching allocator to return its now-unused blocks.
del model, trainer
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Pick the highest-scoring class per essay and shift it by one
# (presumably class k maps to essay score k + 1 — confirm against training).
preds = predictions.argmax(axis=-1) + 1
df_test['score'] = preds
# Persist only the id/score pair; the ensemble cell reads this file back.
submission_1 = df_test[['essay_id', 'score']]
submission_1.to_csv('submission_1.csv', index=False)
df_test.head(3)

# <div style="color:white;display:fill;border-radius:5px;background-color:steelBlue;text-align:center;letter-spacing:0.1px;overflow:hidden;padding:20px;color:white;overflow:hidden;margin:0;font-size:100%">Submission 2 (TF-IDF + LGBM baseline)</div>
- Ref: https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline

In [None]:
# Extra column added on load: each essay split into paragraphs on blank lines.
columns = [pl.col("full_text").str.split(by="\n\n").alias("paragraph")]
PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
# Both splits are read the same way, so load them with one expression.
train, test = (
    pl.read_csv(PATH + file_name).with_columns(columns)
    for file_name in ("train.csv", "test.csv")
)

In [None]:
def removeHTML(x):
    """Return `x` with every `<...>` span (non-greedy) removed."""
    return re.sub(r'<.*?>', '', x)

def dataPreprocessing(x):
    """Normalise a piece of essay text before feature extraction.

    Steps, in order: lowercase; drop `<...>` tags; drop `@word` mentions;
    drop digits (including a leading apostrophe, e.g. "'90"); drop `http`
    followed by word characters; collapse whitespace runs to one space;
    collapse repeated '.' and ',' to a single character; strip the ends.

    Args:
        x: the text to clean (str).

    Returns:
        The cleaned string.
    """
    x = x.lower()
    x = removeHTML(x)
    # Raw strings: "\w" / "\d" in a normal literal are invalid escape
    # sequences and trigger warnings on recent Python versions.
    x = re.sub(r"@\w+", '', x)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Only matches "http" directly followed by word characters, so a URL such
    # as "http://..." is left in place (same as before this edit).
    x = re.sub(r"http\w+", '', x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x

In [None]:
# paragraph feature
def Paragraph_Preprocess(tmp):
    """Expand essays to one row per paragraph and add per-paragraph sizes.

    Args:
        tmp: polars frame with a list-typed 'paragraph' column.

    Returns:
        Frame with one row per paragraph, the paragraph text cleaned with
        `dataPreprocessing`, plus 'paragraph_len' (characters),
        'paragraph_sentence_cnt' (pieces after splitting on '.') and
        'paragraph_word_cnt' (pieces after splitting on ' ').
    """
    paragraph = pl.col('paragraph')

    per_paragraph = tmp.explode('paragraph')
    # Clean the text first so every size below is measured on the cleaned form.
    per_paragraph = per_paragraph.with_columns(paragraph.map_elements(dataPreprocessing))
    per_paragraph = per_paragraph.with_columns(
        paragraph.map_elements(lambda text: len(text)).alias("paragraph_len")
    )
    return per_paragraph.with_columns(
        paragraph.map_elements(lambda text: len(text.split('.'))).alias("paragraph_sentence_cnt"),
        paragraph.map_elements(lambda text: len(text.split(' '))).alias("paragraph_word_cnt"),
    )
# feature_eng
paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
def Paragraph_Eng(train_tmp):
    """Aggregate per-paragraph rows into one feature row per essay.

    Args:
        train_tmp: output of `Paragraph_Preprocess` (one row per paragraph).

    Returns:
        pandas DataFrame sorted by 'essay_id' with paragraph-count features
        for several length thresholds and max/mean/min/first/last of each
        column named in `paragraph_fea`.
    """
    at_least = [50,75,100,125,150,175,200,250,300,350,400,500,600,700]
    at_most = [25,49]

    aggs = []
    # Number of paragraphs with at least `limit` characters.
    for limit in at_least:
        aggs.append(
            pl.col('paragraph').filter(pl.col('paragraph_len') >= limit).count().alias(f"paragraph_{limit}_cnt")
        )
    # Number of paragraphs with at most `limit` characters.
    for limit in at_most:
        aggs.append(
            pl.col('paragraph').filter(pl.col('paragraph_len') <= limit).count().alias(f"paragraph_{limit}_cnt")
        )
    # Summary statistics; the outer loop is over the statistic so the column
    # order stays max, mean, min, first, last.
    for stat in ('max', 'mean', 'min', 'first', 'last'):
        for fea in paragraph_fea:
            aggs.append(getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}"))

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return per_essay.sort("essay_id").to_pandas()
# Paragraph features for the training essays.
tmp = Paragraph_Preprocess(train)
train_feats = Paragraph_Eng(tmp)
# Paragraph_Eng returns rows sorted by essay_id, while `train` keeps the CSV
# order, so attach the label by key rather than by position.
score_by_essay = dict(zip(train['essay_id'], train['score']))
train_feats['score'] = train_feats['essay_id'].map(score_by_essay)
print('feature_num: ',len(train_feats.columns)-2)

In [None]:
# sentence feature
def Sentence_Preprocess(tmp):
    """Expand essays to one row per sentence and add per-sentence sizes.

    Args:
        tmp: polars frame with a 'full_text' column.

    Returns:
        Frame with one row per '.'-delimited piece of the cleaned text,
        keeping only pieces of at least 15 characters, with 'sentence_len'
        (characters) and 'sentence_word_cnt' (pieces after splitting on ' ').
    """
    sentence = pl.col('sentence')
    split_sentences = (
        pl.col('full_text').map_elements(dataPreprocessing).str.split(by=".").alias("sentence")
    )

    per_sentence = tmp.with_columns(split_sentences).explode('sentence')
    per_sentence = per_sentence.with_columns(
        sentence.map_elements(lambda text: len(text)).alias("sentence_len")
    )
    # Drop very short fragments before counting words.
    per_sentence = per_sentence.filter(pl.col('sentence_len') >= 15)
    return per_sentence.with_columns(
        sentence.map_elements(lambda text: len(text.split(' '))).alias("sentence_word_cnt")
    )
# feature_eng
sentence_fea = ['sentence_len','sentence_word_cnt']
def Sentence_Eng(train_tmp):
    """Aggregate per-sentence rows into one feature row per essay.

    Args:
        train_tmp: output of `Sentence_Preprocess` (one row per sentence).

    Returns:
        pandas DataFrame sorted by 'essay_id' with sentence counts above
        several length thresholds and max/mean/min/first/last of each column
        named in `sentence_fea`.
    """
    # Number of sentences with at least `min_len` characters.
    count_aggs = [
        pl.col('sentence').filter(pl.col('sentence_len') >= min_len).count().alias(f"sentence_{min_len}_cnt")
        for min_len in (15,50,100,150,200,250,300)
    ]
    # Statistic varies slowest, so the column order is max, mean, min, first, last.
    stat_aggs = [
        getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}")
        for stat in ('max', 'mean', 'min', 'first', 'last')
        for fea in sentence_fea
    ]

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(count_aggs + stat_aggs)
    return per_essay.sort("essay_id").to_pandas()
# merge
# Sentence features for the training essays, joined on essay_id.
tmp = Sentence_Preprocess(train)
sentence_feats = Sentence_Eng(tmp)
train_feats = train_feats.merge(sentence_feats, on='essay_id', how='left')
print('feature_num: ',len(train_feats.columns)-2)

In [None]:
# word feature
def Word_Preprocess(tmp):
    """Expand essays to one row per word and add the word length.

    Args:
        tmp: polars frame with a 'full_text' column.

    Returns:
        Frame with one row per space-delimited token of the cleaned text,
        a 'word_len' column (characters), and empty tokens removed.
    """
    split_words = (
        pl.col('full_text').map_elements(dataPreprocessing).str.split(by=" ").alias("word")
    )

    per_word = tmp.with_columns(split_words).explode('word')
    per_word = per_word.with_columns(
        pl.col('word').map_elements(lambda token: len(token)).alias("word_len")
    )
    # Splitting on a single space can yield empty tokens; discard them.
    return per_word.filter(pl.col('word_len') != 0)
# feature_eng
def Word_Eng(train_tmp):
    """Aggregate per-word rows into one feature row per essay.

    Args:
        train_tmp: output of `Word_Preprocess` (one row per word).

    Returns:
        pandas DataFrame sorted by 'essay_id' with counts of words of at
        least 1..15 characters, and max/mean/std/quartiles of word length.
    """
    word_len = pl.col('word_len')

    # word_{k}_cnt: number of words with at least k characters, k = 1..15.
    aggs = [
        pl.col('word').filter(word_len >= min_len).count().alias(f"word_{min_len}_cnt")
        for min_len in range(1, 16)
    ]
    # Distribution summary of word length.
    aggs += [
        word_len.max().alias("word_len_max"),
        word_len.mean().alias("word_len_mean"),
        word_len.std().alias("word_len_std"),
        word_len.quantile(0.25).alias("word_len_q1"),
        word_len.quantile(0.50).alias("word_len_q2"),
        word_len.quantile(0.75).alias("word_len_q3"),
    ]

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return per_essay.sort("essay_id").to_pandas()
# merge
# Word features for the training essays, joined on essay_id.
tmp = Word_Preprocess(train)
word_feats = Word_Eng(tmp)
train_feats = train_feats.merge(word_feats, on='essay_id', how='left')
print('feature_num: ',len(train_feats.columns)-2)

In [None]:
# Tfidf feature
# TF-IDF vectorizer fitted on the training essays; the fitted object is reused
# later to transform the test essays.
# NOTE(review): tokenizer and preprocessor are identity functions and the
# documents are raw strings, so each essay is presumably consumed character by
# character rather than word by word — confirm this matches how the saved
# LightGBM folds were trained before changing anything here.
# NOTE(review): strip_accents likely has no effect when a custom preprocessor
# is supplied; kept unchanged for parity with training.
vectorizer = TfidfVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            token_pattern=None,
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(1,3),
            min_df=0.05,
            max_df=0.95,
            sublinear_tf=True,
)

train_tfid = vectorizer.fit_transform([i for i in train['full_text']])
dense_matrix = train_tfid.toarray()
tfid_columns = [ f'tfid_{i}' for i in range(dense_matrix.shape[1])]
df = pd.DataFrame(dense_matrix, columns=tfid_columns)
# The TF-IDF rows follow the order of `train`, whereas `train_feats` is sorted
# by essay_id; take the ids from `train` so every vector keeps its own essay.
df['essay_id'] = [i for i in train['essay_id']]
# merge
train_feats = train_feats.merge(df, on='essay_id', how='left')
feature_names = [col for col in train_feats.columns if col not in ['essay_id','score']]
print('Features number: ',len(feature_names))

In [None]:
# idea from https://www.kaggle.com/code/rsakata/optimize-qwk-by-lgb/notebook#QWK-objective
# `a` is added back to every booster prediction in the prediction cell;
# `b` is not referenced anywhere else in the visible notebook.
a = 2.95
b = 1.1

# One saved LightGBM booster per cross-validation fold.
num_fold = 5
models = [
    lgb.Booster(model_file=f'../input/lal-lgb-baseline-4/fold_{fold}.txt')
    for fold in range(num_fold)
]

In [None]:
# Rebuild the same hand-crafted features for the test essays, in the same
# column order as for training (paragraph, sentence, word, then TF-IDF).
tmp = Paragraph_Preprocess(test)
test_feats = Paragraph_Eng(tmp)
tmp = Sentence_Preprocess(test)
test_feats = test_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')
tmp = Word_Preprocess(test)
test_feats = test_feats.merge(Word_Eng(tmp), on='essay_id', how='left')

# Transform only: the vocabulary and IDF weights come from the training fit.
test_tfid = vectorizer.transform([i for i in test['full_text']])
dense_matrix = test_tfid.toarray()
tfid_columns = [ f'tfid_{i}' for i in range(dense_matrix.shape[1])]
df = pd.DataFrame(dense_matrix, columns=tfid_columns)
# The TF-IDF rows follow the order of `test`, whereas `test_feats` is sorted by
# essay_id; take the ids from `test` so vectors are never attached to the
# wrong essay when the CSV is not already sorted.
df['essay_id'] = [i for i in test['essay_id']]
test_feats = test_feats.merge(df, on='essay_id', how='left')
feature_names = [col for col in test_feats.columns if col not in ['essay_id','score']]
print('Features number: ',len(feature_names))

In [None]:
# Average the fold predictions. The constant `a` is added to each booster's
# output (presumably the boosters were trained on `score - a` — confirm).
fold_sum = models[0].predict(test_feats[feature_names]) + a
for booster in models[1:num_fold]:
    fold_sum = fold_sum + (booster.predict(test_feats[feature_names]) + a)
pred_test = fold_sum / num_fold

# Clamp to the 1..6 range, then round to the nearest score.
pred_test = pred_test.clip(1, 6).round()

prediction = test_feats[['essay_id']].copy()
prediction['score'] = pred_test
prediction.to_csv('submission_2.csv', index=False)
prediction.head(3)

# <div style="color:white;display:fill;border-radius:5px;background-color:steelBlue;text-align:center;letter-spacing:0.1px;overflow:hidden;padding:20px;color:white;overflow:hidden;margin:0;font-size:100%">Ensemble</div>

In [None]:
# Load the two intermediate submissions from the same relative paths the
# earlier cells wrote them to, instead of a hard-coded absolute directory.
df1 = pd.read_csv('submission_1.csv')
df2 = pd.read_csv('submission_2.csv')

# Join on 'essay_id'; the two files are not in the same row order.
df = pd.merge(left=df1, right=df2, on='essay_id', suffixes=('_1', '_2'), validate='one_to_one')

# An inner join silently drops ids missing from either side, which would
# produce an incomplete submission; fail loudly instead.
if not (len(df) == len(df1) == len(df2)):
    raise ValueError(
        f"essay_id mismatch between submissions: {len(df1)} vs {len(df2)} rows, {len(df)} after merge"
    )

# Average the two scores and round to an integer.
# NOTE(review): when the two scores differ by an odd amount the mean ends in .5;
# .round() appears to resolve such ties towards the even value (2.5 -> 2,
# 3.5 -> 4) — confirm that this tie-breaking is intended.
df['score'] = ((df['score_1'] + df['score_2']) / 2).round().astype(int)

# Final submission: id and ensembled score only.
df[['essay_id', 'score']].to_csv('submission.csv', index=False)