In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
## Submission 
# Competition data directory, kept in one place so the three reads below cannot drift apart.
AES2_INPUT_DIR = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2"

# Sample submission, training essays and test essays as pandas frames.
sub_df = pd.read_csv(f"{AES2_INPUT_DIR}/sample_submission.csv")
train_df = pd.read_csv(f"{AES2_INPUT_DIR}/train.csv")
test_df = pd.read_csv(f"{AES2_INPUT_DIR}/test.csv")

In [None]:
# Preview the first rows of the sample submission frame.
sub_df.head()

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Number of rows in the training frame.
len(train_df)

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of how many training essays received each score.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='score', data=train_df, ax=ax)
ax.set_title('Distribution of Essay Scores')
ax.set_xlabel('Score')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
# Explore the distribution of essay lengths (counted in NLTK word tokens) in the training dataset.
# Work on a copy: a plain assignment would alias train_df, so adding the
# 'essay_length' column below would silently modify the training frame itself.
Plot_train_df = train_df.copy()

Plot_train_df['essay_length'] = Plot_train_df['full_text'].apply(lambda x: len(word_tokenize(x)))
plt.figure(figsize=(8, 6))
sns.histplot(Plot_train_df['essay_length'], bins=20, kde=True)
plt.title('Distribution of Essay Lengths')
plt.xlabel('Essay Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import gc
import torch
import re
import copy
import polars as pl
import lightgbm as lgb
from tqdm.auto import tqdm,trange
from lightgbm import log_evaluation, early_stopping
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import cohen_kappa_score, accuracy_score

## Load DeBERTa Model

In [None]:
# Inference settings shared by the tokenizer, collator and Trainer cells below.
max_length = 1024  # token cap passed to the tokenizer; longer essays are truncated
model_path = '/kaggle/input/es-deberta-large-fold0'  # directory both the tokenizer and the model are loaded from
eval_batch_size = 1  # per-device batch size used when predicting

In [None]:
# Load the tokenizer stored under model_path.
tokenizer = AutoTokenizer.from_pretrained(model_path)

def tokenize(sample):
    """Tokenize one record's ``full_text`` with the module-level ``tokenizer``.

    Text longer than ``max_length`` tokens is truncated; no padding is
    requested here.
    """
    text = sample['full_text']
    return tokenizer(text, truncation=True, max_length=max_length)

In [None]:
# Wrap the test frame in a Dataset, tokenize every row, then drop the raw id/text columns.
updated_test_df = (
    Dataset.from_pandas(test_df)
    .map(tokenize)
    .remove_columns(['essay_id', 'full_text'])
)

In [None]:
# Show the tokenized dataset's summary (columns and row count).
updated_test_df

In [None]:
# Column names left on the dataset after tokenization and column removal.
features_list = updated_test_df.column_names
features_list

In [None]:
class DataCollator:
    """Batch collator that pads tokenized essays to a common length.

    Only ``input_ids`` and ``attention_mask`` are forwarded to the padder;
    any other keys on the incoming features are dropped. Relies on the
    module-level ``tokenizer`` and ``max_length``.
    """

    # Keys copied from each feature before padding.
    _KEPT_KEYS = ('input_ids', 'attention_mask')

    def __call__(self, features):
        """Return a padded batch of PyTorch tensors for ``features``."""
        trimmed = [{key: item[key] for key in self._KEPT_KEYS} for item in features]
        return tokenizer.pad(
            trimmed,
            padding=True,
            max_length=max_length,
            pad_to_multiple_of=16,
            return_tensors='pt',
        )

In [None]:
from torch.nn.utils.rnn import pad_sequence
# Load the sequence-classification model from the same directory as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
collator = DataCollator()

# The Trainer is used for prediction only: "." is its output directory and
# report_to="none" turns off experiment-tracker logging. The collator selects
# the model inputs per batch, so no pre-built list of features is needed here.
args = TrainingArguments(".", per_device_eval_batch_size=eval_batch_size, report_to="none")
trainer = Trainer(model=model, args=args, data_collator=collator, tokenizer=tokenizer)

In [None]:
# Run the model over the tokenized test set and keep only the raw prediction array.
predictions = trainer.predict(updated_test_df).predictions


In [None]:
# Inspect the raw model outputs.
predictions

In [None]:
# Take the highest-scoring output index per essay and shift it by one
# (presumably class index 0 corresponds to score 1 — confirm against the model's label mapping).
preds = np.argmax(predictions, axis=-1) + 1
test_df['score'] = preds

# Persist the DeBERTa-only predictions as the first intermediate submission.
submission_1 = test_df.loc[:, ['essay_id', 'score']]
submission_1.to_csv('submission_1.csv', index=False)

In [None]:
# Preview the test frame, which now carries the predicted 'score' column.
test_df.head()

## Make Submission 2 


For our second submission, we use a baseline that combines hand-crafted paragraph-, sentence- and word-level statistics with Term Frequency-Inverse Document Frequency (TF-IDF) features and LightGBM (LGBM), a gradient boosting framework. TF-IDF vectorization represents each essay in the dataset as a vector based on how often terms occur and how important they are in distinguishing between essays. LGBM, known for its efficiency and effectiveness in handling large datasets, predicts essay scores from these features; this notebook loads five pre-trained LGBM fold models and averages their predictions.

In [None]:

import pandas as pd 
# Reload the competition train/test CSVs as pandas frames under shorter names.
# NOTE(review): a later cell rebinds `train` and `test` to polars frames read from
# the same files, so these pandas copies only serve the preview that follows.
train=pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv")
test=pd.read_csv("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv")

In [None]:
# Preview the first rows of the pandas training frame.
train.head()

In [None]:
# Extra columns derived at load time.
columns = [
    # paragraph: split each essay on blank lines into a list of paragraphs
    (pl.col("full_text").str.split(by="\n\n").alias("paragraph")),
]

# Competition data directory (with trailing slash); both reads below are built from it.
PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
train = pl.read_csv(PATH + "train.csv").with_columns(columns)
test = pl.read_csv(PATH + "test.csv").with_columns(columns)

In [None]:
# Preview the polars training frame, including the new 'paragraph' list column.
train.head()

In [None]:
# Compiled once at definition time instead of on every removeHTML call.
_HTML_TAG_RE = re.compile(r'<.*?>')


def removeHTML(x):
    """Return ``x`` with every ``<...>`` span removed (shortest match, no newlines inside)."""
    return _HTML_TAG_RE.sub('', x)


def dataPreprocessing(x):
    """Normalise raw essay text before feature extraction.

    Steps, in order: lowercase; drop ``<...>`` tags; drop ``@word`` mentions;
    drop apostrophe-prefixed numbers, then all remaining digit runs; drop
    tokens that start with ``http`` followed by word characters; collapse
    whitespace runs to one space; collapse repeated periods and repeated
    commas to a single one; strip leading and trailing whitespace.

    The patterns are raw strings so ``\\w``, ``\\d`` and ``\\s`` reach the regex
    engine without relying on Python passing unknown string escapes through.
    The matched text is unchanged, so features stay compatible with models
    trained on the previous output.

    Args:
        x: the text to clean (``str``).

    Returns:
        The cleaned text.
    """
    x = x.lower()
    x = removeHTML(x)
    x = re.sub(r"@\w+", '', x)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Only strips "http" plus directly following word characters; "http://..." is left as-is.
    x = re.sub(r"http\w+", '', x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x

### **Paragraph Preprocess**
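In this context, "Paragraph Preprocess" refers to preparing each paragraph of an essay for feature extraction: the essay is split on blank lines, every paragraph is cleaned, and its character length, sentence count and word count are recorded. These per-paragraph values are then aggregated into per-essay features.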

In [None]:
def Paragraph_Preprocess(tmp):
    """Expand essays to one row per paragraph and measure each paragraph.

    Args:
        tmp: polars frame with a list-valued ``paragraph`` column.

    Returns:
        The exploded frame with ``paragraph`` cleaned by ``dataPreprocessing``
        and three added columns: ``paragraph_len`` (characters),
        ``paragraph_sentence_cnt`` (pieces after splitting on '.') and
        ``paragraph_word_cnt`` (pieces after splitting on ' ').
    """
    paragraph = pl.col('paragraph')

    # One row per paragraph, then clean the text in place.
    per_paragraph = tmp.explode('paragraph')
    per_paragraph = per_paragraph.with_columns(paragraph.map_elements(dataPreprocessing))

    # Character length of the cleaned paragraph.
    per_paragraph = per_paragraph.with_columns(
        paragraph.map_elements(lambda text: len(text)).alias("paragraph_len")
    )

    # Sentence and word counts from simple delimiter splits.
    return per_paragraph.with_columns(
        paragraph.map_elements(lambda text: len(text.split('.'))).alias("paragraph_sentence_cnt"),
        paragraph.map_elements(lambda text: len(text.split(' '))).alias("paragraph_word_cnt"),
    )
# Per-paragraph measurements that get summarised per essay.
paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
def Paragraph_Eng(train_tmp):
    """Aggregate per-paragraph measurements into one feature row per essay.

    Args:
        train_tmp: frame produced by ``Paragraph_Preprocess``.

    Returns:
        pandas DataFrame sorted by ``essay_id`` holding, per essay, counts of
        paragraphs at or above / at or below fixed character lengths, followed
        by max, mean, min, first and last of every column in ``paragraph_fea``.
    """
    paragraph = pl.col('paragraph')
    length = pl.col('paragraph_len')

    # Paragraph counts at or above each length threshold.
    long_counts = [
        paragraph.filter(length >= limit).count().alias(f"paragraph_{limit}_cnt")
        for limit in (50, 75, 100, 125, 150, 175, 200, 250, 300, 350, 400, 500, 600, 700)
    ]
    # Paragraph counts at or below each length threshold.
    short_counts = [
        paragraph.filter(length <= limit).count().alias(f"paragraph_{limit}_cnt")
        for limit in (25, 49)
    ]
    # Summary statistics; statistic-major order keeps the output column order fixed.
    summaries = [
        getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}")
        for stat in ('max', 'mean', 'min', 'first', 'last')
        for fea in paragraph_fea
    ]

    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(
        long_counts + short_counts + summaries
    )
    return grouped.sort("essay_id").to_pandas()
tmp = Paragraph_Preprocess(train)
train_feats = Paragraph_Eng(tmp)
# Paragraph_Eng returns rows sorted by essay_id while `train` keeps file order,
# so attach the target by essay_id rather than by row position.
score_by_essay = dict(zip(train['essay_id'].to_list(), train['score'].to_list()))
train_feats['score'] = train_feats['essay_id'].map(score_by_essay)
# Subtract the two non-feature columns (essay_id and score).
print('feature_num: ',len(train_feats.columns)-2)

### **Sentence Preprocess**

Each cleaned essay is split on periods into sentences, fragments shorter than 15 characters are discarded, and the character length and word count of every remaining sentence are calculated. These sentence statistics are then aggregated per essay (counts above several length thresholds, plus max, mean, min, first and last values) to produce sentence-level features for the scoring model.

In [None]:
# sentence feature
def Sentence_Preprocess(tmp):
    """Expand essays to one row per sentence and measure each sentence.

    Args:
        tmp: polars frame with a ``full_text`` column.

    Returns:
        Frame with one row per '.'-delimited piece of the cleaned essay that
        is at least 15 characters long, plus ``sentence_len`` (characters)
        and ``sentence_word_cnt`` (pieces after splitting on ' ').
    """
    sentence = pl.col('sentence')

    # Clean the whole essay, split it on '.', and give each piece its own row.
    split_expr = pl.col('full_text').map_elements(dataPreprocessing).str.split(by=".").alias("sentence")
    per_sentence = tmp.with_columns(split_expr).explode('sentence')

    # Character length, then drop fragments shorter than 15 characters.
    per_sentence = per_sentence.with_columns(
        sentence.map_elements(lambda text: len(text)).alias("sentence_len")
    )
    per_sentence = per_sentence.filter(pl.col('sentence_len') >= 15)

    # Word count from a plain space split.
    return per_sentence.with_columns(
        sentence.map_elements(lambda text: len(text.split(' '))).alias("sentence_word_cnt")
    )
# Per-sentence measurements that get summarised per essay.
sentence_fea = ['sentence_len','sentence_word_cnt']
def Sentence_Eng(train_tmp):
    """Aggregate per-sentence measurements into one feature row per essay.

    Args:
        train_tmp: frame produced by ``Sentence_Preprocess``.

    Returns:
        pandas DataFrame sorted by ``essay_id`` holding, per essay, counts of
        sentences at or above fixed character lengths, followed by max, mean,
        min, first and last of every column in ``sentence_fea``.
    """
    sentence = pl.col('sentence')
    length = pl.col('sentence_len')

    # Sentence counts at or above each length threshold.
    threshold_counts = [
        sentence.filter(length >= limit).count().alias(f"sentence_{limit}_cnt")
        for limit in (15, 50, 100, 150, 200, 250, 300)
    ]
    # Summary statistics; statistic-major order keeps the output column order fixed.
    summaries = [
        getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}")
        for stat in ('max', 'mean', 'min', 'first', 'last')
        for fea in sentence_fea
    ]

    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(threshold_counts + summaries)
    return grouped.sort("essay_id").to_pandas()
# Build the sentence-level features and left-join them onto the running feature table.
tmp = Sentence_Preprocess(train)
sentence_feats = Sentence_Eng(tmp)
train_feats = pd.merge(train_feats, sentence_feats, on='essay_id', how='left')
print('feature_num: ',len(train_feats.columns)-2)

### **Word Preprocessing:**
Each cleaned essay is split on spaces into words, and empty tokens are dropped. For every essay we then count the words that reach each length from 1 to 15 characters and summarise the word-length distribution (maximum, mean, standard deviation and quartiles). No stop-word removal, stemming or lemmatization is applied in this notebook; the word-length statistics alone serve as features for the scoring model.








In [None]:
# word feature
def Word_Preprocess(tmp):
    """Expand essays to one row per word and record each word's length.

    Args:
        tmp: polars frame with a ``full_text`` column.

    Returns:
        Frame with one row per non-empty space-delimited token of the cleaned
        essay, plus ``word_len`` (characters).
    """
    # Clean the whole essay, split it on single spaces, and give each token its own row.
    split_expr = pl.col('full_text').map_elements(dataPreprocessing).str.split(by=" ").alias("word")
    per_word = tmp.with_columns(split_expr).explode('word')

    per_word = per_word.with_columns(
        pl.col('word').map_elements(lambda token: len(token)).alias("word_len")
    )
    # Empty tokens have length zero and are dropped.
    return per_word.filter(pl.col('word_len') != 0)
# feature_eng
def Word_Eng(train_tmp):
    """Aggregate per-word lengths into one feature row per essay.

    Args:
        train_tmp: frame produced by ``Word_Preprocess``.

    Returns:
        pandas DataFrame sorted by ``essay_id`` holding, per essay, the number
        of words with at least 1..15 characters, followed by max, mean,
        standard deviation and the 25%/50%/75% quantiles of word length.
    """
    length = pl.col('word_len')

    # word_<n>_cnt: number of words with at least n characters, n = 1..15.
    length_counts = [
        pl.col('word').filter(length >= n).count().alias(f"word_{n}_cnt")
        for n in range(1, 16)
    ]
    # Distribution summary of word length.
    length_stats = [
        length.max().alias("word_len_max"),
        length.mean().alias("word_len_mean"),
        length.std().alias("word_len_std"),
        length.quantile(0.25).alias("word_len_q1"),
        length.quantile(0.50).alias("word_len_q2"),
        length.quantile(0.75).alias("word_len_q3"),
    ]

    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(length_counts + length_stats)
    return grouped.sort("essay_id").to_pandas()
# Build the word-level features and left-join them onto the running feature table.
tmp = Word_Preprocess(train)
word_feats = Word_Eng(tmp)
train_feats = pd.merge(train_feats, word_feats, on='essay_id', how='left')
print('feature_num: ',len(train_feats.columns)-2)

### **Tfidf feature:**
TF-IDF is a statistical measure used to evaluate the importance of a term in a document relative to a collection of documents. It calculates a weight for each term in the document based on its frequency (TF) and inverse document frequency (IDF), where rare terms that appear in fewer documents receive higher weights. This representation is commonly used in text mining, information retrieval, and natural language processing tasks to capture the significance of terms in a document corpus. Here the TF-IDF matrix is converted to dense columns named `tfid_<i>` and merged with the hand-crafted features.
 

In [None]:
# NOTE(review): with identity `tokenizer`/`preprocessor` and raw strings as input,
# scikit-learn iterates over each essay character by character, so the 1-3 "word"
# n-grams are presumably character n-grams. Left untouched because the pre-trained
# boosters expect exactly these columns — confirm against the training notebook.
vectorizer = TfidfVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            token_pattern=None,
            strip_accents='unicode',
            analyzer = 'word',
            ngram_range=(1,3),
            min_df=0.05,
            max_df=0.95,
            sublinear_tf=True,
)
# Fit on the training essays (file order) and densify into tfid_<i> columns.
train_tfid = vectorizer.fit_transform([i for i in train['full_text']])
dense_matrix = train_tfid.toarray()
df = pd.DataFrame(dense_matrix)
tfid_columns = [ f'tfid_{i}' for i in range(len(df.columns))]
df.columns = tfid_columns
# The TF-IDF rows follow `train`'s row order, so take the ids from `train` itself;
# `train_feats` is sorted by essay_id and would mislabel rows if the file is not sorted.
df['essay_id'] = train['essay_id'].to_list()
# merge
train_feats = train_feats.merge(df, on='essay_id', how='left')
print('feature_num: ',len(train_feats.columns)-2)

In [None]:
# Every column except the id and the target is a model input.
excluded = ('essay_id', 'score')
feature_names = [name for name in train_feats.columns if name not in excluded]
print('Number of features:', len(feature_names))
train_feats.head(10)

In [None]:
# Load the five saved LightGBM fold boosters (fold_0.txt .. fold_4.txt).
models = [
    lgb.Booster(model_file=f'/kaggle/input/lal-lgb-baseline-2/fold_{fold}.txt')
    for fold in range(5)
]

In [None]:
# Rebuild the paragraph, sentence and word features for the test essays.
tmp = Paragraph_Preprocess(test)
test_feats = Paragraph_Eng(tmp)
tmp = Sentence_Preprocess(test)
test_feats = test_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')
tmp = Word_Preprocess(test)
test_feats = test_feats.merge(Word_Eng(tmp), on='essay_id', how='left')

# Transform (not fit) with the vectorizer fitted on the training essays.
test_tfid = vectorizer.transform([i for i in test['full_text']])
dense_matrix = test_tfid.toarray()
df = pd.DataFrame(dense_matrix)
tfid_columns = [ f'tfid_{i}' for i in range(len(df.columns))]
df.columns = tfid_columns
# The TF-IDF rows follow `test`'s row order, so take the ids from `test` itself;
# `test_feats` is sorted by essay_id and would mislabel rows if the file is not sorted.
df['essay_id'] = test['essay_id'].to_list()
test_feats = test_feats.merge(df, on='essay_id', how='left')
feature_names = list(filter(lambda x: x not in ['essay_id','score'], test_feats.columns))

In [None]:
# Inspect the feature columns that will be passed to the boosters.
feature_names

In [None]:
# Predict with every loaded booster and average; dividing by len(models) keeps
# the mean correct if the number of fold models changes.
fold_preds = [booster.predict(test_feats[feature_names]) for booster in models]
pred_test = sum(fold_preds) / len(models)

# Clamp to the 1..6 range used here, then round to whole scores.
pred_test = pred_test.clip(1, 6).round()

# Persist the LightGBM-only predictions as the second intermediate submission.
prediction = test_feats[['essay_id']].copy()
prediction['score'] = pred_test
prediction.to_csv('submission_2.csv', index=False)

In [None]:
# Inspect the LightGBM ensemble predictions.
prediction

# **Final Submission:**

In [None]:
# Read the two intermediate submissions back using the same relative paths they
# were written with, so this cell does not depend on the working directory
# being /kaggle/working.
dataset1 = pd.read_csv('submission_1.csv')
dataset2 = pd.read_csv('submission_2.csv')

In [None]:
# Preview the first intermediate submission (read from submission_1.csv).
dataset1.head()

In [None]:
# Preview the second intermediate submission (read from submission_2.csv).
dataset2.head()

In [None]:
# Join the two submissions on essay_id; the clashing 'score' columns become score_x / score_y.
merged_df = dataset1.merge(dataset2, on='essay_id', suffixes=('_x', '_y'))
merged_df.head()

In [None]:
# Blend the two models by averaging their scores and rounding to an integer.
# NOTE(review): rounding here is half-to-even, so ties resolve inconsistently
# (2 and 3 -> 2, but 3 and 4 -> 4) — confirm this tie-breaking is intended.
merged_df['score'] = ((merged_df['score_x'] + merged_df['score_y']) / 2).round().astype(int)

In [None]:
# Preview the merged frame with the blended 'score' column.
merged_df.head()

In [None]:
# Write the final submission: only essay_id and the blended score, without the index.
merged_df[['essay_id', 'score']].to_csv('submission.csv', index=False)

In [None]:
# Show the first five lines of the final submission as raw CSV text (header included).
csv_string = merged_df[['essay_id', 'score']].to_csv(index=False)
csv_lines = csv_string.split('\n')
print('\n'.join(csv_lines[:5]))

In [None]:
# Show the sample submission again for comparison with the generated file's columns.
sub_df.head()