# AES 2.0 | TF-IDF & XGB Baseline

# 1. Libraries

In [None]:
import os
import re
import nltk
import string
import warnings
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')

# 2. Configuration

In [None]:
class config:
    """Namespace of notebook-wide constants: input file locations and training settings."""

    # --- Cross-validation / boosting settings ---
    seed, n_folds = 45, 10
    num_rounds = 5000
    early_stopping_rounds = 500
    verbose = 500  # passed to xgboost as the evaluation logging period

    # --- Input files (all live under the same dataset directory) ---
    root = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
    sample_submission_path = os.path.join(root, "sample_submission.csv")
    test_path = os.path.join(root, "test.csv")
    train_path = os.path.join(root, "train.csv")

# 3. Loading Data

In [None]:
# Read the three input tables from the paths declared in `config`.
train = pd.read_csv(config.train_path)
test = pd.read_csv(config.test_path)
sample_submission = pd.read_csv(config.sample_submission_path)

# Quick sanity report: dimensions first, then total count of missing cells.
separator = "-" * 90
for label, frame in (("train", train), ("test", test)):
    print(f"{label} shape: {frame.shape}")
print(separator)
for label, frame in (("train", train), ("test", test)):
    print(f"{label} missing values: {frame.isnull().sum().sum()}")
print(separator)
train.head()

# 4. Target Distribution

In [None]:
# Histogram of the `score` column of the training table.
sns.set_style("whitegrid")

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=train, x='score', ax=ax)

plt.show()

# 5. Feature Engineering

In [None]:
# Lazily-filled cache so the NLTK stopword list is read and turned into a set
# only once, instead of once per essay when this function is used in `.apply`.
_STOPWORD_CACHE = {}


def count_stopwords(text: str) -> int:
    """Count whitespace-separated tokens of ``text`` that are English stopwords.

    Tokens are lower-cased before the lookup; punctuation attached to a token
    is not stripped, so ``"the,"`` does not match ``"the"``.

    Args:
        text: The text to scan.

    Returns:
        Number of tokens found in NLTK's English stopword list.
    """
    stopword_list = _STOPWORD_CACHE.get('english')
    if stopword_list is None:
        # First call: load the corpus and keep the set for all later calls.
        stopword_list = frozenset(stopwords.words('english'))
        _STOPWORD_CACHE['english'] = stopword_list
    return sum(1 for word in text.split() if word.lower() in stopword_list)

def count_punctuation(text: str) -> int:
    """Return how many characters of ``text`` belong to ``string.punctuation``."""
    marks = frozenset(string.punctuation)
    return len([ch for ch in text if ch in marks])

def count_numbers(text: str) -> int:
    """Return the number of maximal digit runs found in ``text``.

    Each uninterrupted sequence of digits counts once, so ``"3.14"``
    contributes two matches.
    """
    return sum(1 for _ in re.finditer(r'\d+', text))

def feature_engineer(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add hand-crafted text statistics derived from the ``full_text`` column.

    The new columns are written onto ``dataframe`` itself (the input is
    modified in place) and the same object is returned.

    Added columns:
        * ``total_words``: number of whitespace-separated tokens.
        * ``word_len_{max,mean,min,std,var,q25,q50,q75,q90}``: statistics of
          the token lengths; NaN when the text has no tokens.
        * ``sentence_len``: character length of the whole text.
        * ``stopword_cnt``, ``punct_cnt``, ``number_cnt``: counts from
          ``count_stopwords``, ``count_punctuation`` and ``count_numbers``.
        * ``sentence_word_ratio``: ``sentence_len / total_words``.
        * ``stopwords_ratio``: ``total_words / stopword_cnt``.

    Both ratios are NaN (rather than ``inf``) when their denominator is zero.

    Args:
        dataframe: Table with a string column named ``full_text``.

    Returns:
        ``dataframe`` with the feature columns added.
    """
    essays = dataframe["full_text"]

    # One list of token lengths per essay; tokens are whitespace-separated.
    word_lengths = essays.apply(lambda essay: [len(token) for token in essay.split()])

    def word_len_stat(reducer, *args):
        # np.max / np.min / np.quantile raise on an empty sequence, so texts
        # without any token get NaN instead of aborting the whole run.
        return word_lengths.apply(
            lambda lengths: reducer(lengths, *args) if lengths else np.nan
        )

    dataframe["total_words"] = word_lengths.apply(len)
    dataframe["word_len_max"] = word_len_stat(np.max)
    dataframe["word_len_mean"] = word_len_stat(np.mean)
    dataframe["word_len_min"] = word_len_stat(np.min)
    dataframe["word_len_std"] = word_len_stat(np.std)
    dataframe["word_len_var"] = word_len_stat(np.var)

    dataframe["word_len_q25"] = word_len_stat(np.quantile, 0.25)
    dataframe["word_len_q50"] = word_len_stat(np.quantile, 0.50)
    dataframe["word_len_q75"] = word_len_stat(np.quantile, 0.75)
    dataframe["word_len_q90"] = word_len_stat(np.quantile, 0.90)

    dataframe["sentence_len"] = essays.apply(len)
    dataframe["stopword_cnt"] = essays.apply(count_stopwords)
    dataframe["punct_cnt"] = essays.apply(count_punctuation)
    dataframe["number_cnt"] = essays.apply(count_numbers)

    # A zero denominator would produce +/-inf; map it to NaN so the ratio is
    # treated as a missing value downstream instead of an infinite one.
    safe_total_words = dataframe["total_words"].replace(0, np.nan)
    safe_stopword_cnt = dataframe["stopword_cnt"].replace(0, np.nan)
    dataframe["sentence_word_ratio"] = dataframe["sentence_len"] / safe_total_words
    dataframe["stopwords_ratio"] = dataframe["total_words"] / safe_stopword_cnt

    return dataframe

In [None]:
# Compute the hand-crafted features for both splits (train first, then test).
train_fe, test_fe = feature_engineer(train), feature_engineer(test)

train_fe.head()

In [None]:
# Word-level TF-IDF over 1- to 3-grams, keeping only terms whose document
# frequency lies between 5% and 95% of the training essays.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    min_df=0.05,
    max_df=0.95,
    sublinear_tf=True,
    strip_accents='unicode',
    encoding='utf-8',
)

# Fit the vocabulary on the training essays only, then reuse it for test.
train_matrix = vectorizer.fit_transform(train['full_text'])
test_matrix = vectorizer.transform(test['full_text'])

# Both frames share the same prefixed column names, one per vocabulary term.
tfidf_columns = [f"tfidf_{str(term)}" for term in vectorizer.get_feature_names_out()]

train_vectorized = pd.DataFrame(train_matrix.toarray(), columns=tfidf_columns)
test_vectorized = pd.DataFrame(test_matrix.toarray(), columns=tfidf_columns)

train_vectorized.head()

# 6. Modeling

In [None]:
# Booster settings for the native XGBoost training API.
# NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost; newer releases express
# this as tree_method='hist' plus device='cuda' — confirm against the
# installed version before changing it.
params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'sampling_method': 'uniform',
    'tree_method': 'gpu_hist',
    'learning_rate': 0.025,
    'max_depth': 7,
    'subsample': 0.78,
    'min_child_weight': 5
}

# Feature matrices: engineered features + TF-IDF columns, minus identifiers,
# the raw text and (for train) the target.
X = pd.concat([train_fe, train_vectorized], axis=1).drop(columns=["essay_id", "full_text", "score"])
X_test = pd.concat([test_fe, test_vectorized], axis=1).drop(columns=["essay_id", "full_text"])
y = train["score"]
# Accumulator for the fold-averaged test predictions.
y_test = np.zeros(shape=test.shape[0], dtype=float)
dtest = xgb.DMatrix(data=X_test)

cv = StratifiedKFold(n_splits=config.n_folds, shuffle=True, random_state=config.seed)

for idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    print(f"| Fold {idx+1} |".center(80, "-"))
    # split() yields positional indices, so select rows by position (.iloc)
    # rather than by index label (.loc).
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    print(f'train: {X_train.shape}')
    print(f'val: {X_val.shape}')

    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dval = xgb.DMatrix(data=X_val, label=y_val)

    # Early stopping monitors the last entry of `evals`, i.e. the validation set.
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        evals=[(dtrain, 'train'), (dval, 'validation')],
        num_boost_round=config.num_rounds,
        early_stopping_rounds=config.early_stopping_rounds,
        verbose_eval=config.verbose
    )

    # The returned booster also holds the rounds trained after the best
    # validation score; restrict prediction to the best iteration.
    fold_pred = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
    y_test += fold_pred / config.n_folds

# Round to whole scores and keep them inside the range of labels seen in training.
sample_submission["score"] = np.clip(np.rint(y_test), y.min(), y.max()).astype(int)

# 7. Saving Submission

In [None]:
# Preview the first rows of the submission table with the predicted scores filled in.
sample_submission.head()

In [None]:
# Write the submission to the working directory, without the DataFrame index column.
sample_submission.to_csv('submission.csv', index=False)