In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import hstack
import xgboost as xgb

In [None]:
# Load every split of the readability dataset from the Hugging Face Hub.
dataset = load_dataset("agentlans/readability")

# Upper bound on the number of training rows kept for modelling.
TRAIN_SAMPLE_SIZE = 20000

# Materialise the training split *before* sampling from it so this cell runs
# on a fresh kernel (Restart & Run All).
# NOTE(review): assumes the dataset exposes a "train" split alongside the
# "validation" and "test" splits used below -- confirm against the dataset card.
df_train = dataset["train"].to_pandas()

# Fixed-seed random subset; never request more rows than the split contains,
# otherwise DataFrame.sample raises ValueError.
df_train = df_train.sample(
    n=min(TRAIN_SAMPLE_SIZE, len(df_train)),
    random_state=42,
).reset_index(drop=True)

df_val = dataset["validation"].to_pandas()
df_test = dataset["test"].to_pandas()

print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)

# Compute readability features
def add_readability_features(df):
    """Attach four textstat readability scores to ``df`` as new columns.

    Each score is obtained by applying the matching ``textstat`` function to
    every value of ``df["text"]``.

    Columns written (in this order): ``fk_grade``, ``smog_index``, ``ari``,
    ``coleman_liau``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"text"`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now carrying the new columns.
    """
    # (output column, scoring function) pairs, kept in column-creation order.
    scorers = (
        ("fk_grade", textstat.flesch_kincaid_grade),
        ("smog_index", textstat.smog_index),
        ("ari", textstat.automated_readability_index),
        ("coleman_liau", textstat.coleman_liau_index),
    )
    for column, scorer in scorers:
        df[column] = df["text"].apply(scorer)
    return df

# Add the readability score columns to each split (train, then val, then test).
df_train, df_val, df_test = (
    add_readability_features(split) for split in (df_train, df_val, df_test)
)

In [None]:
# Optional: derive an integer target column by rounding each split's grade.
for split_df in (df_train, df_val, df_test):
    split_df["grade_int"] = split_df["grade"].round().astype(int)

# TF-IDF over unigrams and bigrams, limited to 300 features. The vectorizer is
# fitted on the training text only and then applied to validation and test.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=300)
X_train_text = tfidf.fit_transform(df_train["text"])
X_val_text, X_test_text = (
    tfidf.transform(frame["text"]) for frame in (df_val, df_test)
)

# Readability score columns used as numeric features (single source of truth
# so the three splits cannot drift apart).
READABILITY_COLS = ["fk_grade", "smog_index", "ari", "coleman_liau"]

# Prepare numeric readability features; missing scores are replaced with 0.
read_train = df_train[READABILITY_COLS].fillna(0).values
read_val   = df_val[READABILITY_COLS].fillna(0).values
read_test  = df_test[READABILITY_COLS].fillna(0).values

# Combine text TF-IDF and readability metrics column-wise.
# CSR is requested explicitly: hstack returns COO by default, which does not
# support row indexing/slicing of the resulting matrices.
X_train = hstack([X_train_text, read_train], format="csr")
X_val   = hstack([X_val_text, read_val], format="csr")
X_test  = hstack([X_test_text, read_test], format="csr")

# Integer-rounded grades are the regression targets.
y_train = df_train["grade_int"]
y_val   = df_val["grade_int"]
y_test  = df_test["grade_int"]

# Hyper-parameters for the XGBoost regressor: absolute-error objective with
# MAE reported as the evaluation metric.
params = dict(
    objective="reg:absoluteerror",
    learning_rate=0.1,
    max_depth=4,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="mae",
)

# Fit on the training matrix; the validation split is passed as the eval set.
model = xgb.XGBRegressor(**params)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# Raw model outputs for the validation and test matrices.
raw_val = model.predict(X_val)
raw_test = model.predict(X_test)

# Round the predictions to the nearest integer so they match the integer targets.
preds_val = np.round(raw_val).astype(int)
preds_test = np.round(raw_test).astype(int)

# Mean absolute error of the rounded predictions on each split.
mae_val = mean_absolute_error(y_val, preds_val)
mae_test = mean_absolute_error(y_test, preds_test)

print(f"Validation MAE: {mae_val:.2f}")
print(f"Test MAE: {mae_test:.2f}")