In [2]:
!pip install datasets textstat xgboost --quiet

from datasets import load_dataset
import pandas as pd
import numpy as np
import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from scipy.sparse import hstack
import xgboost as xgb

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.1/177.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# Load dataset
dataset = load_dataset("agentlans/readability")
df_train = dataset["train"].to_pandas()
df_train = df_train.sample(n=20000, random_state=42).reset_index(drop=True)
df_val = dataset["validation"].to_pandas()
df_test = dataset["test"].to_pandas()

print("Train shape:", df_train.shape)
print("Validation shape:", df_val.shape)
print("Test shape:", df_test.shape)

# Compute readability features
def add_readability_features(df):
    df["fk_grade"] = df["text"].apply(textstat.flesch_kincaid_grade)
    df["smog_index"] = df["text"].apply(textstat.smog_index)
    df["ari"] = df["text"].apply(textstat.automated_readability_index)
    df["coleman_liau"] = df["text"].apply(textstat.coleman_liau_index)
    return df

df_train = add_readability_features(df_train)
df_val   = add_readability_features(df_val)
df_test  = add_readability_features(df_test)

Train shape: (20000, 3)
Validation shape: (13095, 3)
Test shape: (13095, 3)


In [5]:
# Optional: round target grades to nearest integer
df_train["grade_int"] = df_train["grade"].round().astype(int)
df_val["grade_int"] = df_val["grade"].round().astype(int)
df_test["grade_int"] = df_test["grade"].round().astype(int)

# TF-IDF vectorization of text
tfidf = TfidfVectorizer(max_features=300, ngram_range=(1,2))
X_train_text = tfidf.fit_transform(df_train["text"])
X_val_text   = tfidf.transform(df_val["text"])
X_test_text  = tfidf.transform(df_test["text"])

# Features List
features = ["fk_grade", "smog_index", "ari", "coleman_liau"]

# Prepare numeric readability features
read_train = df_train[features].fillna(0).values
read_val   = df_val[features].fillna(0).values
read_test  = df_test[features].fillna(0).values

# Combine text TF-IDF and readability metrics
X_train = hstack([X_train_text, read_train])
X_val   = hstack([X_val_text, read_val])
X_test  = hstack([X_test_text, read_test])

y_train = df_train["grade_int"]
y_val   = df_val["grade_int"]
y_test  = df_test["grade_int"]

# Train XGBoost regressor
params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.1,
    "max_depth": 4,
    "n_estimators": 200,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "mae",
}

model = xgb.XGBRegressor(**params)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)]
)

#  Predict and round to nearest integer
preds_val = np.round(model.predict(X_val)).astype(int)
preds_test = np.round(model.predict(X_test)).astype(int)

# Evaluate performance
mae_val = mean_absolute_error(y_val, preds_val)
mae_test = mean_absolute_error(y_test, preds_test)

print(f"Validation MAE: {mae_val:.2f}")
print(f"Test MAE: {mae_test:.2f}")


[0]	validation_0-mae:4.23257
[1]	validation_0-mae:3.81753
[2]	validation_0-mae:3.44428
[3]	validation_0-mae:3.10747
[4]	validation_0-mae:2.80551
[5]	validation_0-mae:2.53381
[6]	validation_0-mae:2.28850
[7]	validation_0-mae:2.06918
[8]	validation_0-mae:1.87183
[9]	validation_0-mae:1.69442
[10]	validation_0-mae:1.53609
[11]	validation_0-mae:1.39392
[12]	validation_0-mae:1.26736
[13]	validation_0-mae:1.15407
[14]	validation_0-mae:1.05328
[15]	validation_0-mae:0.96318
[16]	validation_0-mae:0.88365
[17]	validation_0-mae:0.81330
[18]	validation_0-mae:0.75132
[19]	validation_0-mae:0.69676
[20]	validation_0-mae:0.64883
[21]	validation_0-mae:0.60735
[22]	validation_0-mae:0.57156
[23]	validation_0-mae:0.54021
[24]	validation_0-mae:0.51337
[25]	validation_0-mae:0.49040
[26]	validation_0-mae:0.47079
[27]	validation_0-mae:0.45419
[28]	validation_0-mae:0.44009
[29]	validation_0-mae:0.42843
[30]	validation_0-mae:0.41816
[31]	validation_0-mae:0.40964
[32]	validation_0-mae:0.40258
[33]	validation_0-ma

In [9]:
# Features List
features = ["fk_grade", "smog_index", "ari", "coleman_liau"]

def predict_grade_level(text):
    # Create a DataFrame for the input text
    df_new = pd.DataFrame({"text": [text]})

    # Add readability features
    df_new = add_readability_features(df_new)

    # TF-IDF vectorization
    X_new_text = tfidf.transform(df_new["text"])

    # Prepare numeric readability features
    read_new = df_new[features].fillna(0).values

    # Combine features
    X_new = hstack([X_new_text, read_new])

    # Predict and round to nearest integer
    prediction = np.round(model.predict(X_new))[0].astype(int)

    return prediction

'{example_text_1}'
Predicted Grade Level: 3

'{example_text_2}'
Predicted Grade Level: 2
