In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score
from scipy.sparse import hstack
from sentence_transformers import SentenceTransformer
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
df_train = pd.read_csv(r"/content/df_train.csv")
df_test = pd.read_csv(r"/content/df_test.csv")

In [4]:
df_train.info()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9912 entries, 0 to 9911
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   prompt                  9912 non-null   object 
 1   essay                   9912 non-null   object 
 2   task_achievement        9667 non-null   float64
 3   coherence_and_cohesion  9614 non-null   float64
 4   lexical_resource        9238 non-null   float64
 5   grammatical_range       9041 non-null   float64
dtypes: float64(4), object(2)
memory usage: 464.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 473 entries, 0 to 472
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   prompt  473 non-null    object
 1   essay   473 non-null    object
dtypes: object(2)
memory usage: 7.5+ KB


In [None]:
plt.figure(figsize=(8,6))
corr_matrix = df_train[score_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Heatmap Korelasi Antar Skor", fontsize=14)
plt.show()

In [5]:
# Panjang esai
df_train["essay_length"] = df_train["essay"].apply(lambda x: len(x.split()))
df_test["essay_length"] = df_test["essay"].apply(lambda x: len(x.split()))

In [6]:
# Buang outlier
df_train_clean = df_train[(df_train["essay_length"] >= 50) & (df_train["essay_length"] <= 700)].copy()

In [8]:
score_cols = ["task_achievement", "coherence_and_cohesion", "lexical_resource", "grammatical_range"]

for col in score_cols:
    df_train_clean[col] = df_train_clean[col].replace(0, np.nan)
    df_train_clean[col] = df_train_clean[col].fillna(df_train_clean[col].median())

# Gabungkan prompt + essay
df_train_clean["full_text"] = df_train_clean["prompt"] + " " + df_train_clean["essay"]
df_test["full_text"] = df_test["prompt"] + " " + df_test["essay"]

Feature Engineering

In [10]:
stop_words = set(stopwords.words("english"))

def extract_features(text):
    words = text.split()
    num_words = len(words)
    num_sentences = max(1, len(nltk.sent_tokenize(text)))
    unique_words = len(set(words))
    stopword_count = sum(1 for w in words if w.lower() in stop_words)
    punctuation_count = sum(1 for c in text if c in string.punctuation)
    return [
        len(text),  # char length
        num_words,  # word length
        np.mean([len(w) for w in words]) if words else 0,  # avg word length
        num_sentences,  # number of sentences
        num_words / num_sentences,  # avg sentence length
        unique_words,  # unique word count
        unique_words / num_words if num_words > 0 else 0,  # type-token ratio
        stopword_count / num_words if num_words > 0 else 0,  # stopword ratio
        punctuation_count  # punctuation count
    ]

feature_names = [
    "char_length", "word_length", "avg_word_len",
    "n_sentences", "avg_sentence_len", "unique_words",
    "ttr", "stopword_ratio", "punctuation_count"
]

train_extra_features = np.array([extract_features(t) for t in df_train_clean["essay"]])
test_extra_features = np.array([extract_features(t) for t in df_test["essay"]])

TF-IDF

In [12]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(df_train_clean["full_text"])
X_test_tfidf = tfidf.transform(df_test["full_text"])

SentenceTransformer Embeddings

In [13]:
model_st = SentenceTransformer("all-MiniLM-L6-v2")
X_train_emb = model_st.encode(df_train_clean["full_text"].tolist(), convert_to_tensor=False) # Convert to list
X_test_emb = model_st.encode(df_test["full_text"].tolist(), convert_to_tensor=False) # Convert to list

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Modelling

In [14]:
lgbm_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42
}
model = MultiOutputRegressor(LGBMRegressor(**lgbm_params))

In [18]:
y_train = df_train_clean[score_cols].values
print(X_train_tfidf.shape[0], train_extra_features.shape[0], X_train_emb.shape[0], y_train.shape[0])


9909 9909 9909 9909


In [25]:
from scipy.sparse import csr_matrix, hstack

# Gabungkan semua fitur
X_train_final = hstack([
    X_train_tfidf,
    csr_matrix(train_extra_features),
    csr_matrix(X_train_emb)
])

X_test_final = hstack([
    X_test_tfidf,
    csr_matrix(test_extra_features),
    csr_matrix(X_test_emb)
])

# Target
y_train = df_train_clean[score_cols].values

In [26]:
from sklearn.metrics import make_scorer, mean_squared_error

# Custom MSE untuk multioutput
def mse_multioutput(y_true, y_pred):
    return -mean_squared_error(y_true, y_pred)  # tanda minus supaya makin kecil makin baik

mse_scorer = make_scorer(mse_multioutput)

try:
    print("X_train_final:", X_train_final.shape)
    print("y_train:", y_train.shape)
except NameError as e:
    print("Variabel belum ada:", e)

# Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train_final, y_train, cv=kf, scoring=mse_scorer)
print("CV MSE:", -cv_scores.mean())

# Train Final Model
model.fit(X_train_final, y_train)
y_pred_train = model.predict(X_train_final)
mse_train = mean_squared_error(y_train, y_pred_train)
print(f"Train MSE: {mse_train:.4f}")

X_train_final: (9909, 5393)
y_train: (9909, 4)
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.470411 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362486
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5388
[LightGBM] [Info] Start training from score 6.650435
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.685643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362486
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5388
[LightGBM] [Info] Start training from score 6.701590
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.093041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362486
[LightGBM] [Info] Number of data points in the train set: 7927, n



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.609829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362205
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5389
[LightGBM] [Info] Start training from score 6.650309
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.600457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362205
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5389
[LightGBM] [Info] Start training from score 6.707456
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.601841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362205
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5389
[LightGBM] [Info] 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.607679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362224
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5390
[LightGBM] [Info] Start training from score 6.658950
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.097991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362224
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5390
[LightGBM] [Info] Start training from score 6.713006
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.647589 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362224
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5390
[LightGBM] [Info] 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.581213 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362061
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5389
[LightGBM] [Info] Start training from score 6.659581
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.728825 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362061
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5389
[LightGBM] [Info] Start training from score 6.717926
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.768299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362061
[LightGBM] [Info] Number of data points in the train set: 7927, number of used features: 5389
[LightGBM] [Info] 



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.633953 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362334
[LightGBM] [Info] Number of data points in the train set: 7928, number of used features: 5390
[LightGBM] [Info] Start training from score 6.658552
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.766063 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362334
[LightGBM] [Info] Number of data points in the train set: 7928, number of used features: 5390
[LightGBM] [Info] Start training from score 6.711087
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.645133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 362334
[LightGBM] [Info] Number of data points in the train set: 7928, number of used features: 5390
[LightGBM] [Info] 



CV MSE: 1.1947969784286276
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.834344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 410079
[LightGBM] [Info] Number of data points in the train set: 9909, number of used features: 5393
[LightGBM] [Info] Start training from score 6.655566
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.787620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 410079
[LightGBM] [Info] Number of data points in the train set: 9909, number of used features: 5393
[LightGBM] [Info] Start training from score 6.710213
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.802246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 410079
[LightGBM] [Info] Number of data points in the train set: 9909, number of used featur



Train MSE: 0.1297


In [27]:
y_test_pred = model.predict(X_test_final)
submission = pd.DataFrame(y_test_pred, columns=score_cols)
submission.index = submission.index + 1
submission.reset_index(inplace=True)
submission.rename(columns={"index": "ID"}, inplace=True)
submission.to_csv("submission_lgbm_advanced.csv", index=False)
print("Saved: submission_lgbm_advanced.csv")



Saved: submission_lgbm_advanced.csv
