In [2]:
def sparse_to_dense(x):
    """Return a dense version of ``x``, densifying only when it is sparse.

    Intended as the function behind a ``FunctionTransformer`` step that sits
    between a feature union and an estimator that needs dense input.

    Parameters
    ----------
    x : scipy sparse matrix/array or array-like
        Feature matrix produced by the previous pipeline step.

    Returns
    -------
    The result of ``x.toarray()`` when ``x`` exposes that method, otherwise
    ``x`` itself, unchanged.
    """
    # A ColumnTransformer may emit a dense ndarray instead of a sparse matrix
    # (it depends on the stacked density vs. its ``sparse_threshold``), and an
    # ndarray has no ``toarray``. Duck-typing keeps this cell import-free.
    if hasattr(x, "toarray"):
        return x.toarray()
    return x

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
import re
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, VotingClassifier,RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,f1_score,precision_recall_curve,mean_absolute_error, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from scipy.sparse import hstack
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# LOAD: read the JSON-lines problem set and build one combined text column.
url = "https://raw.githubusercontent.com/AREEG94FAHAD/TaskComplexityEval-24/refs/heads/main/problems_data.jsonl"
df = pd.read_json(url, lines=True).drop(
    columns=["sample_io", "url", "title"],
    errors="ignore",  # tolerate a dataset revision that lacks any of these columns
)
# Exploratory checks used during development: df.sample(5), df.isnull().sum(),
# and a bar plot of df['problem_class'].value_counts().

# The two prediction targets; every other remaining column is treated as text.
targets = ["problem_score", "problem_class"]
text_cols = df.columns[~df.columns.isin(targets)].tolist()

# Missing values become empty strings so they join cleanly; the source text
# columns are kept in `df` alongside the new `full_text` column.
text_frame = df[text_cols].fillna("").astype(str)
df["full_text"] = text_frame.agg(" ".join, axis=1)

#FEATUREENGINEERING
# Algorithm / data-structure terms whose presence in a problem statement is
# turned into one binary feature each. Order matters: it fixes the column order
# of the keyword block returned by `numeric_features`.
KEYWORDS = [
    "dp", "greedy", "binary search", "two pointers", "sliding window",
    "recursion", "backtracking", "divide and conquer", "bitmask",
    "array", "string", "stack", "queue", "heap", "priority queue",
    "hashmap", "set", "tree", "binary tree", "bst", "segment tree",
    "fenwick", "trie", "graph", "dag", "linked list", "disjoint set", "union find",
    "bfs", "dfs", "shortest path", "dijkstra", "bellman ford", "floyd",
    "mst", "kruskal", "prim", "topological", "cycle", "bipartite",
    "modulo", "gcd", "lcm", "prime", "sieve", "combinatorics", "probability",
    "matrix", "prefix sum", "xor", "bitwise",
    "substring", "subsequence", "palindrome", "z algorithm", "kmp", "hashing",
    "simulation", "implementation", "geometry", "game theory",
]


def numeric_features(X):
    """Build hand-crafted numeric features from a column of text.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, numpy.ndarray or sequence of str
        Text to featurize. For a DataFrame or a 2-D ndarray only the first
        column is used; anything else is wrapped in a Series as-is.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 2 + len(KEYWORDS))
        Column 0 is the character length of the text, column 1 the number of
        characters matching ``[=<>+\\-*/%^]``, and each remaining column is a
        0/1 flag telling whether the corresponding entry of ``KEYWORDS``
        occurs as a whole word/phrase (case-insensitive).
    """
    # Always reduce the input to a 1-D Series of text.
    if isinstance(X, pd.DataFrame):
        text = X.iloc[:, 0]
    elif isinstance(X, pd.Series):
        text = X
    elif isinstance(X, np.ndarray) and X.ndim > 1:
        # pd.Series rejects 2-D arrays; mirror the DataFrame branch instead.
        text = pd.Series(X[:, 0])
    else:
        text = pd.Series(X)

    text = text.fillna("").astype(str)

    length = text.str.len().to_numpy().reshape(-1, 1)
    math_symbols = text.str.count(r"[=<>+\-*/%^]").to_numpy().reshape(-1, 1)

    # re.escape keeps a keyword literal even if it ever contains regex
    # metacharacters (e.g. "c++"); the current list matches as before.
    keyword_flags = [
        text.str.contains(rf"\b{re.escape(k)}\b", case=False, regex=True).to_numpy(dtype=int)
        for k in KEYWORDS
    ]
    if keyword_flags:
        keyword_counts = np.column_stack(keyword_flags)
    else:
        # np.column_stack cannot stack an empty list.
        keyword_counts = np.empty((len(text), 0), dtype=int)

    return np.hstack([length, math_symbols, keyword_counts])

# Wrap the hand-crafted extractor so it can be used as a pipeline step;
# validate=False hands the raw text column straight to the function.
num_feat = FunctionTransformer(numeric_features, validate=False)

# Branch 1: TF-IDF over unigrams and bigrams of the combined text.
tfidf_branch = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
)

# Branch 2: hand-crafted numeric features, then standardized.
numeric_branch = Pipeline(steps=[
    ("extract", num_feat),
    ("scale", StandardScaler()),
])

# Both branches read the same "full_text" column. This object is a template:
# the model pipelines below each take their own clone of it.
features = ColumnTransformer(transformers=[
    ("tfidf", tfidf_branch, "full_text"),
    ("numeric", numeric_branch, "full_text"),
])

# Classification task: predict the difficulty class from the combined text.
X = df[["full_text"]]
y = df["problem_class"]

# 80/20 split, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
########## 2-PASS MODEL: stage 1 separates "hard" from not-hard; stage 2 then
########## labels the not-hard problems as "medium" or "easy".

# Binary stage-1 target: 1 for "hard", 0 for everything else.
y_train_s1 = (y_train == "hard").astype(int)

stage1_forest = RandomForestClassifier(
    n_estimators=500,
    max_depth=18,
    min_samples_leaf=4,
    min_samples_split=10,
    class_weight={0: 1.0, 1: 1.32},
    n_jobs=-1,
    random_state=42,
)
stage1 = Pipeline(steps=[
    ("features", clone(features)),
    ("clf", stage1_forest),
])

# Out-of-fold P(hard) on the training split, so the threshold is chosen
# without looking at the test split.
cv_probs = cross_val_predict(
    stage1,
    X_train,
    y_train_s1,
    cv=5,
    method="predict_proba",
    n_jobs=-1,
)[:, 1]

# Scan 100 thresholds in [0.3, 0.7] and keep the one with the highest
# out-of-fold accuracy; argmax returns the first maximum, so ties resolve to
# the lowest threshold.
candidate_thresholds = np.linspace(0.3, 0.7, 100)
cv_accuracies = [
    accuracy_score(y_train_s1, (cv_probs >= threshold).astype(int))
    for threshold in candidate_thresholds
]
best_idx = int(np.argmax(cv_accuracies))
best_acc = cv_accuracies[best_idx]
best_threshold = candidate_thresholds[best_idx]

HARD_T = best_threshold
print(f"Frozen HARD threshold (CV): {HARD_T:.4f}")

# Refit on the full training split, then flag test rows as "hard" using the
# frozen threshold.
stage1.fit(X_train, y_train_s1)
hard_proba = stage1.predict_proba(X_test)[:, 1]
hard_pred = hard_proba >= HARD_T
# Stage 2 trains only on the truly not-hard training rows and predicts only
# for the test rows that stage 1 did NOT flag as hard.
mask_train_s2 = y_train != "hard"
mask_test_s2 = ~hard_pred

X_train_s2 = X_train.loc[mask_train_s2]
y_train_s2 = y_train.loc[mask_train_s2]

X_test_s2 = X_test.loc[mask_test_s2]

stage2 = Pipeline([
    ("features", clone(features)),
    ("clf", LinearSVC(
        C=0.73,
        class_weight="balanced",
        max_iter=10000,
        # Seeded so the fitted (and later pickled) model is reproducible: the
        # dual coordinate-descent solver shuffles the data.
        random_state=42,
    ))
])

stage2.fit(X_train_s2, y_train_s2)

# If stage 1 routed every test row to "hard" there is nothing left to predict;
# calling predict on an empty frame would raise instead.
if len(X_test_s2) > 0:
    stage2_pred = stage2.predict(X_test_s2)
else:
    stage2_pred = np.empty(0, dtype=object)

# Start from "hard" everywhere, then overwrite the rows handled by stage 2.
final_pred = np.array(["hard"] * len(X_test), dtype=object)
final_pred[mask_test_s2] = stage2_pred

print("Accuracy:", accuracy_score(y_test, final_pred))
print(confusion_matrix(y_test, final_pred))
print(classification_report(y_test, final_pred))
from sklearn.ensemble import HistGradientBoostingRegressor

# Regression task: predict the numeric difficulty score from the same text.
# NOTE: this rebinds X, y and the train/test variables used by the
# classification code above.
X = df[["full_text"]]
y = df["problem_score"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Converts the feature matrix to a dense array before the regressor;
# accept_sparse=True lets the sparse matrix reach sparse_to_dense.
densify_step = FunctionTransformer(sparse_to_dense, accept_sparse=True)

# NOTE(review): per the scikit-learn docs `scoring` is only consulted when
# early stopping is active - confirm whether it has any effect here.
score_regressor = HistGradientBoostingRegressor(
    max_depth=6,
    learning_rate=0.05,
    max_iter=300,
    random_state=42,
    scoring="neg_mean_squared_error",
)

reg_pipe = Pipeline(steps=[
    ("features", clone(features)),
    ("to_dense", densify_step),
    ("reg", score_regressor),
])

reg_pipe.fit(X_train, y_train)

y_pred = reg_pipe.predict(X_test)

# Hold-out error metrics; RMSE is derived from MSE by hand rather than via
# mean_squared_error(..., squared=False).
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Frozen HARD threshold (CV): 0.5424
Accuracy: 0.5151883353584447
[[ 90  14  49]
 [ 52 209 128]
 [ 62  94 125]]
              precision    recall  f1-score   support

        easy       0.44      0.59      0.50       153
        hard       0.66      0.54      0.59       389
      medium       0.41      0.44      0.43       281

    accuracy                           0.52       823
   macro avg       0.50      0.52      0.51       823
weighted avg       0.53      0.52      0.52       823

Mean Absolute Error (MAE): 1.6684
Root Mean Squared Error (RMSE): 1.9961


In [6]:
import joblib

# Persist everything needed to reproduce predictions: both classifier stages,
# the frozen stage-1 threshold, and the score regressor.
# NOTE(review): the pipelines reference the notebook-defined functions
# numeric_features and sparse_to_dense; pickle stores functions by qualified
# name, so whatever loads these files must define them under the same module
# name - verify in the consumer.
artifacts = {
    "stage1_hard_classifier.pkl": stage1,
    "hard_threshold.pkl": HARD_T,
    "stage2_easy_medium.pkl": stage2,
    "score_regressor.pkl": reg_pipe,
}
for filename, obj in artifacts.items():
    written = joblib.dump(obj, filename)

# Last expression of the cell: echoes the path list returned by the final dump.
written

['score_regressor.pkl']