In [None]:
!pip install catboost
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import json
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import StackingRegressor, VotingRegressor
import warnings
warnings.filterwarnings('ignore')




In [None]:
# 1. Load the raw inputs: the submission log and the three per-student score tables
annon, th, qt, ck = (
    pd.read_csv(csv_path)
    for csv_path in ("annonimized.csv", "th-public.csv", "qt-public.csv", "ck-public.csv")
)


In [None]:
# The submission log's headers are the SQL expressions it was exported with;
# map them to plain column names.
rename_map = {
    "concat('it001',`assignment_id`)": "assignment_id",
    "concat('it001',`problem_id`)": "problem_id",
    "concat('it001', username)": "username",
    "concat('it001',`language_id`)": "language_id"
}
annon = annon.rename(columns=rename_map)

# Give every score table the same join key ("username") and a distinct
# *_score column so the tables can be merged later without clashes.
th = th.rename(columns={"hash": "username", "TH": "th_score"})
qt = qt.rename(columns={"hash": "username", "diemqt": "qt_score"})
ck = ck.rename(columns={"hash": "username", "CK": "ck_score"})

# Timestamps are parsed as "day-month hour:minute:second" (the format has no
# year field); values that do not match the format become NaT.
for ts_col in ('created_at', 'updated_at'):
    annon[ts_col] = pd.to_datetime(annon[ts_col], format="%d-%m %H:%M:%S", errors='coerce')


In [None]:
# 2. ENHANCED FEATURE ENGINEERING

# Basic cleanup: cast the final-submission flag to int and derive the
# coefficient-weighted score (pre_score * coefficient / 100).
annon = annon.assign(
    is_final=annon["is_final"].astype(int),
    score=annon["pre_score"] * annon["coefficient"] / 100.0,
)

# Parse judgment column to extract richer information
def parse_judgement_enhanced(j):
    """Extract per-verdict counts from one raw ``judgement`` cell.

    Parameters
    ----------
    j : str or missing value
        JSON text of a judgement record. Doubled quotes (``""``) and
        backslash-escaped quotes are collapsed to plain quotes before parsing.

    Returns
    -------
    pandas.Series
        Index ``wrong``, ``time_limit``, ``runtime``, ``memory``. Each entry is
        the value stored under ``verdicts`` for ``WRONG``, ``TIME_LIMIT``,
        ``RUNTIME`` and ``MEMORY_LIMIT`` respectively, or 0 when that key is
        absent. All four entries are NaN when ``j`` is missing, is not a
        string, is not valid JSON, or does not have the expected object shape.
    """
    # Output column -> key inside the "verdicts" object.
    verdict_keys = {
        'wrong': 'WRONG',
        'time_limit': 'TIME_LIMIT',
        'runtime': 'RUNTIME',
        'memory': 'MEMORY_LIMIT',
    }
    unparsed = pd.Series({column: np.nan for column in verdict_keys})

    if pd.isna(j):
        return unparsed
    try:
        d = json.loads(j.replace('""', '"').replace('\\"', '"'))
        verdicts = d.get("verdicts", {})
        return pd.Series({column: verdicts.get(key, 0)
                          for column, key in verdict_keys.items()})
    except (ValueError, TypeError, AttributeError):
        # ValueError covers json.JSONDecodeError; TypeError/AttributeError cover
        # non-string cells and JSON whose top level or "verdicts" entry is not
        # an object. Anything else (e.g. KeyboardInterrupt) is deliberately
        # allowed to propagate instead of being swallowed.
        return unparsed

# One row of verdict counts per submission, appended column-wise to the log.
judgment_data = annon["judgement"].apply(parse_judgement_enhanced)
annon = pd.concat((annon, judgment_data), axis="columns")


In [None]:
# Extract time features
annon['hour'] = annon['created_at'].dt.hour
annon['minute'] = annon['created_at'].dt.minute

# Day-part flags use half-open hour ranges [start, end) so that every hour
# belongs to exactly one bucket:
#   night 22-05, morning 05-12, afternoon 12-17, evening 17-22.
# (Night previously used `<= 5`, which double-counted hour 5 as both night and
# morning.) Rows whose timestamp failed to parse have a NaN hour; every
# comparison is then False, so all four flags are 0 for those rows.
annon['is_night'] = ((annon['hour'] >= 22) | (annon['hour'] < 5)).astype(int)
annon['is_evening'] = ((annon['hour'] >= 17) & (annon['hour'] < 22)).astype(int)
annon['is_morning'] = ((annon['hour'] >= 5) & (annon['hour'] < 12)).astype(int)
annon['is_afternoon'] = ((annon['hour'] >= 12) & (annon['hour'] < 17)).astype(int)

# Minutes elapsed since the same user's previous submission. Rows are ordered
# by user and time first; each user's earliest submission has no predecessor
# and therefore gets NaT / NaN.
annon = annon.sort_values(by=['username', 'created_at'])
annon['prev_submit_time'] = annon.groupby('username')['created_at'].shift()
submit_gap = annon['created_at'] - annon['prev_submit_time']
annon['time_since_prev'] = submit_gap.dt.total_seconds() / 60  # in minutes

# Per-problem pre_score statistics, used as a difficulty proxy and attached to
# every submission of that problem.
problem_difficulty = (
    annon.groupby('problem_id')['pre_score']
    .agg(problem_mean_score='mean',
         problem_median_score='median',
         problem_score_std='std')
    .reset_index()
)
annon = annon.merge(problem_difficulty, on='problem_id', how='left')

# Per-user view of the enriched log; the feature table is built from this.
grouped = annon.groupby("username")


In [None]:
# Build one feature row per username from the per-user groups in `grouped`.
# Every assignment below aligns on the group key (username), which is the
# index of `features` until the final reset_index().
features = pd.DataFrame()
# Basic submission stats
features["total_submissions"] = grouped.size()
features["final_submissions"] = grouped["is_final"].sum()
# replace(0, 1) on a denominator guards against division by zero
features["submission_ratio"] = features["final_submissions"] / features["total_submissions"].replace(0, 1)
features["unique_assignments"] = grouped["assignment_id"].nunique()
features["unique_problems"] = grouped["problem_id"].nunique()
features["assignments_per_problem"] = features["unique_assignments"] / features["unique_problems"].replace(0, 1)

# Score-related features
features["avg_score"] = grouped["score"].mean()
features["median_score"] = grouped["score"].median()
# The *_final statistics only look at rows flagged is_final == 1; a user with
# no such rows gets NaN here.
features["avg_score_final"] = grouped.apply(lambda x: x[x['is_final']==1]['score'].mean())
features["max_score_final"] = grouped.apply(lambda x: x[x['is_final']==1]['score'].max())
features["min_score_final"] = grouped.apply(lambda x: x[x['is_final']==1]['score'].min())
features["score_std"] = grouped["score"].std()
features["score_q25"] = grouped["score"].quantile(0.25)
features["score_q75"] = grouped["score"].quantile(0.75)
features["score_iqr"] = features["score_q75"] - features["score_q25"]

# Time and pattern features
# time_since_prev is in minutes; the *_submission_ratio columns are the mean
# of 0/1 flags, i.e. the share of the user's submissions in that day part.
features["avg_time_between_submissions"] = grouped["time_since_prev"].mean()
features["median_time_between_submissions"] = grouped["time_since_prev"].median()
features["night_submission_ratio"] = grouped["is_night"].mean()
features["evening_submission_ratio"] = grouped["is_evening"].mean()
features["morning_submission_ratio"] = grouped["is_morning"].mean()
features["afternoon_submission_ratio"] = grouped["is_afternoon"].mean()

# Error patterns
# Despite the "_ratio" suffix these are per-submission means of the parsed
# verdict values, not values normalised by a test count.
features["wrong_test_ratio"] = grouped["wrong"].mean()
features["time_limit_ratio"] = grouped["time_limit"].mean()
features["runtime_error_ratio"] = grouped["runtime"].mean()
features["memory_limit_ratio"] = grouped["memory"].mean()

# Efficiency metrics
features["coefficient_mean"] = grouped["coefficient"].mean()
features["coefficient_min"] = grouped["coefficient"].min()
# Share of the user's submissions that are both final and have pre_score > 0
features["success_rate"] = grouped.apply(lambda x: np.mean((x['is_final'] == 1) & (x['pre_score'] > 0)))
# Average number of submissions per distinct problem attempted
features["resubmission_rate"] = grouped.apply(lambda x: len(x) / x['problem_id'].nunique() if x['problem_id'].nunique() > 0 else 0)

# Relative performance metrics
# Score divided by the problem's mean pre_score (a zero mean is replaced by 1
# to avoid dividing by zero).
# NOTE(review): the numerator is the coefficient-weighted `score` while the
# denominator is built from raw `pre_score` — confirm the mixed scale is intended.
features["avg_rel_to_problem_mean"] = grouped.apply(lambda x: np.mean(x['score'] / x['problem_mean_score'].replace(0, 1)))
# NOTE(review): np.std defaults to ddof=0 whereas score_std above uses the
# pandas default (ddof=1) — confirm the difference is intended.
features["performance_consistency"] = grouped.apply(lambda x: np.std(x['score'] / x['problem_mean_score'].replace(0, 1)))

# Reset index to make username a column
features = features.reset_index()


In [None]:
# Combine the three score tables on username. Outer joins keep a student who
# appears in any one of the tables.
scores = (
    th.merge(qt, on="username", how="outer")
      .merge(ck, on="username", how="outer")
)

# Force the score columns to numbers; non-numeric entries become NaN.
for score_col in ("th_score", "qt_score", "ck_score"):
    scores[score_col] = pd.to_numeric(scores[score_col], errors='coerce')

# TBTL = 0.3 * qt + 0.2 * th + 0.5 * ck. A missing component makes the whole
# TBTL NaN for that student.
scores["tbtl"] = (
    scores["qt_score"] * 0.3
    + scores["th_score"] * 0.2
    + scores["ck_score"] * 0.5
)

# Attach scores to the feature table; users without scores keep NaN targets.
all_data = features.merge(scores, on="username", how="left")


In [None]:

# 3. IMPROVED MODEL TRAINING
# Training rows are the users whose target (tbtl) is known.
train_data = all_data.dropna(subset=["tbtl"])
X = train_data.drop(columns=["username", "th_score", "qt_score", "ck_score", "tbtl"])
y = train_data["tbtl"]

# Handle missing values: fill each feature with its column mean over the
# training rows.
X = X.fillna(X.mean())

# Scale features
# NOTE(review): imputation means and scaler statistics are computed on all
# labelled rows before any split, so the cross-validation scores below are
# slightly optimistic. Moving both steps into a per-fold pipeline would
# remove that leakage.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
# Keep X's row labels. `y` carries train_data's index (which has gaps after
# dropna); without index=X.index the scaled frame would get a fresh 0..n-1
# index and any label-based alignment between X_scaled and y would silently
# pair the wrong rows.
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Advanced cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [None]:


# 4. ENSEMBLE MODEL APPROACH

# Hyper-parameters for the three gradient-boosting base learners.
lgb_params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    num_leaves=50,
    learning_rate=0.03,
    feature_fraction=0.8,
    bagging_fraction=0.7,
    bagging_freq=5,
    min_child_samples=30,
    max_depth=15,
    reg_alpha=0.1,
    reg_lambda=0.1,
    verbose=-1,
)

xgb_params = dict(
    objective='reg:squarederror',
    learning_rate=0.025,
    max_depth=6,
    min_child_weight=2,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    verbosity=0,
)

cb_params = dict(
    loss_function='RMSE',
    iterations=1000,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=3,
    random_seed=42,
    verbose=False,
)

# Base models reused across folds (each fit() call retrains from scratch).
lgb_model = lgb.LGBMRegressor(**lgb_params)
xgb_model = xgb.XGBRegressor(**xgb_params)
cb_model = cb.CatBoostRegressor(**cb_params)
base_models = (lgb_model, xgb_model, cb_model)

# K-fold evaluation of the equal-weight average of the three models.
r2_scores = []

for train_idx, val_idx in kf.split(X_scaled):
    # Positional slices for this fold
    X_train_fold, y_train_fold = X_scaled.iloc[train_idx], y.iloc[train_idx]
    X_val_fold, y_val_fold = X_scaled.iloc[val_idx], y.iloc[val_idx]

    # Fit every base model on the fold's training part ...
    for model in base_models:
        model.fit(X_train_fold, y_train_fold)

    # ... then predict the held-out part with each of them
    lgb_preds, xgb_preds, cb_preds = (model.predict(X_val_fold) for model in base_models)

    # Simple averaging ensemble
    ensemble_preds = (lgb_preds + xgb_preds + cb_preds) / 3

    # Score the averaged prediction on the held-out part
    r2 = r2_score(y_val_fold, ensemble_preds)
    r2_scores.append(r2)

    print(f"Fold R²: {r2:.4f}")

print(f"Average CV R²: {np.mean(r2_scores):.4f}")

# Unfitted equal-weight ensemble built from fresh estimators; it is trained on
# all labelled rows in a later cell.
final_ensemble = VotingRegressor(
    estimators=[
        ('lgb', lgb.LGBMRegressor(**lgb_params)),
        ('xgb', xgb.XGBRegressor(**xgb_params)),
        ('cb', cb.CatBoostRegressor(**cb_params)),
    ],
    weights=[1, 1, 1],
)


Fold R²: 0.3423
Fold R²: 0.3668
Fold R²: 0.1638
Fold R²: 0.3650
Fold R²: 0.2244
Average CV R²: 0.2924


In [None]:
# Train on full training dataset
# Fits the VotingRegressor on every labelled row (no hold-out), using the
# scaled feature frame and target prepared in the training-setup cell.
final_ensemble.fit(X_scaled, y)


In [None]:
# Feature importance analysis: fit a standalone LightGBM model on all labelled
# rows and rank the features by its importance values.
lgb_final = lgb.LGBMRegressor(**lgb_params).fit(X_scaled, y)
importance = lgb_final.feature_importances_
feature_names = X.columns
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    .sort_values('Importance', ascending=False)
)
print("Top 15 most important features:")
print(importance_df.head(15))


Top 15 most important features:
                            Feature  Importance
21                 wrong_test_ratio         100
20       afternoon_submission_ratio          96
17           night_submission_ratio          92
15     avg_time_between_submissions          84
0                 total_submissions          82
5           assignments_per_problem          73
16  median_time_between_submissions          72
29          avg_rel_to_problem_mean          62
30          performance_consistency          61
11                        score_std          60
7                      median_score          53
8                   avg_score_final          52
19         morning_submission_ratio          45
4                   unique_problems          45
6                         avg_score          44


In [None]:



# 5. GENERATE PREDICTIONS
# Select exactly the columns the model was trained on, in training order.
# Selecting by X.columns (rather than dropping a fixed list) keeps this cell
# re-runnable: the 'predicted_tbtl' column added below would otherwise leak
# into X_all on a second run and make scaler.transform fail.
X_all = all_data[X.columns]

# Impute with the same statistics used for training. X was mean-imputed in
# the training cell, so X.mean() still equals the training-row column means.
# (Means over all rows, as used before, differ from what the model saw.)
train_means = X.mean()
X_all = X_all.fillna(train_means)

# Scale with the fitted scaler and keep the feature names, matching the
# DataFrame input the ensemble was fitted on.
X_all_scaled = pd.DataFrame(
    scaler.transform(X_all), columns=X_all.columns, index=X_all.index
)

# Generate predictions and clip them to the valid grade range [0, 10]
all_data['predicted_tbtl'] = np.clip(final_ensemble.predict(X_all_scaled), 0, 10)

# Format the result file: one row per user, columns 'hash' and 'TBTL'
result = all_data[['username', 'predicted_tbtl']].rename(
    columns={'username': 'hash', 'predicted_tbtl': 'TBTL'}
)
result.to_csv('result_2.csv', index=False, float_format='%.2f')

print(f"Generated predictions for {len(result)} students")

Generated predictions for 1489 students


In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): this runs after every other visible cell, and the CSV paths
# used earlier are relative to the working directory — confirm whether this
# mount is still needed, and move it to the top of the notebook if the data
# is expected to live on Drive.
from google.colab import drive
drive.mount('/content/drive')