In [1]:
import polars as pl
import pandas as pd
import numpy as np
import warnings
import pandas as pd
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings raised by the modelling libraries imported below - confirm that
# suppressing all of them is intended.
warnings.filterwarnings("ignore")
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.preprocessing import SplineTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression

from preprocessing import count_by_values, dev_feats, reconstruct_essay, get_essay_df, word_feats, sent_feats 
from preprocessing import parag_feats, product_to_keys, get_keys_pressed_per_second, transform_and_clean_data, clean_column_names

from model_pipeline import load_and_prepare_data, standardize_and_clean_data, feature_selection_and_scaling, train_and_predict, ensemble_predictions

In [2]:
# Build the training feature table from the raw keystroke logs and discard
# the 'id' column in the same expression (no in-place mutation of X).
# NOTE(review): load_and_prepare_data comes from model_pipeline; presumably it
# returns a pandas DataFrame with an 'id' column - verify against that module.
X = load_and_prepare_data("train_logs.csv").drop(columns='id')

FileNotFoundError: No such file or directory (os error 2): train_logs.csv

In [3]:
# Build the test feature table with the same loader. Unlike the training
# frame, 'id' is moved into the index rather than dropped, so the ids stay
# available later when the submission file is assembled.
X_test = load_and_prepare_data("test_logs.csv").set_index('id')

< Count by values features >
< Input words stats features >
< Numerical columns features >
< Categorical columns features >
< Idle time features >
< P-bursts features >
< R-bursts features >


In [4]:
# Target values: one essay score per id, read with 'id' as the index.
train_scores = pd.read_csv('train_scores.csv', index_col='id')

# NOTE(review): y is indexed by id, whereas X had its 'id' column dropped when
# it was loaded, so rows of X and y are paired purely by position when the
# models are fitted - confirm both are in the same id order.
y = train_scores['score']


In [5]:
# Run the train and test feature tables through the shared cleaning step and
# keep both transformed frames.
# NOTE(review): standardize_and_clean_data is imported from model_pipeline and
# its exact behaviour is not visible in this notebook - see that module.
X_tran, X_test_tran = standardize_and_clean_data(X, X_test)

In [6]:
# Univariate feature selection: score every column against the target with
# f_regression and keep the top-scoring ones.
# SelectKBest and f_regression are already imported in the notebook's first
# cell, so they are not re-imported here.
N_SELECTED_FEATURES = 140  # upper bound on the number of features to keep

# Clamp k to the number of available columns: depending on the scikit-learn
# version, k > n_features either raises a ValueError or only emits a warning
# (which this notebook suppresses), so make the fallback explicit.
selector = SelectKBest(
    score_func=f_regression,
    k=min(N_SELECTED_FEATURES, X_tran.shape[1]),
)
selector.fit(X_tran, y)

# Compute the boolean support mask once and apply the identical column subset
# to both the training and the test frame.
selected_mask = selector.get_support()
X_tran_sel = X_tran.iloc[:, selected_mask]
X_test_tran_sel = X_test_tran.iloc[:, selected_mask]

In [7]:
# Hyperparameters for every candidate model, keyed by model name.
# 'xgb2' and 'lgbm2' have parameter sets here but are currently not
# instantiated in `models` below.
# NOTE(review): the LightGBM documentation states that bagging ('subsample')
# only takes effect when 'subsample_freq' is non-zero; no frequency is set in
# the lgbm parameter sets - confirm whether row subsampling is meant to be on.
model_params = dict(
    catboost=dict(loss_function='RMSE', eval_metric='RMSE', verbose=False, random_state=42),
    xgb1=dict(
        n_estimators=250, learning_rate=0.0346, max_depth=3, min_child_weight=18,
        subsample=0.7, colsample_bytree=0.5, gamma=0.0, reg_alpha=1.0, reg_lambda=1.0,
        eval_metric='rmse', random_state=0,
    ),
    xgb2=dict(
        n_estimators=300, learning_rate=0.0226, max_depth=4, min_child_weight=9,
        subsample=0.3, colsample_bytree=0.7, gamma=0.8, reg_alpha=0.3, reg_lambda=0.6,
        eval_metric='rmse', random_state=0,
    ),
    lgbm1=dict(
        n_estimators=600, learning_rate=0.018, max_depth=13, min_child_samples=18,
        subsample=0.8, colsample_bytree=0.2, min_split_gain=0.0, reg_alpha=0.5,
        reg_lambda=0.8, num_leaves=20, metric='rmse', random_state=0,
    ),
    lgbm2=dict(
        n_estimators=583, learning_rate=0.029, max_depth=28, min_child_samples=9,
        subsample=0.5, colsample_bytree=0.3, min_split_gain=0.6, reg_alpha=0.7,
        reg_lambda=0.6, num_leaves=25, metric='rmse', random_state=0,
    ),
    hbc1=dict(
        learning_rate=0.063, max_leaf_nodes=8, l2_regularization=0.9,
        min_samples_leaf=100, random_state=0,
    ),
    hbc2=dict(
        learning_rate=0.1, max_leaf_nodes=6, l2_regularization=1.1,
        min_samples_leaf=60, random_state=0,
    ),
)

# Models that take part in the ensemble. Insertion order matters: the blend
# weights defined in a later cell are a positional list.
models = {}
models['catboost'] = CatBoostRegressor(**model_params['catboost'])
models['xgb1'] = XGBRegressor(**model_params['xgb1'])
models['lgbm1'] = lgb.LGBMRegressor(**model_params['lgbm1'])
models['hbc1'] = HistGradientBoostingRegressor(**model_params['hbc1'])
models['hbc2'] = HistGradientBoostingRegressor(**model_params['hbc2'])
# Two spline-expanded ridge regressions with different knot counts/degrees.
models['lr1'] = make_pipeline(SplineTransformer(n_knots=6, degree=2), Ridge(alpha=30, random_state=0))
models['lr2'] = make_pipeline(SplineTransformer(n_knots=8, degree=4), Ridge(alpha=40, random_state=0))

# Fit every model on the selected training features and collect its
# predictions for the test set, keyed by model name.
preds = train_and_predict(models, X_tran_sel, y, X_test_tran_sel)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22966
[LightGBM] [Info] Number of data points in the train set: 2471, number of used features: 137
[LightGBM] [Info] Start training from score 3.711251


In [8]:
# Display the per-model test predictions (a dict keyed by model name).
preds

{'catboost': array([2.61826262, 1.72108024, 1.70920116]),
 'xgb1': array([2.5512607, 1.2050683, 1.2050683], dtype=float32),
 'lgbm1': array([2.22804749, 1.26079379, 1.23883937]),
 'hbc1': array([3.0318293 , 1.55310584, 1.55310584]),
 'hbc2': array([2.86101734, 1.41710087, 1.41710087]),
 'lr1': array([2.01602765, 0.85207613, 0.98065527]),
 'lr2': array([2.18168536, 1.19012165, 1.29655884])}

In [9]:
# Ensemble predictions
# Blend weights are keyed by model name so each weight stays attached to its
# model. A bare positional list would silently pair weights with the wrong
# models as soon as an entry is added to or removed from `preds`.
model_weights = {
    'catboost': 0.2,
    'xgb1': 0.3,
    'lgbm1': 0.2,
    'hbc1': 0.075,
    'hbc2': 0.075,
    'lr1': 0.075,
    'lr2': 0.075,
}

# Fail loudly instead of blending with missing or stale weights.
assert set(model_weights) == set(preds), (
    f"weights/predictions mismatch for: {sorted(set(model_weights) ^ set(preds))}"
)
# The weights above sum to 1; keep that invariant so the blended output stays
# on the same scale as the individual model predictions.
assert abs(sum(model_weights.values()) - 1.0) < 1e-9, "ensemble weights must sum to 1"

# ensemble_predictions expects a positional list, so emit the weights in the
# iteration order of `preds`.
weights = [model_weights[name] for name in preds]
final_prediction = ensemble_predictions(preds, weights)

final_prediction

array([2.49143223, 1.33382567, 1.3446852 ])

In [10]:
# The test frame was indexed by 'id' when it was loaded, so its index holds
# the ids needed for the submission file.
test_ids = X_test.index

In [13]:
# Assemble the submission table (one row per test id with its blended score)
# and write it to disk without the DataFrame's positional index.
sub = pd.DataFrame(
    data={
        'id': test_ids,
        'score': final_prediction,
    }
)
sub.to_csv('submission.csv', index=False)