In [2]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.compose import  ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier

from catboost import CatBoostClassifier

from lightgbm import LGBMClassifier

from xgboost import XGBClassifier

In [2]:
import pandas as pd

train = pd.read_csv("data/driver_pred/train.csv")
test = pd.read_csv("data/driver_pred/test.csv")

target_col = "target"
id_col = "id"

feature_cols = [c for c in train.columns if c not in [id_col, target_col]]

cat_features = [c for c in feature_cols if c.endswith("_cat")]
bin_features = [c for c in feature_cols if c.endswith("_bin")]
num_features = sorted(list(set(feature_cols) - set(cat_features) - set(bin_features)))

train[cat_features] = train[cat_features].astype("category")
test[cat_features]  = test[cat_features].astype("category")


X = train[feature_cols]
y = train[target_col]

catboost_cat_features = [X.columns.get_loc(c) for c in cat_features]



In [3]:
X = df.drop(["id", "target"], axis=1)
y = df["target"].copy()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'df' is not defined

In [198]:
log_params = {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.1, 'max_iter': 1500, 'class_weight': 'balanced', 'random_state': 42}
cat_boost_params = dict(
    iterations=600,
    learning_rate=0.03,
    depth=5,
    l2_leaf_reg=5,
    cat_features = cat_features,
    bootstrap_type="Bayesian",
    bagging_temperature=0.5,
    random_strength=2,
    eval_metric="AUC",
    loss_function="Logloss",
    verbose=0,
    random_state=42, 
)

lgbm_params = {
    "objective": "binary",
    "boosting_type": "gbdt",

    "n_estimators": 1700,
    "learning_rate": 0.01,

    "max_depth": -1,
    "num_leaves": 31,

    "subsample": 0.8,            # bagging
    "subsample_freq": 1,
    "colsample_bytree": 0.8,     # feature sampling

    "reg_alpha": 0.0,
    "reg_lambda": 0.0,

    "min_child_weight": 1e-3,
    "min_child_samples": 20,

    "n_jobs": 6,
    "random_state": 42
}

xgb_params = {
    "n_estimators": 600,
    "learning_rate": 0.05,

    "max_depth": 5,
    "min_child_weight": 20,

    "subsample": 0.8,
    "colsample_bytree": 0.8,

    "gamma": 0.0,
    "lambda": 1.0,
    "alpha": 0.0,

    "eval_metric": "auc",

    "tree_method": "hist",   # оптимально под CPU
    "n_jobs": 6,
    "random_state": 42
}


In [207]:
cat_boost = Pipeline(steps=[
    ('drop', FunctionTransformer(lambda df: df.drop(columns=num_features))),
    ('model', CatBoostClassifier(**cat_boost_params))
])

lgbmc = Pipeline(steps=[
    ('model', LGBMClassifier(**lgbm_params))
])

xgb = Pipeline(steps=[
    # ('drop', FunctionTransformer(lambda df: df.drop(columns=cat_features))),
    ('model', XGBClassifier(**xgb_params))
])


log_reg = Pipeline(steps=[
    ('drop', FunctionTransformer(lambda df: df.drop(columns=cat_features))),
    ('scaller', StandardScaler()),
    ('model', LogisticRegression(**log_params))
])

In [179]:
xgb.fit(X_train, y_train)

0,1,2
,steps,"[('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [170]:
lgbmc.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 12262, number of negative: 321056
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080264 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1362
[LightGBM] [Info] Number of data points in the train set: 333318, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036788 -> initscore=-3.265111
[LightGBM] [Info] Start training from score -3.265111


0,1,2
,steps,"[('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.01
,n_estimators,1700
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [201]:
cat_boost.fit(X_train, y_train)

0,1,2
,steps,"[('model', ...)]"
,transform_input,
,memory,
,verbose,False


In [172]:
log_reg.fit(X_train, y_train)

0,1,2
,steps,"[('drop', ...), ('scaller', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function <la...x789fb7b84d60>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,0.1
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1500


In [202]:
pred_proba_log = log_reg.predict_proba(X_val)[:, 1]
pred_proba_cat = cat_boost.predict_proba(X_val)[:, 1]
prd_proba_lgbmc = lgbmc.predict_proba(X_val)[:, 1]
prd_proba_xgb = xgb.predict_proba(X_val)[:, 1]

roc_auc_log = roc_auc_score(y_val, pred_proba_log)
roc_auc_cat = roc_auc_score(y_val, pred_proba_cat)
roc_auc_lgbms = roc_auc_score(y_val, prd_proba_lgbmc)
roc_auc_xgb = roc_auc_score(y_val, prd_proba_xgb)

print(f"Log reg: {roc_auc_log}")
print(f"Catboost: {roc_auc_cat}")
print(f"LGBMC: {roc_auc_lgbms}")
print(f"XGB: {roc_auc_xgb}")

Log reg: 0.6146527289428295
Catboost: 0.6389774862468341
LGBMC: 0.6403299124103791
XGB: 0.6374786854149082


In [208]:
X_dummies = train.drop(["id", "target"], axis=1)
y = train["target"]

stack = StackingClassifier(
    
    estimators=[("xgb", xgb), ("lgm", lgbmc), ('log_reg', log_reg), ("cat", cat_boost)],
    final_estimator=log_reg,
    stack_method="predict_proba",
    cv=5,
    n_jobs=4,
    passthrough=False
)

stack.fit(X, y)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:ps_ind_02_cat: category, ps_ind_04_cat: category, ps_ind_05_cat: category, ps_car_01_cat: category, ps_car_02_cat: category, ps_car_03_cat: category, ps_car_04_cat: category, ps_car_05_cat: category, ps_car_06_cat: category, ps_car_07_cat: category, ps_car_08_cat: category, ps_car_09_cat: category, ps_car_10_cat: category, ps_car_11_cat: category

In [3]:
!pip install google
!pip install colab
from google.colab import drive

drive.mount('/content/drive')


Collecting colab
  Downloading colab-1.13.5.tar.gz (567 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.7/567.7 kB[0m [31m336.6 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[7 lines of output][0m
  [31m   [0m error in colab setup command: 'install_requires' must be a string or iterable of strings containing valid project/version requirement specifiers; Expected end or semicolon (after version specifier)
  [31m   [0m     pytz>=2011n
  [31m   [0m         ~~~~~~^
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0

ModuleNotFoundError: No module named 'google.colab'