In [1]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit # 使用 RandomizedSearchCV 和 TimeSeriesSplit
from sklearn.metrics import roc_auc_score, make_scorer, f1_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier 
from sklearn.naive_bayes import GaussianNB
import numpy as np
import lightgbm as lgb

In [2]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "03_10_day_window_sliced.parquet"
df_final = pd.read_parquet(checkpoint_file_path)

In [3]:
def get_feature_types(df, target_col="label"):
    numerical_features = []
    categorical_features = []

    feature_columns = []
    for col in df.columns:
            if col == target_col or col == "userId" or pd.api.types.is_datetime64_any_dtype(df[col]):
                continue
            feature_columns.append(col)
        
    for col in feature_columns:
            if pd.api.types.is_numeric_dtype(df[col].dtype):
                numerical_features.append(col)
            else:
                categorical_features.append(col)

    return numerical_features, categorical_features

final_numerical_features, final_categorical_features = get_feature_types(df_final)

print(final_numerical_features, final_categorical_features )





['num_unique_artists', 'count_about', 'count_add_friend', 'count_add_to_playlist', 'count_cancel', 'count_cancellation_confirmation', 'count_downgrade', 'count_error', 'count_help', 'count_home', 'count_logout', 'count_nextsong', 'count_roll_advert', 'count_save_settings', 'count_settings', 'count_submit_downgrade', 'count_submit_upgrade', 'count_thumbs_down', 'count_thumbs_up', 'count_upgrade', 'count_total_sessions', 'user_lifecycle_h', 'ttl_length', 'item_per_session', 'frequency', 'avg_songs_session', 'thumbs_ratio', 'errors_per_session', 'ads_per_session', 'hours_since_last_session', 'active_days', 'active_days_ratio', 'session_length_variance', 'is_new_user', 'hours_since_downgrade', 'num_unique_songs', 'unique_songs_ratio', 'early_actions', 'late_actions', 'within_window_activity_ratio', 'within_window_activity_change', 'early_songs_played', 'late_songs_played', 'song_listening_change', 'recent_actions_last_3d', 'recent_activity_ratio', 'early_avg_items_per_session', 'late_avg_i

In [4]:
final_numerical_features = [
    'num_unique_artists', 'count_about', 'count_add_friend', 'count_add_to_playlist', 
    'count_downgrade', 'count_error', 
    'count_help', 'count_home', 'count_logout', 'count_nextsong', 'count_roll_advert', 
    'count_save_settings', 'count_settings', 'count_submit_downgrade', 
    'count_submit_upgrade', 'count_thumbs_down', 'count_thumbs_up', 'count_upgrade', 
    'count_total_sessions', 'user_lifecycle_h', 'ttl_length', 'item_per_session', 
    'frequency', 'avg_songs_session', 'thumbs_ratio', 'errors_per_session', 
    'ads_per_session', 'hours_since_last_session', 'active_days', 'active_days_ratio', 
    'session_length_variance', 'hours_since_downgrade', 
    'num_unique_songs', 'unique_songs_ratio', 'early_actions', 'late_actions', 
    'within_window_activity_ratio', 'within_window_activity_change', 
    'early_songs_played', 'late_songs_played', 'song_listening_change', 
    'recent_actions_last_3d', 'recent_activity_ratio', 'early_avg_items_per_session', 
    'late_avg_items_per_session', 'session_depth_change'
]

final_categorical_features = [
    'gender', 'last_level','is_new_user'
]



In [5]:
df_final["num_unique_songs"] = df_final["num_unique_songs"].fillna(0)

In [6]:
all_features = final_numerical_features + final_categorical_features
X = df_final[all_features]
y = df_final["label"]
print("Shape of Dataset loaded:", X.shape)

Shape of Dataset loaded: (60434, 49)


In [7]:
from sklearn.feature_selection import SelectFromModel

# Build LASSO selector for meaningful feature selection
lasso_selector = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", SelectFromModel(
        LogisticRegression(
            penalty="l1", solver="liblinear", C=0.1,
            class_weight="balanced", random_state=42
        )
    ))
])

lasso_selector.fit(X[final_numerical_features], y)

# Get Boolean mask from LASSO
selected_mask = lasso_selector.named_steps["lasso"].get_support()

# print out selected feature from LASSO
selected_num_features = []
for feature_name, keep_bool in zip(final_numerical_features, selected_mask):
    if keep_bool == True:
        selected_num_features.append(feature_name)

print("Selected num features:", selected_num_features)
print("Count of selected num features", len(selected_num_features))



Selected num features: ['num_unique_artists', 'count_about', 'count_add_friend', 'count_add_to_playlist', 'count_downgrade', 'count_error', 'count_help', 'count_home', 'count_logout', 'count_roll_advert', 'count_save_settings', 'count_settings', 'count_submit_downgrade', 'count_submit_upgrade', 'count_thumbs_down', 'count_thumbs_up', 'count_upgrade', 'count_total_sessions', 'user_lifecycle_h', 'item_per_session', 'frequency', 'avg_songs_session', 'thumbs_ratio', 'errors_per_session', 'ads_per_session', 'hours_since_last_session', 'active_days', 'active_days_ratio', 'session_length_variance', 'hours_since_downgrade', 'unique_songs_ratio', 'within_window_activity_ratio', 'within_window_activity_change', 'recent_actions_last_3d', 'recent_activity_ratio', 'late_avg_items_per_session', 'session_depth_change']
Count of selected num features 37


In [8]:
# Build final preprocessor with selected features

preprocessor_final = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), selected_num_features),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), final_categorical_features)
    ],
    remainder="drop"
)

In [9]:
# Define Models and Random Search Space

# Base Pipeline
base_pipeline = ImbPipeline([
    ("preprocessor", preprocessor_final),
    ("classifier", LogisticRegression()) 
])

# Logistic Regression model
log_reg = LogisticRegression(
    penalty="l2",
    class_weight="balanced",
    solver="liblinear",
    random_state=42
)

# Random Forest
rf = RandomForestClassifier(
    class_weight="balanced_subsample",
    random_state=42
)

# XGB imbalance weight
neg = (y == 0).sum()
pos = (y == 1).sum()
xgb_scale = neg / pos

xgb = XGBClassifier(
    eval_metric="logloss",
    tree_method="hist",
    random_state=42
)

lgbm = lgb.LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=300,
    learning_rate=0.05,
    class_weight="balanced",
    random_state=42
)

In [10]:
model_candidates = [
    {
        "classifier": [log_reg],
        "classifier__C": np.logspace(-2, 2, 5)
    },
    {
        "classifier": [rf],
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [5, 10, 15]
    },
    {
        "classifier": [xgb],
        "classifier__learning_rate": [0.05, 0.1],
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [3, 5],
        "classifier__scale_pos_weight": [xgb_scale]
    },
    {
        "classifier": [lgbm],
        "classifier__num_leaves": [31, 63, 127],
        "classifier__max_depth": [-1, 5, 10],
        "classifier__min_data_in_leaf": [20, 50, 100],
        "classifier__subsample": [0.7, 0.9, 1.0],
        "classifier__colsample_bytree": [0.7, 0.9, 1.0],
    }
]

In [11]:
# Group-Fold CV + Randomized Search

from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, make_scorer

groups = df_final["snapshot_day"]
cv = GroupKFold(n_splits=4)

random_search = RandomizedSearchCV(
    estimator=base_pipeline,
    param_distributions=model_candidates,
    n_iter=20,
    scoring="roc_auc",
    cv=cv.split(X, y, groups),
    verbose=2,
    n_jobs=-1,
    random_state=42,
    error_score='raise'   #check error
)

random_search.fit(X, y)

best_model = random_search.best_estimator_
best_score = random_search.best_score_

print("Best AUC:", best_score)
print("Best classifier:", best_model["classifier"])
print("Best params:", random_search.best_params_)

Fitting 4 folds for each of 20 candidates, totalling 80 fits


[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5664
[LightGBM] [Info] Number of data points in the train set: 45158, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.065009 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.

[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in th



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=127, classifier__subsample=0.9; total time=  10.3s

[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035047 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=127, classifier__subsample=0.9; total time=  12.1s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=127, classifier__subsample=0.9; total time=  12.2s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=127, classifier__subsample=0.9; total time=  12.6s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013932 seconds.
You can set `force







[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=0.9; total time=  22.0s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=0.9; total time=  22.2s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=0.9; total time=  22.1s
[CV] END classifier=LGBMClassifier(class_weight='



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=0.9; total time=  23.6s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051915 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=0.9; total time=  23.6s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=0.9; total time=  23.9s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing col-wise multi-threading,




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=0.9; total time=  28.2s

[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5904
[LightGBM] [Info] Number of data points in the train set: 46258, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=100, classifier__num_leaves=31, classifier__subsample=1.0; total time=   7.3s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=10, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=1.0; total time=  20.9s













[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=10, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=1.0; total time=  21.4s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=100, classifier__num_leaves=31, classifier__subsample=1.0; total time=   8.1s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=10, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=1.0; total time=  21.6s
[LightGBM] [Info] Number of positive: 2273, numb



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=100, classifier__num_leaves=31, classifier__subsample=1.0; total time=  10.2s






[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=10, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=1.0; total time=  25.5s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030368 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-thread





[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006965 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5904
[LightGBM] [Info] Number of data points in the train set: 46258, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=5, classifier__min_data_in_leaf=50, classifier__num_leaves=31, classifier__subsample=0.9; total time=  11.7s




[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=5, classifier__min_data_in_leaf=50, classifier__num_leaves=31, classifier__subsample=0.9; total time=  10.7s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=5, classifier__min_data_in_leaf=50, classifier__num_leaves=31, classifier__subsample=0.9; total time=  10.5s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=5, classifier__min_data_in_leaf=50, classifier__num_leaves=31, classifier__subsample=0.9; total time=  10.6s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018410 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=63, classifier__subsample=0.7; total time=  20.6s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=63, classifier__subsample=0.7; total time=  21.9s








[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=63, classifier__subsample=0.7; total time=  21.3s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=63, classifier__subsample=0.7; total time=  22.4s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=127, classifier__subsample=0.9; total time=  22.0s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017029 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=127, classifier__subsample=0.9; total time=  22.5s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004942 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=127, classifier__subsample=0.9; total time=  23.4s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=127, classifier__subsample=0.9; total time=  24.4s
[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006304 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5664
[LightGBM] [Info] Number of data points in the train set: 45158, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=1.0; total time=  16.7s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.051390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=1.0; total time=  16.9s




[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008217 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=1.0; total time=  16.9s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.9, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=63, classifier__subsample=1.0; total time=  16.7s
[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004507 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5664
[LightGBM] [Info] Number of data points in the train set: 45158, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, 



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=-1, classifier__min_data_in_leaf=20, classifier__num_leaves=63, classifier__subsample=1.0; total time=  14.0s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009866 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=-1, classifier__min_data_in_leaf=20, classifier__num_leaves=63, classifier__subsample=1.0; total time=  13.9s




[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=-1, classifier__min_data_in_leaf=20, classifier__num_leaves=63, classifier__subsample=1.0; total time=  14.0s
[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-threading, 



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=-1, classifier__min_data_in_leaf=20, classifier__num_leaves=63, classifier__subsample=1.0; total time=  16.4s

[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5904
[LightGBM] [Info] Number of data points in the train set: 46258, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='bi





[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

[CV] END classifier=RandomForestClassifier(class_weight='balanced_subsample', random_state=42), classifier__max_depth=15, classifier__n_estimators=100; total time=  49.3s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=50, classifier__num_leaves=127, classifier__subsample=1.0; total time=   5.5s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.086036 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=50, classifier__num_leaves=127, classifier__subsample=1.0; total time=   5.4s
[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010355 seconds.
You can set `force_col_w



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=50, classifier__num_leaves=127, classifier__subsample=1.0; total time=   5.5s
[CV] END classifier=RandomForestClassifier(class_weight='balanced_subsample', random_state=42), classifier__max_depth=15, classifier__n_estimators=100; total time=  47.6s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[CV] END classifier=R



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=31, classifier__subsample=0.7; total time=   6.0s
[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5904
[LightGBM] [Info] Number of data points in the train set: 46258, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

[CV] END classifier=RandomForestClassifier(class_weight='balanced_subsample', random_state=42), classifier__max_depth=15, classi



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=31, classifier__subsample=0.7; total time=   6.8s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=31, classifier__subsample=0.7; total time=   7.1s
[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5664
[LightGBM] [Info] Number of data points in the train set: 45158, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=31, classifier__subsample=0.7; total time=   8.5s
[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5904
[LightGBM] [Info] Number of data points in the train set: 46258, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=31, classifier__subsample=1.0; total time=  13.6s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=31, classifier__subsample=1.0; total time=  13.5s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=31, classifier__subsample=1.0; total time=  15.8s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=50, classifier__num_leaves=31, classifier__subsample=1.0; total time=  13.9s
[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5664
[LightGBM] [Info] Number of data points in the train set: 45158, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, 



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=31, classifier__subsample=0.7; total time=  12.1s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=31, classifier__subsample=0.7; total time=  12.3s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=31, classifier__subsample=0.7; total time=  12.2s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




[LightGBM] [Info] Number of positive: 2427, number of negative: 42731[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=10, classifier__min_data_in_leaf=100, classifier__num_leaves=31, classifier__subsample=0.7; total time=  11.8s

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5664
[LightGBM] [Info] Number of data points in the train set: 45158, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading,



[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=100, classifier__num_leaves=127, classifier__subsample=0.9; total time=  42.2s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=100, classifier__num_leaves=127, classifier__subsample=0.9; total time=  42.6s








[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=100, classifier__num_leaves=127, classifier__subsample=0.9; total time=  46.1s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=-1, classifier__min_data_in_leaf=100, classifier__num_leaves=127, classifier__subsample=0.9; total time=  46.7s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=10, classifier__min_data_in_leaf=20, classifier__num_leaves=127, classifier__subsample=0.7; total time=  25.3s
[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004573 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000








[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=10, classifier__min_data_in_leaf=20, classifier__num_leaves=127, classifier__subsample=0.7; total time=  25.0s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=10, classifier__min_data_in_leaf=20, classifier__num_leaves=127, classifier__subsample=0.7; total time=  24.4s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=10, classifier__min_data_in_leaf=20, classifier__num_leaves=127, classifier__subsample=0.7; total time=  25.7s
[LightGBM] [Info] Number of positive: 2273, nu



[LightGBM] [Info] Number of positive: 2301, number of negative: 42603
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5825
[LightGBM] [Info] Number of data points in the train set: 44904, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=31, classifier__subsample=0.9; total time=   5.6s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=1.0, classifier__max_depth=5, classifier__min_data_in_leaf=20, classifier__num_leaves=31, classifier__subsample=0.9; total time=   5.6s
[LightGBM] [Info] Number of positive: 2273, number of negative: 42709
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5890
[LightGBM] [Info] Number of data points in the train set: 44982, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 2427, number of negative: 42731
[LightGBM] [Info] Auto-choosing row-wise multi-threading, t




[LightGBM] [Info] Number of positive: 2269, number of negative: 43989
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005537 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5904
[LightGBM] [Info] Number of data points in the train set: 46258, number of used features: 40

[LightGBM] [Info] Start training from score 0.000000









[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=100, classifier__num_leaves=63, classifier__subsample=0.7; total time=   5.8s




[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=100, classifier__num_leaves=63, classifier__subsample=0.7; total time=   5.0s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=100, classifier__num_leaves=63, classifier__subsample=0.7; total time=   5.1s
[CV] END classifier=LGBMClassifier(class_weight='balanced', learning_rate=0.05, n_estimators=300,
               objective='binary', random_state=42), classifier__colsample_bytree=0.7, classifier__max_depth=5, classifier__min_data_in_leaf=100, classifier__num_leaves=63, classifier__subsample=0.7; total time=   4.3s




[CV] END classifier=RandomForestClassifier(class_weight='balanced_subsample', random_state=42), classifier__max_depth=5, classifier__n_estimators=200; total time=  31.2s
[CV] END classifier=RandomForestClassifier(class_weight='balanced_subsample', random_state=42), classifier__max_depth=5, classifier__n_estimators=200; total time=  27.9s
[CV] END classifier=RandomForestClassifier(class_weight='balanced_subsample', random_state=42), classifier__max_depth=5, classifier__n_estimators=200; total time=  29.6s
[CV] END classifier=RandomForestClassifier(class_weight='balanced_subsample', random_state=42), classifier__max_depth=5, classifier__n_estimators=200; total time=  28.6s
[LightGBM] [Info] Number of positive: 3090, number of negative: 57344
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total 

In [12]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "04_user_base_test.parquet"
df_test = pd.read_parquet(checkpoint_file_path)
X_test = df_test[all_features]
y_pred = best_model.predict(X_test)
submission_df = pd.DataFrame({
    "id": df_test["userId"],
    "target": y_pred
})
submission_df.to_csv("submission_Jessie.csv", index=False)

print("Submission saved as submission_Jessie.csv")

Submission saved as submission_Jessie.csv




In [13]:
import pandas as pd

feature_names = best_model['preprocessor'].get_feature_names_out()
print("Final features count:", len(feature_names))
print(pd.DataFrame(feature_names, columns=["feature"]))

Final features count: 40
                               feature
0              num__num_unique_artists
1                     num__count_about
2                num__count_add_friend
3           num__count_add_to_playlist
4                 num__count_downgrade
5                     num__count_error
6                      num__count_help
7                      num__count_home
8                    num__count_logout
9               num__count_roll_advert
10            num__count_save_settings
11                 num__count_settings
12         num__count_submit_downgrade
13           num__count_submit_upgrade
14              num__count_thumbs_down
15                num__count_thumbs_up
16                  num__count_upgrade
17           num__count_total_sessions
18               num__user_lifecycle_h
19               num__item_per_session
20                      num__frequency
21              num__avg_songs_session
22                   num__thumbs_ratio
23             num__errors_per_session


In [14]:
print(df_final.groupby("snapshot_day")["label"].value_counts())


snapshot_day  label
9             0        13355
              1          821
19            0        14635
              1          817
29            0        14741
              1          789
39            0        14613
              1          663
Name: count, dtype: int64


In [15]:
print("Total positives in full dataset:", y.sum())
print(df_final.groupby("snapshot_day")["label"].sum())

Total positives in full dataset: 3090
snapshot_day
9     821
19    817
29    789
39    663
Name: label, dtype: int64


In [16]:
def evaluate_ensemble(lr_model, lxgb_model, X, y, groups):

    cv = GroupKFold(n_splits=4)
    best_auc = -1
    best_weight = 0.5

    for w in np.linspace(0, 1, 11):   # weights from 0.0 to 1.0
        aucs = []

        for train_idx, valid_idx in cv.split(X, y, groups):
            X_valid = X.iloc[valid_idx]
            y_valid = y.iloc[valid_idx]

            # Predict probability
            p_lr = lr_model.predict_proba(X_valid)[:, 1]
            p_xgb = lxgb_model.predict_proba(X_valid)[:, 1]

            # Weighted ensemble
            p_ens = w * p_lr + (1 - w) * p_xgb

            aucs.append(roc_auc_score(y_valid, p_ens))

        mean_auc = np.mean(aucs)
        print(f"weight_lr = {w:.1f}, AUC = {mean_auc:.5f}")

        if mean_auc > best_auc:
            best_auc = mean_auc
            best_weight = w

    print("Best ensemble weight:", best_weight)
    print("Best ensemble AUC:", best_auc)
    
    return best_weight, best_auc


In [17]:
preprocessor_2 = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", drop='first'), final_categorical_features),
        ("num", StandardScaler(), selected_num_features)
    ],
    remainder="drop"
)

lr_model = LogisticRegression(solver="liblinear", penalty="l1", C=0.15)

lr_pipeline = ImbPipeline(steps=[
    ("preprocessor", preprocessor_2),
    ('smote', SMOTE(random_state=42)),
    ("classifier", lr_model)
])


In [18]:
# 1. Fit LR pipeline first
lr_pipeline.fit(X, y)

# 2. Fit the tuned model (best_model already comes from RandomizedSearchCV)
best_model.fit(X, y)

# 3. Now ensemble evaluation is valid
best_w, best_ens_auc = evaluate_ensemble(
    lr_pipeline,
    best_model,
    X,
    y,
    groups=df_final["snapshot_day"]
)

print(best_w, best_ens_auc)


[LightGBM] [Info] Number of positive: 3090, number of negative: 57344
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001352 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5906
[LightGBM] [Info] Number of data points in the train set: 60434, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000








weight_lr = 0.0, AUC = 0.86962








weight_lr = 0.1, AUC = 0.86198








weight_lr = 0.2, AUC = 0.85325








weight_lr = 0.3, AUC = 0.84332








weight_lr = 0.4, AUC = 0.83215








weight_lr = 0.5, AUC = 0.81959








weight_lr = 0.6, AUC = 0.80558








weight_lr = 0.7, AUC = 0.79002








weight_lr = 0.8, AUC = 0.77311








weight_lr = 0.9, AUC = 0.75512




weight_lr = 1.0, AUC = 0.73654
Best ensemble weight: 0.0
Best ensemble AUC: 0.8696236451055194
0.0 0.8696236451055194




In [19]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score

groups = df_final["snapshot_day"]
cv = GroupKFold(n_splits=4)

lr_auc_scores = []

for train_idx, val_idx in cv.split(X, y, groups):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lr_pipeline.fit(X_train, y_train)
    y_pred_proba = lr_pipeline.predict_proba(X_val)[:, 1]

    auc = roc_auc_score(y_val, y_pred_proba)
    lr_auc_scores.append(auc)

print("LR fold AUCs:", lr_auc_scores)
print("LR mean AUC:", sum(lr_auc_scores)/len(lr_auc_scores))


LR fold AUCs: [0.7410730905902155, 0.7304423969801271, 0.7409767269561731, 0.7081244804233316]
LR mean AUC: 0.7301541737374618


In [20]:
PROCESSED_DATA_DIR = Path("data/processing_checkpoint")
checkpoint_file_path = PROCESSED_DATA_DIR / "04_user_base_test.parquet"
df_test = pd.read_parquet(checkpoint_file_path)
X_test = df_test[all_features]
X_test = X_test.fillna({
    'gender': 'M',
    'num_unique_songs': 0
})

y_pred = lr_pipeline.predict(X_test)
submission_df = pd.DataFrame({
    "id": df_test["userId"],
    "target": y_pred
})
submission_df.to_csv("submission_lr_Jessie.csv", index=False)

print("Submission saved as submission_Jessie.csv")

Submission saved as submission_Jessie.csv


In [21]:
# Extract classifier
lgbm_clf = best_model.named_steps["classifier"]

# Extract feature names
preprocessor = best_model.named_steps["preprocessor"]
feature_names = preprocessor.get_feature_names_out()

# LightGBM importances
importance_gain = lgbm_clf.booster_.feature_importance(importance_type='gain')
importance_split = lgbm_clf.booster_.feature_importance(importance_type='split')

importance_df = pd.DataFrame({
    "feature": feature_names,
    "gain_importance": importance_gain,
    "split_importance": importance_split
}).sort_values("gain_importance", ascending=False)

display(importance_df.head(15))


Unnamed: 0,feature,gain_importance,split_importance
27,num__active_days_ratio,60298.214337,284
22,num__thumbs_ratio,17467.997234,360
24,num__ads_per_session,15813.764542,409
9,num__count_roll_advert,14850.299748,234
14,num__count_thumbs_down,10910.633997,162
25,num__hours_since_last_session,10758.245053,421
21,num__avg_songs_session,8944.825762,300
15,num__count_thumbs_up,8334.407408,262
29,num__hours_since_downgrade,8084.686315,174
19,num__item_per_session,7477.850551,281


In [24]:
lr_clf = lr_pipeline['classifier']

feature_names_lr = lr_pipeline['preprocessor'].get_feature_names_out()

coefficients = lr_clf.coef_[0]

lr_importance_df = pd.DataFrame({
    'feature': feature_names_lr,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients)
}).sort_values('abs_coefficient', ascending=False)

print(lr_importance_df.head(15).to_markdown(index=False))

| feature                    |   coefficient |   abs_coefficient |
|:---------------------------|--------------:|------------------:|
| cat__last_level_paid       |      0.847302 |          0.847302 |
| num__count_thumbs_up       |     -0.81614  |          0.81614  |
| num__active_days_ratio     |      0.531582 |          0.531582 |
| num__ads_per_session       |      0.35399  |          0.35399  |
| num__active_days           |     -0.328019 |          0.328019 |
| num__count_add_to_playlist |      0.309814 |          0.309814 |
| num__num_unique_artists    |      0.277677 |          0.277677 |
| num__count_roll_advert     |      0.277317 |          0.277317 |
| num__count_downgrade       |      0.269587 |          0.269587 |
| num__count_upgrade         |     -0.261324 |          0.261324 |
| num__count_total_sessions  |      0.260688 |          0.260688 |
| num__hours_since_downgrade |     -0.23263  |          0.23263  |
| num__item_per_session      |     -0.215378 |          0.2153