In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Top features: kept as numeric suffixes and expanded into the
# 'feature-<n>' column names used by the dataset.
top_features = [
    f'feature-{feature_id}'
    for feature_id in (
        1, 3, 28, 43, 45,
        47, 50, 64, 65,
        124, 134, 152, 161,
        215, 221, 223, 224,
        226, 230, 235, 277,
        296, 314, 318, 373,
        401, 410, 464, 470,
        472, 473, 475, 482,
        494, 510, 526, 538,
        565, 588, 624, 669,
        703, 704, 713, 719,
        721, 722, 724, 731,
        733, 750,
    )
]

In [3]:
# Load the full dataset. Later cells read its 'data_split' and 'target' columns.
df = pd.read_csv('source.csv')

# Model inputs are restricted to the curated top_features list. A copy is taken
# so that editing `features` later cannot silently change `top_features`.
# (The previous "all columns except data_split/target" assignment was dead code:
# it was overwritten on the very next line.)
features = list(top_features)

# Encode the raw target as integer class labels. LabelEncoder assigns codes in
# sorted order of the distinct values; per the original author's note, target -1
# becomes 0 and target +4 becomes 1.
le = LabelEncoder()
df['target_encoded'] = le.fit_transform(df['target'])

In [4]:
# Partition the rows into the three subsets named by the 'data_split' column.
train, valid, test = (
    df.loc[df['data_split'].eq(split_name)]
    for split_name in ('TRAIN', 'VALIDATE', 'TEST')
)

In [5]:
# Preprocessing: min-max scale the feature columns. The scaler is fitted on the
# training split only and then applied unchanged to every split.
scaler = MinMaxScaler()
scaler.fit(train[features])

train_, valid_, test_ = (
    scaler.transform(split[features]) for split in (train, valid, test)
)

# Model inputs (scaled arrays) and encoded labels for each split.
x_tr, x_va, x_ts = train_, valid_, test_
y_tr, y_va, y_ts = (split['target_encoded'] for split in (train, valid, test))

In [None]:
# Optional experiment: feed the model the raw (unscaled) feature columns.
# Unguarded, this cell replaced the min-max-scaled matrices from the
# preprocessing cell whenever the notebook was run top to bottom, so the model
# would be trained on different inputs than in the recorded run. It is now
# opt-in: flip the flag to True to reproduce the raw-feature variant.
USE_RAW_FEATURES = False

if USE_RAW_FEATURES:
    x_tr, y_tr = train[features], train['target_encoded']
    x_va, y_va = valid[features], valid['target_encoded']
    x_ts, y_ts = test[features], test['target_encoded']

In [None]:
# Ratio of encoded-class-0 rows to encoded-class-1 rows over the whole frame.
# NOTE(review): this counts every split, not just TRAIN - confirm that is intended.
n_class0 = int((df['target_encoded'] == 0).sum())
n_class1 = int((df['target_encoded'] == 1).sum())
scale_pos_weight = n_class0 / n_class1
scale_pos_weight

In [6]:
# L1-regularised logistic regression fitted on the training split.
# A small C means strong regularisation, and the L1 penalty can zero out coefficients.
model = LogisticRegression(
    max_iter=1000,
    penalty="l1",
    C=0.01,
    class_weight={0: 1, 1: 10},  # 10x weight for encoded class 1 (the +4 target)
    solver="liblinear",
    # liblinear uses a random number generator while fitting; pin the seed so a
    # Restart & Run All reproduces the same coefficients. Same seed as the
    # permutation-importance cell.
    random_state=1,
)

model.fit(x_tr, y_tr)

In [7]:
# Calibrate the classifier that was already fitted on the training split, using
# the validation split only to learn the isotonic probability mapping.
# Passing the bare estimator with cv=5 made CalibratedClassifierCV clone it and
# refit the clones on folds of the validation data, so the training-set fit was
# never used by the calibrated model.
try:
    # Newer scikit-learn: wrap the fitted model so it is used as-is, not refitted.
    from sklearn.frozen import FrozenEstimator
except ImportError:
    FrozenEstimator = None

if FrozenEstimator is not None:
    calibrated_model = CalibratedClassifierCV(FrozenEstimator(model), method='isotonic')
else:
    # Older scikit-learn: cv='prefit' is the equivalent switch.
    calibrated_model = CalibratedClassifierCV(model, method='isotonic', cv='prefit')

calibrated_model.fit(x_va, y_va)

In [7]:
# Uncalibrated scores: column 1 of predict_proba (encoded label 1) for each split.
proba_tr, proba_va, proba_ts = (
    model.predict_proba(x_split)[:, 1] for x_split in (x_tr, x_va, x_ts)
)

In [None]:
# Calibrated scores: column 1 of predict_proba (encoded label 1) for each split.
cal_proba_tr, cal_proba_va, cal_proba_ts = (
    calibrated_model.predict_proba(x_split)[:, 1] for x_split in (x_tr, x_va, x_ts)
)

# From this cell on, proba_* hold the calibrated scores and replace the
# uncalibrated ones for everything downstream.
proba_tr, proba_va, proba_ts = cal_proba_tr, cal_proba_va, cal_proba_ts

In [8]:
# Threshold sweep: for each cut-off, keep only the rows whose score is at or
# above it and record the sum of the raw 'target' column over the kept rows.
threshold_list = list()
train_sum = list()
valid_sum = list()
test_sum = list()

# Unfiltered totals per split. They do not depend on the threshold, so they are
# computed once here instead of on every loop iteration.
sum_target_before_tr = train["target"].sum()
sum_target_before_va = valid["target"].sum()
sum_target_before_ts = test["target"].sum()

thresholds = np.arange(0.05, 0.99, 0.01)
for t in thresholds:
    # Boolean masks, positionally aligned with the rows of each split.
    filter_tr = proba_tr >= t
    filter_va = proba_va >= t
    filter_ts = proba_ts >= t

    threshold_list.append(t)
    train_sum.append(train["target"][filter_tr].sum())
    valid_sum.append(valid["target"][filter_va].sum())
    test_sum.append(test["target"][filter_ts].sum())

In [9]:
# Tabulate the sweep: one row per threshold, one column per split's filtered
# target sum; show the first 59 rows.
sweep_table = pd.DataFrame(
    dict(
        zip(
            ('Threshold', 'Train Post', 'Valid Post', 'Test Post'),
            (threshold_list, train_sum, valid_sum, test_sum),
        )
    )
)
sweep_table.iloc[:59]

Unnamed: 0,Threshold,Train Post,Valid Post,Test Post
0,0.05,-1089,289,-199
1,0.06,-1089,289,-199
2,0.07,-1089,289,-199
3,0.08,-1089,289,-199
4,0.09,-1089,289,-199
5,0.1,-1089,289,-199
6,0.11,-1089,289,-199
7,0.12,-1089,289,-199
8,0.13,-1089,289,-199
9,0.14,-1089,289,-199


In [None]:
# Permutation importance of the uncalibrated `model` on the validation split,
# scored by ROC AUC with 10 shuffles per feature and a fixed seed.
perm_settings = dict(n_repeats=10, random_state=1, scoring="roc_auc", n_jobs=-1)
result = permutation_importance(model, x_va, y_va, **perm_settings)

In [None]:
# Pair each feature name with its mean permutation importance, then display the
# table ordered from most to least important.
perm_imp = pd.DataFrame({'Feature': features}).assign(
    Importance=result.importances_mean
)
perm_imp.sort_values('Importance', ascending=False)

In [None]:
# Names of the features whose mean permutation importance is strictly positive.
perm_imp.loc[perm_imp["Importance"].gt(0), "Feature"].values