In [38]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Selected features from experiment notebook.
# Stored as numeric ids and expanded to "feature-<id>" column names below, so
# the list is easier to scan and diff; order matches the original selection.
_top_feature_ids = (
    1, 3, 4, 8, 19, 28, 34, 43, 45, 47,
    50, 65, 69, 70, 123, 124, 127, 128, 134, 152,
    161, 193, 215, 223, 234, 235, 252, 253, 268, 277,
    281, 296, 318, 339, 373, 401, 410, 455, 468, 472,
    473, 479, 484, 494, 510, 526, 531, 532, 538, 548,
    563, 565, 567, 588, 593, 624, 632, 669, 703, 704,
    713, 721, 722, 724, 733, 741, 750,
)

top_features = [f"feature-{feature_id}" for feature_id in _top_feature_ids]

In [40]:
# Load the full dataset. Besides the feature columns, the notebook relies on a
# "data_split" column (TRAIN / VALIDATE / TEST) and a raw "target" column.
df = pd.read_csv("source.csv")

# Model only on the pre-selected features. The earlier "every column except
# data_split/target" assignment was overwritten on the very next line, so it
# never took effect and has been removed. Copy the list so that later edits to
# `features` cannot silently modify `top_features`.
features = list(top_features)

# Fail fast with a readable message if the CSV lacks any selected column,
# instead of a KeyError surfacing later during scaling.
missing_features = [col for col in features if col not in df.columns]
if missing_features:
    raise KeyError(f"source.csv is missing selected feature columns: {missing_features}")

# LabelEncoder assigns integer codes following the sorted order of the raw
# labels. Per the original note the raw targets are -1 and +4, which gives
# -1 -> 0 and +4 -> 1 (TODO confirm against le.classes_).
le = LabelEncoder()
df["target_encoded"] = le.fit_transform(df["target"])

In [41]:
# Partition the rows by their "data_split" label; each name below is the
# subset of `df` whose label equals the corresponding string.
train, valid, test = (
    df.loc[df["data_split"] == split_name]
    for split_name in ("TRAIN", "VALIDATE", "TEST")
)

In [42]:
# Preprocessing
# The scaler is fitted on the training rows only and then reused unchanged for
# the validation and test rows, so no statistics from those splits are used.
scaler = MinMaxScaler()
scaler.fit(train[features])

# Scaled feature matrices, one per split, in train / validate / test order.
train_, valid_, test_ = (
    scaler.transform(split_frame[features])
    for split_frame in (train, valid, test)
)

# Model inputs (scaled features) and labels (encoded target) for each split.
x_tr, x_va, x_ts = train_, valid_, test_
y_tr, y_va, y_ts = (
    split_frame['target_encoded'] for split_frame in (train, valid, test)
)

In [43]:
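# XGB (disabled experiment, kept for reference)
# NOTE(review): the dict below repeats several keys (e.g. "max_depth",
# "reg_alpha", "reg_lambda", "colsample_bytree", "scale_pos_weight"). In a
# Python dict literal the last occurrence of each key wins, so earlier values
# would be silently ignored if this cell is re-enabled. `xgb` and the
# `scale_pos_weight` variable are also not defined in the cells shown here.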
# best_params = {
#     "objective": "binary:logistic",
#     "eval_metric": "logloss",
#     "n_estimators": 1500,
#     "tree_method": "hist",
#     "random_state": 1,
#     "early_stopping_rounds": 20,
#     "scale_pos_weight": scale_pos_weight,

#     "max_depth": 5,
#     "reg_alpha": 5,
#     "reg_lambda": 1,


#     "colsample_bytree": 0.6312082363473505,
#     "gamma": 1.1726609768193053,
#     "learning_rate": 0.008795775594469167,
#     "min_child_weight": 2.3194651634135894,
#     "reg_alpha": 0.02527744494135579,
#     "reg_lambda": 0.002584212748040726,
#     "subsample": 0.7299796125452439,

#     "colsample_bytree": 0.5,
#     "gamma": 0.4,
#     "learning_rate": 0.05,
#     "max_delta_step": 5,
#     "max_depth": 10,
#     "min_child_weight": 10,
#     "scale_pos_weight": 4.118126995388436,
#     "subsample": 0.8,
# }

# model = xgb.XGBClassifier(**best_params)

# model.fit(x_tr, y_tr,
#                eval_set=[(x_tr, y_tr), (x_va, y_va), (x_ts, y_ts)],
#                verbose=10)

In [44]:
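# RandomizedSearchCV (disabled experiment, kept for reference)
# NOTE(review): `xgb`, `RandomizedSearchCV` and `scale_pos_weight` are not
# imported/defined in the cells shown here; add them before re-enabling.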
# params = {
#     "max_depth": [3, 5, 8, 10, 15],
#     "learning_rate": [0.1, 0.05, 0.03, 0.008],
#     "min_child_weight": [1, 3, 5, 7, 10],
#     "gamma": [0, 0.1, 0.2, 0.3, 0.4],
#     "subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
#     "colsample_bytree": [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
#     "reg_alpha": [0, 0.1, 0.5, 1, 2, 3, 5],
#     "reg_lambda": [0.5, 1, 1.5, 2, 3, 5],
#     "max_delta_step": [0, 1, 3, 5, 7],
# }

# model = xgb.XGBClassifier(
#     objective="binary:logistic",
#     eval_metric=["logloss", "aucpr"],
#     n_estimators=1500,
#     early_stopping_rounds=20,
#     scale_pos_weight=scale_pos_weight,
#     random_state=1,
# )

# search = RandomizedSearchCV(
#     estimator=model,
#     param_distributions=params,
#     cv=3,
#     n_iter=50,
#     n_jobs=-1,
#     scoring="average_precision",
#     verbose=2,
# )

# search.fit(x_tr, y_tr, eval_set=[(x_va, y_va)], verbose=10 )
# best_model = search.best_estimator_


In [45]:
# Logistic Regression
# Hyper-parameters are collected in one mapping so they can be read and tuned
# in a single place before the estimator is built.
logreg_settings = {
    "penalty": "l1",                 # L1 regularisation
    "solver": "liblinear",           # solver used together with the L1 penalty
    "C": 0.01,                       # inverse regularisation strength
    "class_weight": {0: 1, 1: 6},    # 6x weight for +4
    "max_iter": 1000,
    "random_state": 1,
}
model = LogisticRegression(**logreg_settings)

# Fit on the scaled training features and encoded training labels.
model.fit(x_tr, y_tr)

In [46]:
# Predicted probability of encoded class 1 (column index 1 of predict_proba)
# for the train, validation and test feature matrices, in that order.
proba_tr, proba_va, proba_ts = (
    model.predict_proba(split_x)[:, 1] for split_x in (x_tr, x_va, x_ts)
)

In [85]:
# Probability cut-off: a row is kept when its class-1 probability is at least
# this value.
threshold = 0.67

# Boolean keep-masks for the train, validation and test splits.
filter_tr, filter_va, filter_ts = (
    split_proba >= threshold for split_proba in (proba_tr, proba_va, proba_ts)
)

In [86]:
# Sum of the raw (un-encoded) target over every row of each split, i.e. the
# baseline before any probability filter is applied.
sum_target_before_tr, sum_target_before_va, sum_target_before_ts = (
    split_frame['target'].sum() for split_frame in (train, valid, test)
)

In [87]:
# Sum of the raw target restricted to the rows whose predicted probability
# passed the threshold (boolean masks built from the model probabilities).
sum_target_after_tr = train.loc[filter_tr, 'target'].sum()
sum_target_after_va = valid.loc[filter_va, 'target'].sum()
sum_target_after_ts = test.loc[filter_ts, 'target'].sum()

In [88]:
# Print a small text table comparing the summed raw target per split before
# and after applying the probability filter.
print('Split | Before | After \n----------------------')
summary_rows = (
    ('Train:', sum_target_before_tr, sum_target_after_tr),
    ('Valid:', sum_target_before_va, sum_target_after_va),
    ('Test :', sum_target_before_ts, sum_target_after_ts),
)
for split_label, total_before, total_after in summary_rows:
    print(f'{split_label}| {total_before} | {total_after}')

Split | Before | After 
----------------------
Train:| -1089 | 750
Valid:| 289 | -13
Test :| -199 | 20


In [52]:
# from sklearn.feature_selection import VarianceThreshold

# # Strategy 1: Remove low-variance features (automated)
# selector = VarianceThreshold(threshold=0.1)  # Keeps features with variance > 0.1
# df_clean = selector.fit_transform(df[top_features])

# # Get retained feature names (if needed)
# retained_features = df[top_features].columns[selector.get_support()]
# retained_features