In [25]:
from mypdata import get_train_xy,resample_xy,get_split
import pandas as pd

In [26]:
X,y = get_train_xy("train.csv")
X_train, X_test, y_train, y_test=get_split(X,y)
X_resampled, y_resampled = resample_xy(X,y)
Xr_train, Xr_test, yr_train, yr_test=get_split(X_resampled,y_resampled)
print(y_resampled.value_counts())

target
0    93028
1    93028
Name: count, dtype: int64


In [27]:
print(y.value_counts())

target
0    93028
1    21486
Name: count, dtype: int64


In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

def get_models(scale_pos_weight):
    rf_model = RandomForestClassifier(
        n_estimators=201,
        max_depth=7,
        class_weight={0: 1, 1: scale_pos_weight}, 
        max_samples=0.8,     
        max_features=0.7,     
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1
    )

    xgb_model = XGBClassifier(
        n_estimators=201,
        max_depth=7,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight,
        subsample=0.8,
        colsample_bytree=0.7,
        gamma=0.1,
        tree_method='hist',
        # use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )

    lgb_model = LGBMClassifier(
        n_estimators=201,
        max_depth=7,
        learning_rate=0.1,
        class_weight={0: 1, 1: scale_pos_weight}, 
        subsample=0.8,
        colsample_bytree=0.7,
        reg_alpha=0.1,
        reg_lambda=0.1,
        random_state=42
    )

    meta_model = XGBClassifier(
        n_estimators=300,
        max_depth=9,
        learning_rate=0.05,
        scale_pos_weight=scale_pos_weight,
        subsample=0.9,
        colsample_bytree=0.8,
        gamma=0.1,
        tree_method='hist',
        # use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    )

    base_rf = RandomForestClassifier(
        n_estimators=200,       
        max_depth=5,            
        min_samples_split=5,      
        min_samples_leaf=2,        
        max_samples=0.5,   
        max_features=0.65,
        bootstrap=True,            
        class_weight='balanced',    
        random_state=42,            
    )

    ada_model = AdaBoostClassifier(
        estimator=base_rf, 
        n_estimators=100,        
        learning_rate=0.1,
        random_state=42
    )

    return rf_model,xgb_model,lgb_model,meta_model,ada_model

In [None]:
from sklearn.metrics import classification_report

scale_pos_weight = (len(y) - sum(y)) / sum(y)

rf_model,xgb_model,lgb_model,meta_model,ada_model = get_models(scale_pos_weight)

stacked_boost = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('rf', rf_model)
    ],
    final_estimator=meta_model,
    passthrough=True
)

stacked_boost.fit(X_train, y_train)

y_pred = stacked_boost.predict(X_test)
print(classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 17273, number of negative: 74338
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8540
[LightGBM] [Info] Number of data points in the train set: 91611, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501505 -> initscore=0.006021
[LightGBM] [Info] Start training from score 0.006021
[LightGBM] [Info] Number of positive: 13818, number of negative: 59470
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8540
[LightGBM] [Info] Number of data points in the train set: 73288, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501500 -> initscore=0.005999
[Ligh

In [22]:
rf_model,xgb_model,lgb_model,meta_model,ada_model = get_models(scale_pos_weight)

stacked_boost = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('lgb', lgb_model),
        ('rf', rf_model)
    ],
    final_estimator=meta_model,
    passthrough=True
)
model1 = stacked_boost.fit(X, y)

[LightGBM] [Info] Number of positive: 21486, number of negative: 93028
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8540
[LightGBM] [Info] Number of data points in the train set: 114514, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 17189, number of negative: 74422
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001763 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8540
[LightGBM] [Info] Number of data points in the train set: 91611, number of used features: 95
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500004 -> initscore=0.000017
[L

In [23]:
df_test = pd.read_csv("test.csv")

In [24]:
y1_pred = model1.predict(df_test.drop("index",axis=1))
# y2_pred = model2.predict(df_test.drop("index",axis=1))

results1 = pd.DataFrame({
    "index": df_test["index"],  # 保留原始 ID
    "target": y1_pred
})

# results2 = pd.DataFrame({
#     "index": df_test["index"],  # 保留原始 ID
#     "target": y2_pred
# })

results1.to_csv("predictions1.csv", index=False)
# results2.to_csv("predictions2.csv", index=False)