In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
from mypdata import get_train_xy,resample_xy,get_split,get_c_s_data

In [None]:
# Read the training and test CSV files.
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# get_c_s_data returns a pair that is unpacked as (s, c) everywhere in this
# notebook; later cells describe "c" as the continuous features and "s" as
# the structured ones.
test_s, test_c = get_c_s_data(df_test)

Xo, yo, df2 = get_train_xy(df)
s_df, c_df = get_c_s_data(df2)

# Separate the "target" label from the feature columns of each view.
Xc = c_df.drop(columns=["target"])
yc = c_df["target"]
Xs = s_df.drop(columns=["target"])
ys = s_df["target"]

In [None]:
# Debug output: show the "c" test frame next to the "c" training features
# so their columns can be compared by eye.
print(test_c,Xc)

In [None]:
def _negative_to_positive_ratio(y):
    """Return (#negatives / #positives) for a label vector.

    Assumes `y` is a binary 0/1 pandas Series (TODO confirm against
    get_train_xy), so that its sum equals the number of positive samples.

    Raises:
        ValueError: if `y` contains no positive samples, because the ratio
            is undefined in that case.
    """
    # Vectorised Series.sum() instead of the built-in sum(), which iterates
    # over the Series element by element in Python.
    positives = int(y.sum())
    if positives == 0:
        raise ValueError("target contains no positive samples; scale_pos_weight is undefined")
    return (len(y) - positives) / positives


# Class-imbalance ratios for the two feature views; the "c" ratio is passed
# to XGBoost as scale_pos_weight.
scale_pos_weight_s = _negative_to_positive_ratio(ys)
scale_pos_weight_c = _negative_to_positive_ratio(yc)
print("scale_pos_weight =", scale_pos_weight_s,scale_pos_weight_c)

In [None]:
# Stratified 80/20 hold-out split of each feature view. Both calls use the
# same test_size and random_state.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc,
    yc,
    stratify=yc,
    test_size=0.2,
    random_state=42,
)
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    Xs,
    ys,
    stratify=ys,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: the "c" training features and labels should have the same length.
print(len(Xc_train),len(yc_train))

In [None]:
# XGBoost handles the continuous-type features.
xgb_model = XGBClassifier(
    # ensemble size and tree shape
    n_estimators=300,
    max_depth=7,
    learning_rate=0.1,
    gamma=0.1,
    tree_method='hist',
    # row / column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # class imbalance: negative/positive ratio computed from yc
    scale_pos_weight=scale_pos_weight_c,
    # misc
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

# Random Forest handles the structured features.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)

# Both base models are fitted on every labelled row (Xc / Xs), not on the
# *_train hold-out splits.
xgb_model.fit(Xc, yc)
rf_model.fit(Xs, ys)

In [None]:
# Build the training features for the meta model from the two base models.
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The meta model is a logistic regression.
meta_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)

# # Alternative (disabled): integrate the two models with StackingClassifier
# stacked_model = StackingClassifier(
#     estimators=[
#         ('xgb', xgb_model),
#         ('rf', rf_model)
#     ],
#     final_estimator=meta_model,
#     passthrough=True,
#     n_jobs=-1
# )

# Meta-model inputs must be OUT-OF-FOLD predictions. xgb_model and rf_model
# were fitted on all of Xc / Xs, so calling predict_proba on those same rows
# would feed the meta model leaked, over-confident probabilities that do not
# resemble what the base models produce on unseen test rows.
# cross_val_predict fits clones of each estimator on the other folds, so the
# already-fitted xgb_model / rf_model are left untouched for test-time use.
# With an integer cv and a classifier the folds are stratified and unshuffled,
# so both calls use the same row partition as long as yc and ys are identical.
N_META_FOLDS = 5
cont_pred_train = cross_val_predict(
    xgb_model, Xc, yc, cv=N_META_FOLDS, method='predict_proba'
)[:, 1]
struct_pred_train = cross_val_predict(
    rf_model, Xs, ys, cv=N_META_FOLDS, method='predict_proba'
)[:, 1]

# Training matrix for the meta model: one positive-class probability per base model.
meta_X_train = pd.DataFrame({
    'cont_pred': cont_pred_train,
    'struct_pred': struct_pred_train
})

In [None]:
# Sanity check: the meta model is fitted with yc as the label for features
# derived from both views, so the two label vectors should be identical
# (expected output: True).
print((yc == ys).all())

In [None]:
# Fit the meta model on the base-model probabilities.
# The estimator is re-created so that re-running this cell starts from a
# clean state, but with the same settings used where meta_model is first
# defined. A bare LogisticRegression() here would silently discard the
# class_weight='balanced', max_iter=1000 and random_state=42 configuration.
meta_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
meta_model.fit(meta_X_train, yc)

In [None]:
# Debug output: inspect the "s" test frame before it is passed to rf_model.
print(test_s)

In [None]:
# Positive-class probability (column 1 of predict_proba) from each base
# model on its own view of the test data.
cont_pred_test = xgb_model.predict_proba(test_c)[:, 1]
struct_pred_test = rf_model.predict_proba(test_s)[:, 1]

# Meta-model input for the test set; column names and order match meta_X_train.
meta_X_test = pd.DataFrame(
    {
        'cont_pred': cont_pred_test,
        'struct_pred': struct_pred_test,
    }
)

In [None]:
# Final class labels for the test set, predicted by the meta model from the
# two base-model probabilities.
final_pred = meta_model.predict(meta_X_test)

In [None]:
# Submission table: keep the original ID from the test file's "index"
# column and attach the predicted label as "target".
results1 = df_test[["index"]].assign(target=final_pred)

# Write without the DataFrame's own row index so the file has exactly two columns.
results1.to_csv("predictionsxx.csv", index=False)

In [None]:
# Hold-out evaluation of a StackingClassifier trained on the continuous and
# structured features merged into one matrix.

# The two views were split by separate train_test_split calls. The positional
# concat below is only valid if both calls selected the same rows, so fail
# loudly instead of silently pairing unrelated samples.
assert Xc_train.index.equals(Xs_train.index), "continuous/structured train splits are not row-aligned"
assert Xc_test.index.equals(Xs_test.index), "continuous/structured test splits are not row-aligned"

# Merge the continuous and structured data column-wise.
X_train_full = pd.concat([Xc_train.reset_index(drop=True), Xs_train.reset_index(drop=True)], axis=1)
X_test_full = pd.concat([Xc_test.reset_index(drop=True), Xs_test.reset_index(drop=True)], axis=1)

# Labels for the merged matrices; taken from the "c" split, whose rows were
# just checked to match the "s" split.
y_train, y_test = yc_train, yc_test

# Define the stacked model here so the cell runs on a fresh kernel (the
# notebook otherwise only contains a commented-out definition).
# StackingClassifier fits clones of the estimators it is given, so the
# already-fitted xgb_model / rf_model are not modified.
stacked_model = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    passthrough=True,
    n_jobs=-1
)

# Train the model.
stacked_model.fit(X_train_full, y_train)

# Predict on the hold-out rows and report the metrics.
y_pred = stacked_model.predict(X_test_full)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
