In [35]:
import pandas as pd

data = pd.read_csv('data/bank_marketing_train.csv')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,72,retired,married,basic.4y,no,no,no,telephone,apr,tue,1,999,0,nonexistent,-1.8,93.075,-47.1,1.453,5099.1,no
1,30,admin.,single,university.degree,no,yes,no,cellular,aug,tue,1,999,0,nonexistent,-1.7,94.027,-38.3,0.886,4991.6,no
2,31,unemployed,married,university.degree,no,yes,no,cellular,aug,fri,4,999,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1,no
3,37,admin.,married,high.school,no,yes,yes,cellular,nov,mon,1,999,0,nonexistent,-3.4,92.649,-30.1,0.722,5017.5,no
4,53,unemployed,divorced,basic.9y,unknown,no,no,telephone,may,thu,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no


In [36]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
import xgboost as xgb
import pandas as pd

In [37]:
#数据预处理
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

#分割数据集为X和y
X_train = data.drop('y', axis=1)
y_train = data['y']
# Standardizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[ 3.04731251,  0.35298919, -0.27175801, ..., -1.42232164,
        -1.22943639, -0.90737059],
       [-0.97169418, -1.03883738,  1.36527684, ...,  0.47164059,
        -1.55456028, -2.37549294],
       [-0.87600354,  1.74481576, -0.27175801, ...,  0.94513114,
         0.78495555,  0.85437624],
       ...,
       [-1.25876608,  1.18808514,  1.36527684, ..., -1.22862095,
        -1.28620406, -0.90737059],
       [ 1.70764361, -1.03883738, -0.27175801, ...,  0.94513114,
         0.78495555,  0.85437624],
       [-0.58893164,  0.07462388,  1.36527684, ..., -0.32468444,
         0.29985006,  0.41325669]])

In [38]:
#定义模型
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Linear Perceptron": Perceptron(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "XGBoost":xgb.XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=200, eval_metric='logloss')
}

In [39]:
# 对模型进行交叉验证
cv_roc_auc_scores = {}
for name, model in models.items():
    if name == "Linear Perceptron":
        # Perceptron没有predict_proba方法
        y_pred = cross_val_predict(model, X_train_scaled, y_train, cv=5, method="predict")
    else:
        y_pred = cross_val_predict(model, X_train_scaled, y_train, cv=5, method="predict_proba")[:, 1]

    cv_roc_auc_scores[name] = roc_auc_score(y_train, y_pred)

cv_roc_auc_scores

{'Decision Tree': 0.6157336055586898,
 'Linear Perceptron': 0.5754680002229575,
 'Logistic Regression': 0.7870314886480607,
 'SVM': 0.6980531304863256,
 'Naive Bayes': 0.7660989699739842,
 'KNN': 0.7188934866487569,
 'XGBoost': 0.8023589655236991}

In [40]:
#使用集成学习stack堆叠模型进行训练
from sklearn.ensemble import StackingClassifier

# 定义基模型
base_models = [
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("SVM", SVC(probability=True, random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("KNN", KNeighborsClassifier()),
    ("XGBoost", xgb.XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=200, eval_metric='logloss'))
]

# 定义最终的聚合模型
final_model = LogisticRegression(random_state=42)

# 创建堆叠分类器
stacked_model = StackingClassifier(estimators=base_models, final_estimator=final_model, cv=5)

# 拟合堆叠模型
stacked_model.fit(X_train_scaled, y_train)

# 预测概率
y_pred_proba = stacked_model.predict_proba(X_train_scaled)[:, 1]

# 计算ROC AUC分数
stack_roc_auc = roc_auc_score(y_train, y_pred_proba)

cv_roc_auc_scores["Stack"] = stack_roc_auc

cv_roc_auc_scores

{'Decision Tree': 0.6157336055586898,
 'Linear Perceptron': 0.5754680002229575,
 'Logistic Regression': 0.7870314886480607,
 'SVM': 0.6980531304863256,
 'Naive Bayes': 0.7660989699739842,
 'KNN': 0.7188934866487569,
 'XGBoost': 0.8023589655236991,
 'Stack': 0.8471261835337014}

In [41]:
#导入测试集数据
X_test = pd.read_csv('data/bank_marketing_test.csv')
X_test.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,34,blue-collar,married,basic.6y,no,no,no,telephone,may,mon,4.0,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
1,29,admin.,married,high.school,no,yes,no,telephone,jun,fri,3.0,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1
2,38,housemaid,married,high.school,unknown,no,no,telephone,may,thu,4.0,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0
3,31,services,married,high.school,unknown,no,no,telephone,jun,thu,2.0,999,0,nonexistent,1.4,94.465,-41.8,4.866,5228.1
4,45,blue-collar,married,professional.course,no,yes,no,telephone,may,thu,,6,2,success,-1.8,93.876,-40.0,0.683,5008.7


In [42]:
#encode测试集数据,X_test
for column in X_test.select_dtypes(include=['object']).columns:
    if column in label_encoders:
        X_test[column] = label_encoders[column].transform(X_test[column])
        
X_test

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,34,1,1,1,0,0,0,1,6,1,4.0,999,0,1,1.1,93.994,-36.4,4.857,5191.0
1,29,0,1,3,0,2,0,1,4,0,3.0,999,0,1,1.4,94.465,-41.8,4.959,5228.1
2,38,3,1,3,1,0,0,1,6,2,4.0,999,0,1,1.1,93.994,-36.4,4.860,5191.0
3,31,7,1,3,1,0,0,1,4,2,2.0,999,0,1,1.4,94.465,-41.8,4.866,5228.1
4,45,1,1,5,0,2,0,1,6,2,,6,2,2,-1.8,93.876,-40.0,0.683,5008.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,56,3,1,0,0,0,0,1,6,1,1.0,999,0,1,1.1,93.994,-36.4,4.857,5191.0
7996,58,5,1,0,0,2,0,1,9,2,1.0,999,0,1,-1.1,94.199,-37.5,0.884,4963.6
7997,52,0,1,2,0,0,0,0,3,4,2.0,999,0,1,1.4,93.918,-42.7,4.963,5228.1
7998,37,0,2,6,0,2,2,0,7,0,1.0,999,0,1,-0.1,93.200,-42.0,4.021,5195.8


In [43]:
#发现X_test有缺失值，消除缺失值
missing_values = X_test.isnull().sum()
missing_values = missing_values[missing_values > 0]

median_value = data['campaign'].median()
X_test['campaign'].fillna(median_value, inplace=True)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-0.58893164, -0.76047206, -0.27175801, ...,  0.88056425,
         0.72245378,  0.34770332],
       [-1.06738481, -1.03883738, -0.27175801, ..., -0.28163984,
         0.78094167,  0.85437624],
       [-0.20616909, -0.20374143, -0.27175801, ...,  0.88056425,
         0.72417401,  0.34770332],
       ...,
       [ 1.1334998 , -1.03883738, -0.27175801, ..., -0.47534052,
         0.78323531,  0.85437624],
       [-0.30185973, -1.03883738,  1.36527684, ..., -0.32468444,
         0.24308239,  0.41325669],
       [ 0.08090281, -1.03883738,  1.36527684, ..., -0.28163984,
         0.78094167,  0.85437624]])

In [44]:
#选择最好的模型为Stack集成模型
test_predictions = stacked_model.predict_proba(X_test_scaled)[:, 1]
test_predictions

array([0.04583232, 0.0512307 , 0.04933628, ..., 0.06932013, 0.06740187,
       0.05253857])

In [45]:
# 创建一个 DataFrame 包含预测值
predictions_df = pd.DataFrame({'Predictions': test_predictions},)

# 将 DataFrame 存储为 CSV 文件
predictions_df.to_csv('data/bank_marketing_test_scores.csv', index=False,header=False)