In [86]:
import pandas as pd

data = pd.read_csv('data/bank_marketing_train.csv')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,72,retired,married,basic.4y,no,no,no,telephone,apr,tue,1,999,0,nonexistent,-1.8,93.075,-47.1,1.453,5099.1,no
1,30,admin.,single,university.degree,no,yes,no,cellular,aug,tue,1,999,0,nonexistent,-1.7,94.027,-38.3,0.886,4991.6,no
2,31,unemployed,married,university.degree,no,yes,no,cellular,aug,fri,4,999,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1,no
3,37,admin.,married,high.school,no,yes,yes,cellular,nov,mon,1,999,0,nonexistent,-3.4,92.649,-30.1,0.722,5017.5,no
4,53,unemployed,divorced,basic.9y,unknown,no,no,telephone,may,thu,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no


In [87]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [88]:
#数据预处理
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])

#分割数据集为X和y
X_train = data.drop('y', axis=1)
y_train = data['y']
# Standardizing the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[ 3.04731251,  0.35298919, -0.27175801, ..., -1.42232164,
        -1.22943639, -0.90737059],
       [-0.97169418, -1.03883738,  1.36527684, ...,  0.47164059,
        -1.55456028, -2.37549294],
       [-0.87600354,  1.74481576, -0.27175801, ...,  0.94513114,
         0.78495555,  0.85437624],
       ...,
       [-1.25876608,  1.18808514,  1.36527684, ..., -1.22862095,
        -1.28620406, -0.90737059],
       [ 1.70764361, -1.03883738, -0.27175801, ...,  0.94513114,
         0.78495555,  0.85437624],
       [-0.58893164,  0.07462388,  1.36527684, ..., -0.32468444,
         0.29985006,  0.41325669]])

In [89]:
#定义模型
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Linear Perceptron": Perceptron(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "Naive Bayes": GaussianNB(),
    "KNN": KNeighborsClassifier()
}

In [90]:
roc_auc_scores = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    if name == "Linear Perceptron":  # Perceptron 没有 predict_proba
        y_pred = model.predict(X_train_scaled)
    else:
        y_pred = model.predict_proba(X_train_scaled)[:, 1]
    roc_auc_scores[name] = roc_auc_score(y_train, y_pred)
roc_auc_scores

{'Decision Tree': 0.9999361712100908,
 'Linear Perceptron': 0.6296461624436054,
 'Logistic Regression': 0.7889748038068111,
 'SVM': 0.8411986179626404,
 'Naive Bayes': 0.7666545630215663,
 'KNN': 0.9241153396206908}

In [91]:
# Ensemble model
ensemble_model = VotingClassifier(estimators=[(name, model) for name, model in models.items() if name != "Linear Perceptron"], voting='soft')
ensemble_model.fit(X_train_scaled, y_train)
ensemble_pred = ensemble_model.predict_proba(X_train_scaled)[:, 1]
roc_auc_scores["Ensemble"] = roc_auc_score(y_train, ensemble_pred)

roc_auc_scores

{'Decision Tree': 0.9999361712100908,
 'Linear Perceptron': 0.6296461624436054,
 'Logistic Regression': 0.7889748038068111,
 'SVM': 0.8411986179626404,
 'Naive Bayes': 0.7666545630215663,
 'KNN': 0.9241153396206908,
 'Ensemble': 0.9710635477659314}