In [1]:
import pandas as pd

caravan = pd.read_csv("../../data/caravan.csv")
caravan

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,Purchase
0,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,No
1,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
2,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,No
3,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,No
4,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5817,36,1,1,2,8,0,6,1,2,1,...,0,0,0,1,0,0,0,0,0,No
5818,35,1,4,4,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,No
5819,33,1,3,4,8,0,6,0,3,5,...,0,0,0,1,0,0,0,0,0,Yes
5820,34,1,3,2,8,0,7,0,2,7,...,0,0,0,0,0,0,0,0,0,No


In [9]:
import sklearn.model_selection as skm
from sklearn.ensemble import GradientBoostingClassifier as GBC
import matplotlib.pyplot as plt
import numpy as np

X = caravan[caravan.columns.drop("Purchase")]
Y = caravan["Purchase"]

(X_train,
 X_test,
 y_train,
 y_test) = skm.train_test_split(X,
                                Y,
                                test_size=0.8,
                                random_state=0)

boost = GBC(n_estimators=1000, learning_rate=0.01, random_state=0)
boost.fit(X_train, y_train)

feature_names = X.columns
feature_imp = pd.DataFrame(
    {'importance':boost.feature_importances_},
    index=feature_names)
feature_imp.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
PPERSAUT,0.118666
ALEVEN,0.082175
PBRAND,0.049191
MGEMOMV,0.037160
MKOOPKLA,0.036583
...,...
AAANHANG,0.000000
AMOTSCO,0.000000
AVRAAUT,0.000000
AZEILPL,0.000000


In [22]:
from ISLP import confusion_table

y_test_pred_proba = boost.predict_proba(X_test)[:, 1]
y_test_pred = np.where(y_test_pred_proba > 0.2, "Yes", "No")
confusion_table(y_test, y_test_pred)

Truth,No,Yes
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4181,194
Yes,245,38


In [21]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(C=1e10, solver='liblinear')
logit.fit(X_train, y_train)
logit_pred = logit.predict_proba(X_test)
logit_labels = np.where(logit_pred[:,1] > .2, 'Yes', 'No')
confusion_table(logit_labels, y_test)

Truth,No,Yes
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4053,224
Yes,322,59


In [26]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn_pred = knn.fit(X_train, y_train).predict(X_test)
confusion_table(knn_pred, y_test)

Truth,No,Yes
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,4145,255
Yes,230,28


The comparison of boosting, logistic regression, and KNN (k=1) on the Caravan dataset highlights the impact of method choice on identifying purchasers in a highly imbalanced dataset. Boosting with a 20% cutoff predicts 283 individuals as purchasers, of whom 38 actually made a purchase, yielding a higher precision than the other methods. Logistic regression predicts more purchasers (381), but only 59 are correct, reflecting lower precision due to overprediction of the majority class. KNN (k=1) predicts 258 purchasers, but only 28 are actual buyers, giving the lowest precision of the three methods. Overall, boosting achieves the best balance, identifying a larger fraction of true purchasers among its predicted buyers, while logistic regression and KNN struggle more with the severe class imbalance, often predicting non-purchasers as buyers.