In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import xgboost as xgb

In [5]:
df = pd.read_csv("../data/processed/data.csv")

In [9]:
numerical_features = ['age', 'balance', 'campaign', 'duration', 'DaysSinceLastContact']
categorical_features = ['job', 'marital', 'education', 'contact']
numerical_transformer = PowerTransformer(method='yeo-johnson')
categorical_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
transformed_data = pipeline.fit_transform(df)

print(transformed_data)
print(transformed_data.shape)

[[ 1.62508495  0.45666613 -1.14050478 ...  0.          0.
   1.        ]
 [ 0.46360128 -0.38747819 -1.14050478 ...  0.          0.
   1.        ]
 [-0.74892732 -0.40790439 -1.14050478 ...  0.          0.
   1.        ]
 ...
 [ 1.32502009 -0.29300839 -1.14050478 ...  0.          0.
   0.        ]
 [-0.62302951  0.06380804 -1.14050478 ...  0.          0.
   0.        ]
 [-0.15406758  0.21062104 -1.14050478 ...  0.          0.
   0.        ]]
(40000, 23)


In [16]:
X_t = transformed_data[:, : -1]
y_t = transformed_data[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X_t, y_t, test_size=0.2)

# Define hyperparameters
params = {
    'objective': 'binary:logistic',  # For binary classification
    'eval_metric': 'logloss',  # Logarithmic loss metric
    'max_depth': 3,  # Maximum depth of individual trees
    'learning_rate': 0.1,  # Step size shrinkage
    'n_estimators': 100  # Number of boosting rounds
}

# Create and train the model
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")




Accuracy: 0.917625
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      0.93      0.94      5496
         1.0       0.86      0.88      0.87      2504

    accuracy                           0.92      8000
   macro avg       0.90      0.91      0.91      8000
weighted avg       0.92      0.92      0.92      8000



In [15]:
import matplotlib.pyplot as plt

feature_importance = model.feature_importances_
plt.barh(range(len(feature_importance)), feature_importance, tick_label=X.columns)
plt.show()


AttributeError: 'numpy.ndarray' object has no attribute 'columns'