In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from scipy.stats import zscore

In [2]:
df = pd.read_csv("../data/processed/data.csv")

#### DATA PREPROCESSING

###### TRANSFORMING NUMERICAL FEATURES
Finding skew and using the transformer that produces the least amount of skew:

In [3]:
numerical_features = ['age', 'balance', 'duration', 'campaign', 'DaysSinceLastContact']
df[numerical_features].agg(['skew', 'kurtosis']).T

Unnamed: 0,skew,kurtosis
age,0.43608,-0.503828
balance,8.259236,141.833247
duration,3.165307,18.189852
campaign,4.730901,36.223536
DaysSinceLastContact,-0.392312,0.253429


In [4]:
SS = StandardScaler()
SSArray = SS.fit_transform(df[numerical_features])
SSData = pd.DataFrame(SSArray, columns=numerical_features)
SSData.agg(['skew', 'kurtosis']).T

Unnamed: 0,skew,kurtosis
age,0.43608,-0.503828
balance,8.259236,141.833247
duration,3.165307,18.189852
campaign,4.730901,36.223536
DaysSinceLastContact,-0.392312,0.253429


In [5]:
MM = MinMaxScaler()
MMArray = MM.fit_transform(df[numerical_features])
MMData = pd.DataFrame(MMArray, columns=numerical_features)
MMData.agg(['skew', 'kurtosis']).T

Unnamed: 0,skew,kurtosis
age,0.43608,-0.503828
balance,8.259236,141.833247
duration,3.165307,18.189852
campaign,4.730901,36.223536
DaysSinceLastContact,-0.392312,0.253429


In [6]:
PTY = PowerTransformer(method='yeo-johnson')
PTArray = PTY.fit_transform(df[numerical_features])
PTYData = pd.DataFrame(PTArray, columns=numerical_features)
PTYData.agg(['skew', 'kurtosis']).T

Unnamed: 0,skew,kurtosis
age,0.001901,-0.770615
balance,1.058649,82.982952
duration,0.013301,0.360479
campaign,0.213011,-1.16958
DaysSinceLastContact,-0.002016,0.150112


###### CONCLUSION
Yeo-Johnson transformation seems to be the most suitable.

###### CLIPPING
The feature 'balance' has a skew greater than 0.5.  Let's see if there are any outliers affecting it

In [7]:
zTransform = np.abs(zscore(PTYData.balance))
print('Number of enries where p(z) > 3: ', len(np.where(zTransform > 3)[0]))
print('Data loss upon removing entries where p(z) > 3: ', (len(np.where(zTransform > 3)[0]))/len(PTYData.balance))
df2 = df.drop(np.where(zTransform > 3)[0], axis=0).reset_index(drop=True)
print(df.shape)
print(df2.shape)

Number of enries where p(z) > 3:  765
Data loss upon removing entries where p(z) > 3:  0.019125
(40000, 13)
(39235, 13)


In [8]:
numerical_features = ['age', 'balance', 'duration', 'campaign', 'DaysSinceLastContact']
categorical_features = ['job', 'marital', 'education', 'contact']
numerical_transformer = PowerTransformer(method='yeo-johnson')
categorical_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
transformed_data = pipeline.fit_transform(df2)

X_transformed = transformed_data[:, :-1]
y_transformed = transformed_data[:, -1]

print(X_transformed, y_transformed)
print(X_transformed.shape)
print(y_transformed.shape)
print(type(X_transformed))

[[ 1.62878715  0.74864653  0.41302279 ...  1.          0.
   0.        ]
 [ 0.46908395 -0.50599955 -0.19023953 ...  0.          0.
   0.        ]
 [-0.74375393 -0.53669789 -0.89188542 ...  0.          0.
   0.        ]
 ...
 [ 1.32937774 -0.36480578 -0.06296466 ...  0.          0.
   0.        ]
 [-0.61772143  0.16595194  0.72597005 ...  1.          0.
   0.        ]
 [-0.1484667   0.38384466 -0.49385192 ...  0.          0.
   0.        ]] [1. 1. 1. ... 0. 0. 0.]
(39235, 22)
(39235,)
<class 'numpy.ndarray'>


In [9]:
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X_transformed, y_transformed)
print(X_smote.shape, y_smote.shape)


(53416, 22) (53416,)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_smote, y_smote, test_size=0.2)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(42732, 22) (10684, 22) (42732,) (10684,)


In [11]:
dtc_model = DecisionTreeClassifier()
# Train the model
dtc_model.fit(X_train, y_train)

# Make predictions
y_pred = dtc_model.predict(X_test)

f1_1 = f1_score(y_test, y_pred, average='binary')
print(f1_1)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)

print(cr)
dtc_model

0.9334577694820345
Accuracy: 0.93
[[4970  361]
 [ 352 5001]]
              precision    recall  f1-score   support

         0.0       0.93      0.93      0.93      5331
         1.0       0.93      0.93      0.93      5353

    accuracy                           0.93     10684
   macro avg       0.93      0.93      0.93     10684
weighted avg       0.93      0.93      0.93     10684



In [12]:
lr_model = LogisticRegression()

# Train the model
lr_model.fit(X_train, y_train)

# Make predictions
y_pred = lr_model.predict(X_test)

f1_1 = f1_score(y_test, y_pred, average='binary')
print(f1_1)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)

print(cr)

0.6645234164566861
Accuracy: 0.64
[[2999 2332]
 [1529 3824]]
              precision    recall  f1-score   support

         0.0       0.66      0.56      0.61      5331
         1.0       0.62      0.71      0.66      5353

    accuracy                           0.64     10684
   macro avg       0.64      0.64      0.64     10684
weighted avg       0.64      0.64      0.64     10684



In [16]:
knn_model = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)

In [17]:
knn_model.fit(X_train, y_train)

# Make predictions
y_pred = knn_model.predict(X_test)

f1_1 = f1_score(y_test, y_pred, average='binary')
print(f1_1)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)

print(cr)

0.8528619528619528
Accuracy: 0.84
[[3870 1461]
 [ 287 5066]]
              precision    recall  f1-score   support

         0.0       0.93      0.73      0.82      5331
         1.0       0.78      0.95      0.85      5353

    accuracy                           0.84     10684
   macro avg       0.85      0.84      0.83     10684
weighted avg       0.85      0.84      0.83     10684



In [15]:
svc_model = SVC(kernel = 'linear')
svc_model.fit(X_train, y_train)

# Make predictions
y_pred = svc_model.predict(X_test)

f1_1 = f1_score(y_test, y_pred, average='binary')
print(f1_1)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)

print(cr)

0.7159108378170638
Accuracy: 0.65
[[2331 3000]
 [ 696 4657]]
              precision    recall  f1-score   support

         0.0       0.77      0.44      0.56      5331
         1.0       0.61      0.87      0.72      5353

    accuracy                           0.65     10684
   macro avg       0.69      0.65      0.64     10684
weighted avg       0.69      0.65      0.64     10684



In [18]:
rf_model = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

f1_1 = f1_score(y_test, y_pred, average='binary')
print(f1_1)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)

print(cr)

0.925942920069744
Accuracy: 0.92
[[4832  499]
 [ 308 5045]]
              precision    recall  f1-score   support

         0.0       0.94      0.91      0.92      5331
         1.0       0.91      0.94      0.93      5353

    accuracy                           0.92     10684
   macro avg       0.93      0.92      0.92     10684
weighted avg       0.93      0.92      0.92     10684



In [34]:
model_param = {
    'DTC' : {
        'model': DecisionTreeClassifier(),
        'param': {
            'criterion':['gini', 'entropy'],
            'max_depth': np.arange(1, 22)
        }        
    },
    'LR' : {
        'model': LogisticRegression(),
        'param': {
            'penalty':['l1', 'l2'],
            'C': np.logspace(-4, 4, 20)[:4],
            'solver': ['liblinear'],
            'max_iter': [100, 1000]
        }
        
    },
    'KNN' : {
        'model': KNeighborsClassifier(),
        'param': {
            'n_neighbors':[5, 10],
            'metric': ['minkowski', 'euclidean', 'manhattan'],
            'p': [1, 2]
        }        
    },
    'RF' : {
        'model': RandomForestClassifier(n_estimators = 10, criterion = 'entropy'),
        'param': {
            'n_estimators':[5, 10, 15, 20],
            'criterion': ['gini', 'entropy'],
            'max_depth': [5, 10, 15, 20]
        }        
    },
    'SVC' : {
        'model': SVC(),
        'param': {
            'kernel':['linear', 'poly', 'rbf', 'sigmoid']
        }
    }
}
scores = []
for model_name, parameter in model_param.items():
    model_selection = GridSearchCV(
        estimator = parameter['model'],
        param_grid = parameter['param'],
        cv=5,
        return_train_score = False
    )
    model_selection.fit(X_train, y_train)
    scores.append({
        'Model': model_name,
        'Best_Score': model_selection.best_score_,
        'Best_params': model_selection.best_params_
    })


In [35]:
scores

[{'Model': 'DTC',
  'Best_Score': 0.9411915354344561,
  'Best_params': {'criterion': 'entropy', 'max_depth': 17}},
 {'Model': 'LR',
  'Best_Score': 0.6870263052570565,
  'Best_params': {'C': 0.0006951927961775605,
   'max_iter': 100,
   'penalty': 'l1',
   'solver': 'liblinear'}},
 {'Model': 'KNN',
  'Best_Score': 0.8411494696072275,
  'Best_params': {'metric': 'minkowski', 'n_neighbors': 10, 'p': 1}},
 {'Model': 'RF',
  'Best_Score': 0.9226107066032178,
  'Best_params': {'criterion': 'gini', 'max_depth': 20, 'n_estimators': 20}},
 {'Model': 'SVC',
  'Best_Score': 0.8872039575994644,
  'Best_params': {'kernel': 'rbf'}}]

In [38]:
import xgboost as xgb

params = {
    'objective': 'binary:logistic',  
    'eval_metric': 'logloss',
    'max_depth': 3,
    'learning_rate': 0.1,
    'n_estimators': 100
}

# Create and train the model
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")



Accuracy: 0.9282104080868588
Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.87      0.92      5331
         1.0       0.88      0.99      0.93      5353

    accuracy                           0.93     10684
   macro avg       0.93      0.93      0.93     10684
weighted avg       0.93      0.93      0.93     10684

