In [18]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix

In [2]:
df_train = pd.read_csv("../data/train_transformed.csv")
df_test = pd.read_csv("../data/test_transformed.csv")

In [3]:
df_train.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24382,24383,24384,24385,24386,24387,24388,24389,24390,24391
job_type_blue-collar,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_entrepreneur,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_housemaid,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_management,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
job_type_retired,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_self-employed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_services,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
job_type_student,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_technician,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
job_type_unemployed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [4]:
df_test.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6088,6089,6090,6091,6092,6093,6094,6095,6096,6097
job_type_blue-collar,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
job_type_entrepreneur,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_housemaid,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_management,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_retired,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_self-employed,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_services,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
job_type_student,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
job_type_technician,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
job_type_unemployed,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#filtering out columns not required for traditional modeling from df_train and df_test
df_train = df_train.loc[:,'job_type_blue-collar':'day_of_month_cos']
df_test = df_test.loc[:,'job_type_blue-collar':'day_of_month_cos']

In [6]:
#separating the x and y columns in train and test data 
df_train_x = df_train[[x for x in df_train.columns if x != 'term_deposit_subscribed']]
df_train_y = df_train[['term_deposit_subscribed']]
#reshaping the y variable to make it suitable for modeling
df_train_y = np.array(df_train_y).reshape(-1)

df_test_x = df_test[[x for x in df_test.columns if x != 'term_deposit_subscribed']]
df_test_y = df_test[['term_deposit_subscribed']]
#reshaping the y variable to make it suitable for modeling
df_test_y = np.array(df_test_y).reshape(-1)

### Logisitic Regression

In [8]:
parameters = [{'penalty' : ['l2'], 'C' : np.logspace(-2, 2, 30)}]
lr_gridcv = GridSearchCV(LogisticRegression(max_iter = 800), param_grid = parameters, cv = 5, n_jobs=-1, verbose = True, scoring = 'neg_log_loss')
lr_tuned = lr_gridcv.fit(df_train_x,df_train_y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#pd.DataFrame(lr_tuned.cv_results_).sort_values('rank_test_score')

In [9]:
#fitting the logistic regression model using the best set of parameters obtained above 
lr_tuned = LogisticRegression(**lr_tuned.best_params_ ).fit(df_train_x,df_train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
y_pred = lr_tuned.predict(df_test_x)
y_true = df_test_y
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95      5452
           1       0.67      0.30      0.42       646

    accuracy                           0.91      6098
   macro avg       0.80      0.64      0.69      6098
weighted avg       0.90      0.91      0.90      6098



### RandomForest 

In [14]:
param_grid = [{'criterion' : ['gini', 'entropy'], 'max_features' : ['auto', 'sqrt', 'log2'], 'n_estimators' : [100]}]
rf_gridcv = GridSearchCV(RandomForestClassifier(n_jobs = -1), param_grid = param_grid, cv = 5, n_jobs=-1, verbose = True, scoring = 'neg_log_loss')
rf_gridcv.fit(df_train_x, np.array(df_train_y).reshape(-1))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1), n_jobs=-1,
             param_grid=[{'criterion': ['gini', 'entropy'],
                          'max_features': ['auto', 'sqrt', 'log2'],
                          'n_estimators': [100]}],
             scoring='neg_log_loss', verbose=True)

In [15]:
rf_tuned = RandomForestClassifier(**rf_gridcv.best_params_).fit(df_train_x,df_train_y)

In [16]:
y_pred = rf_tuned.predict(df_test_x)
y_true = df_test_y
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.98      0.95      5452
           1       0.68      0.38      0.49       646

    accuracy                           0.92      6098
   macro avg       0.80      0.68      0.72      6098
weighted avg       0.90      0.92      0.90      6098



In [26]:
param_grid = [{'criterion' : ['gini', 'entropy'], 'max_features' : ['auto', 'sqrt', 'log2'], 'n_estimators' : [100]}]
rf_gridcv = GridSearchCV(RandomForestClassifier(n_jobs = -1), param_grid = param_grid, cv = 5, n_jobs=-1, verbose = True, scoring = 'recall')
rf_gridcv.fit(df_train_x, np.array(df_train_y).reshape(-1))
rf_tuned = RandomForestClassifier(**rf_gridcv.best_params_).fit(df_train_x,df_train_y)
y_pred = rf_tuned.predict(df_test_x)
y_true = df_test_y
print(classification_report(y_true, y_pred))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
              precision    recall  f1-score   support

           0       0.93      0.98      0.95      5452
           1       0.68      0.40      0.51       646

    accuracy                           0.92      6098
   macro avg       0.81      0.69      0.73      6098
weighted avg       0.91      0.92      0.91      6098



In [27]:
confusion_matrix(y_true,y_pred)

array([[5332,  120],
       [ 386,  260]])