# Marketing Classifier
This notebook uses the Bank Marketing dataset from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/bank+marketing


Useful links

- https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier.score
- https://discuss.analyticsvidhya.com/t/how-to-exclude-the-elements-from-the-legend-in-python/5393
- https://machinelearningmastery.com/framework-for-imbalanced-classification-projects/
- https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/
- https://towardsdatascience.com/machine-learning-classification-with-python-for-direct-marketing-2da27906ddac
- https://github.com/kunalBhashkar/Bank-Marketing-Data-Set-Classification/blob/master/Claffication_of_Bank_Marketing_Data_Set.ipynb



## Set Up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the full bank-marketing CSV; the file is semicolon-delimited.
df = pd.read_csv(
    '../data/bank-additional/bank-additional/bank-additional-full.csv',
    sep=";",
)

In [3]:
# Total number of elements in the frame (rows x columns) -- not the row count.
df.size

864948

In [4]:
# Preview the first rows to sanity-check that the file parsed as expected.
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [7]:
# Number of rows for each distinct value of `poutcome`.
df.groupby('poutcome')['poutcome'].count()

poutcome
failure         4252
nonexistent    35563
success         1373
Name: poutcome, dtype: int64

### Data Prep

1. Cleaning
2. Feature engineering

In [8]:
# Count missing values per column; all zeros means nothing needs imputing.
df.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [9]:
# The feature selection below is taken as given (assumed to come from prior
# EDA / feature analysis that is not part of this notebook).
cat_feature_cols = [
    "marital",
    "education",
    "contact",
    "default",
    "housing",
    "loan",
    "poutcome",
]
num_feature_cols = [
    "age",
    "pdays",
    "previous",
    "emp.var.rate",
    "euribor3m",
    "nr.employed",
]
# Categorical columns first, numeric columns second.
feature_cols = [*cat_feature_cols, *num_feature_cols]

In [10]:
# Feature matrix: an independent copy of the selected columns.
X = df.loc[:, feature_cols].copy()
# Binary target: 1 where the label is 'yes', 0 for every other value.
y = df['y'].eq('yes').astype('int64')

## Train/Test Split

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

In [12]:
# Look at the categorical columns of the training split before encoding them.
X_train.loc[:, cat_feature_cols]

Unnamed: 0,marital,education,contact,default,housing,loan,poutcome
21419,married,professional.course,cellular,unknown,no,no,nonexistent
208,single,basic.4y,telephone,no,no,no,nonexistent
20222,married,basic.4y,cellular,no,yes,no,nonexistent
6886,married,basic.9y,telephone,no,yes,yes,nonexistent
40525,single,university.degree,telephone,no,yes,no,nonexistent
...,...,...,...,...,...,...,...
6265,married,professional.course,telephone,unknown,no,no,nonexistent
11284,married,university.degree,telephone,no,no,no,nonexistent
38158,married,high.school,cellular,no,yes,no,success
860,married,university.degree,telephone,no,yes,no,nonexistent


## Feature Engineering

In [13]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore': categories not seen while fitting do not raise at
# transform time.
enc = OneHotEncoder(handle_unknown='ignore')
# Learn the categories from the training rows only, then apply the same
# encoding to both splits.
enc.fit(X_train[cat_feature_cols])
X_train_cat_encoded = enc.transform(X_train[cat_feature_cols])
X_test_cat_encoded = enc.transform(X_test[cat_feature_cols])

In [14]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Scaling statistics come from the training rows only; the test rows are
# transformed with those same statistics.
scaler.fit(X_train[num_feature_cols])
X_train_num_scaled = scaler.transform(X_train[num_feature_cols])
X_test_num_scaled = scaler.transform(X_test[num_feature_cols])

In [15]:
# Both training blocks must have the same number of rows before they are joined column-wise.
X_train_cat_encoded.shape, X_train_num_scaled.shape

((24712, 26), (24712, 6))

In [16]:
# Join the one-hot block (converted with .toarray()) and the scaled numeric
# block column-wise.
# NOTE: this rebinds X_train / X_test from DataFrames to plain arrays, so the
# encoding and scaling cells cannot be re-run after this one without first
# re-running the train/test split.
X_train = np.hstack([X_train_cat_encoded.toarray(), X_train_num_scaled])
X_test = np.hstack([X_test_cat_encoded.toarray(), X_test_num_scaled])

In [17]:
# Both splits should end up with the same number of feature columns.
X_test.shape, X_train.shape

((16476, 32), (24712, 32))

### Imbalanced Classes

In [23]:
import sys
!{sys.executable} -m pip install imblearn
from imblearn.over_sampling import SMOTE
sm=SMOTE()
X_balanced,y_balanced=sm.fit_resample(X_train,y_train)



## Classifier

In [24]:
# Random Forest Classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [25]:
# Baseline forest with 1000 trees. Seeded (same seed as the train/test split)
# so the fitted model and the F1 score reported below are reproducible.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42)

In [26]:
# Train on the SMOTE-balanced training data.
rfc.fit(X_balanced, y_balanced)

In [27]:
# F1 on the held-out test split, which was not resampled.
test_predictions = rfc.predict(X_test)
f1_score(y_test.values, test_predictions)


0.38067427661268916

In [28]:
# Class counts after resampling: both classes should be the same size.
y_balanced.value_counts()

0    21925
1    21925
Name: y, dtype: int64

In [29]:
# Class counts in the test split, which keeps the original class distribution.
y_test.value_counts()

0    14623
1     1853
Name: y, dtype: int64

## Hyperparam search

In [30]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
# Candidate forest sizes: five evenly spaced values from 200 to 2000 trees.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt', and it was removed in scikit-learn 1.3 (passing it now raises).
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree; None lets trees grow without a depth limit.
max_depth = [int(x) for x in np.linspace(10, 110, num=6)] + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Search space sampled by the randomized search.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [31]:
# The search is expensive, so it is only run when the training set is small;
# for larger training sets this cell does nothing.
if len(X_balanced) < 1000:  # can get too long!
    # Base model to tune; seeded so the search result is reproducible.
    rf = RandomForestClassifier(random_state=42)
    # Randomized search: 10 sampled parameter combinations, 3-fold cross
    # validation, F1 as the selection metric, using all available cores.
    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=random_grid,
        n_iter=10,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
        scoring='f1',
    )
    # NOTE(review): the folds are cut from SMOTE-resampled data, so synthetic
    # samples can sit in a different fold from the rows they were derived
    # from; best_score_ is therefore likely optimistic -- confirm before
    # relying on it.
    rf_random.fit(X_balanced, y_balanced)
    # Best mean cross-validated F1 found by the search.
    print(rf_random.best_score_)
    # F1 on the held-out test split. X_test is already one-hot encoded and
    # scaled, so it is passed to the model directly; running it through `enc`
    # again would fail because the encoder was fitted on the raw categorical
    # columns.
    print(f1_score(y_test.values, rf_random.best_estimator_.predict(X_test)))
    # F1 on the data the model was trained on, for comparison with the test score.
    print(f1_score(y_balanced.values, rf_random.best_estimator_.predict(X_balanced)))