# Assignment 7
## Bank Marketing Decision Trees



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

# Load the semicolon-separated bank marketing sample into a DataFrame.
bank = pd.read_csv('bank.csv', sep=";")

# Preview the first rows to check that the columns were parsed as expected.
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


### 1. Attribute review:

| Attribute | Data type | Variable type |
|---|---|---|
| Age | Integer | Continuous |
| Job | String | Discrete |
| Marital [status] | String | Discrete |
| Education | String | Discrete |
| Default: client has credit in default | String | Discrete |
| Balance | Integer | Continuous |
| Housing (has housing loan) | String | Discrete |
| Loan: has personal loan | String | Discrete |
| Contact: contact communication type | String | Discrete |
| Day | Integer | Continuous (1-31) |
| Month | String | Somewhat continuous - as much as Day... |
| Duration | Integer | Continuous |
| Campaign: number of client contacts during campaign | Integer | Continuous |
| pdays: days since last client contact from previous campaign (-1 in `bank.csv` for clients with no previous contact; the full dataset uses 999) | Integer | Continuous |
| Previous: number of past client contacts before current campaign | Integer | Continuous |
| poutcome: outcome of previous marketing campaign | String | Discrete |
| y (**target**): has client subscribed to a term deposit | String | Binary |

### 2. Label encoding

In [2]:
# Label exploration

print bank["poutcome"].value_counts()

unknown    3705
failure     490
other       197
success     129
dtype: int64


In [3]:
def encode_labels(frame, column, categories):
    """Replace the string labels in ``frame[column]`` with integer codes.

    A new ``LabelEncoder`` is fitted on ``categories`` and then used to
    transform ``frame[column]``; the result overwrites that column.

    Parameters:
        frame: DataFrame whose column is overwritten.
        column: name of the column to encode.
        categories: list of labels the encoder is fitted on.

    Returns:
        The fitted encoder.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(categories)
    frame[column] = encoder.transform(frame[column])
    return encoder


# NOTE: every call below overwrites a string column of `bank` with integers.
# Judging by the encoded preview, codes follow the alphabetical order of the
# labels rather than the order in which they are listed here.

# job: type of job
le_job = encode_labels(bank, "job", ['admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown'])

# marital: marital status
le_marital = encode_labels(bank, "marital", ['divorced','married','single','unknown'])

# education: bank.csv uses the coarse labels below, not the detailed
# 'basic.4y' ... 'university.degree' labels of the full dataset.
le_education = encode_labels(bank, "education", ['secondary', 'tertiary', 'primary', 'unknown'])

# default: has credit in default?
le_default = encode_labels(bank, "default", ['no','yes','unknown'])

# housing: has housing loan? (the encoded preview shows 'yes' -> 2)
le_housing = encode_labels(bank, "housing", ['no','yes','unknown'])

# loan: has personal loan?
le_loan = encode_labels(bank, "loan", ['no','yes','unknown'])

# contact: contact communication type ('unknown' occurs in bank.csv as well)
le_contact = encode_labels(bank, "contact", ['cellular','telephone', 'unknown'])

# month: last contact month of year.
# NOTE: in the encoded preview 'apr' -> 0 and 'oct' -> 10, so the codes are
# alphabetical, not chronological.
le_month = encode_labels(bank, "month", ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'])

# poutcome: outcome of the previous marketing campaign; bank.csv contains
# 'failure', 'other', 'success' and 'unknown' (see the value counts above).
le_poutcome = encode_labels(bank, "poutcome", ['failure','other','success','unknown'])

# y (target): has the client subscribed a term deposit? ('no' -> 0 in the preview)
le_y = encode_labels(bank, "y", ['yes','no'])

bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10,1,0,0,1787,0,0,0,19,10,79,1,-1,0,3,0
1,33,7,1,1,0,4789,2,2,0,11,8,220,1,339,4,0,0
2,35,4,2,2,0,1350,2,0,0,16,0,185,1,330,1,0,0
3,30,4,1,2,0,1476,2,2,2,3,6,199,4,-1,0,3,0
4,59,1,1,1,0,0,2,0,2,5,8,226,1,-1,0,3,0


### 3. Decision tree

In [4]:
from sklearn import tree

# Split the encoded frame into the target column and the remaining features.
target_column = "y"
y = bank[target_column]
X = bank.drop(target_column, axis=1)

# Class balance of the target.
y.value_counts()

0    4000
1     521
dtype: int64

In [28]:
# Build CART classifier
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=1)
clf = clf.fit(X, y)

N = len(X)
tp=0
fp=0
tn=0
fn=0
pred = []

print "Score: ", clf.score(X, y)
    
# Loop through all contacts and predict outcome using the classifier
for i in range (1, N):
    p = clf.predict(X[i-1:i])[0]
    pred.append(p)
    if(p == 1):
        if(y[i] == 1):
            tp+=1
        else:
            fp+=1
    else:
        if(y[i] == 0):
            tn+=1
        else:
            fn+=1

# Calculate precision and recall
precision = tp/float(tp+fp)
recall = tp/float(tp+fn)
accuracy = (tp+tn)/float(N)
F1 = 2 * (precision * recall) / (precision + recall)

# Print findings
print "TP: ",tp
print "FP: ",fp
print "FN: ",fn
print "TN: ",tn
print "Precision: ",precision
print "Recall: ",recall
print "Accuracy: ",accuracy
print "F1: ",F1

 Score:  0.893386418934
TP:  30
FP:  161
FN:  491
TN:  3838
Precision:  0.157068062827
Recall:  0.0575815738964
Accuracy:  0.855562928556
F1:  0.0842696629213


### 4. Cross-validation of model

In [47]:
from sklearn.cross_validation import KFold

N = len(X)
folds = 5

kf = KFold(N, n_folds=folds)

#inits
fold = 0;
trainSum = 0
testSum = 0

for train, test in kf:    
    fold +=1
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    clf = tree.DecisionTreeClassifier(max_depth=3, random_state=1)
    clf = clf.fit(X_train, y_train)
    print "Fold",fold,"training score: ", clf.score(X_train, y_train)
    print "Fold",fold,"test score: ", clf.score(X_test, y_test)
    trainSum += clf.score(X_train, y_train)
    testSum += clf.score(X_test, y_test)
    
avgTrainingScore = trainSum/folds
avgTestScore = testSum/folds

print
print "Average training score:",avgTrainingScore
print "Average test score:",avgTestScore

Fold 1 training score:  0.895113808801
Fold 1 test score:  0.965040058267
Fold 2 training score:  0.898118361153
Fold 2 test score:  0.953750910415
Fold 3 training score:  0.899059180577
Fold 3 test score:  0.949866472445
Fold 4 training score:  0.912536797062
Fold 4 test score:  0.896078669418
Fold 5 training score:  0.942186883554
Fold 5 test score:  0.709845817652

Average training score: 0.90940300623
Average test score: 0.894916385639


### 5. Full dataset

In [51]:
# Load the full dataset (semicolon-separated).
bankFull = pd.read_csv('bank-additional-full.csv', sep=";")

# Each categorical column below is overwritten with the integer codes of an
# encoder fitted on an explicit list of labels.

# job: type of job
le_job = preprocessing.LabelEncoder().fit(['admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown'])
bankFull["job"] = le_job.transform(bankFull["job"])

# marital: marital status (note: 'divorced' means divorced or widowed)
le_marital = preprocessing.LabelEncoder().fit(['divorced','married','single','unknown'])
bankFull["marital"] = le_marital.transform(bankFull["marital"])

# education: the full dataset is encoded with the detailed labels, unlike
# bank.csv which used 'primary' / 'secondary' / 'tertiary' / 'unknown'.
le_education = preprocessing.LabelEncoder().fit(['basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown'])
bankFull["education"] = le_education.transform(bankFull["education"])

# default: has credit in default?
le_default = preprocessing.LabelEncoder().fit(['no','yes','unknown'])
bankFull["default"] = le_default.transform(bankFull["default"])

# housing: has housing loan?
le_housing = preprocessing.LabelEncoder().fit(['no','yes','unknown'])
bankFull["housing"] = le_housing.transform(bankFull["housing"])

# loan: has personal loan?
le_loan = preprocessing.LabelEncoder().fit(['no','yes','unknown'])
bankFull["loan"] = le_loan.transform(bankFull["loan"])

# contact: contact communication type ('unknown' is also accepted by the encoder)
le_contact = preprocessing.LabelEncoder().fit(['cellular','telephone', 'unknown'])
bankFull["contact"] = le_contact.transform(bankFull["contact"])

# month: last contact month of year
le_month = preprocessing.LabelEncoder().fit(['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'])
bankFull["month"] = le_month.transform(bankFull["month"])

# day_of_week: last contact day of the week (column used here instead of "day")
le_day_of_week = preprocessing.LabelEncoder().fit(['mon','tue','wed','thu','fri'])
bankFull["day_of_week"] = le_day_of_week.transform(bankFull["day_of_week"])

# poutcome: outcome of the previous marketing campaign
# ('nonexistent' here; 'unknown' is also accepted by the encoder)
le_poutcome = preprocessing.LabelEncoder().fit(['failure','nonexistent','success','unknown'])
bankFull["poutcome"] = le_poutcome.transform(bankFull["poutcome"])

# y (target): has the client subscribed a term deposit?
le_y = preprocessing.LabelEncoder().fit(['yes','no'])
bankFull["y"] = le_y.transform(bankFull["y"])


# Target column and feature matrix for the full dataset.
y = bankFull["y"]
X = bankFull.drop("y", axis=1)

# Evaluation

N = len(X)
folds = 5

kf = KFold(N, n_folds=folds)

#inits
fold = 0;
trainSum = 0
testSum = 0

for train, test in kf:    
    fold +=1
    X_train, X_test, y_train, y_test = X.iloc[train], X.iloc[test], y.iloc[train], y.iloc[test]
    clf = tree.DecisionTreeClassifier(max_depth=3, random_state=1)
    clf = clf.fit(X_train, y_train)
    print "Fold",fold,"training score: ", clf.score(X_train, y_train)
    print "Fold",fold,"test score: ", clf.score(X_test, y_test)
    trainSum += clf.score(X_train, y_train)
    testSum += clf.score(X_test, y_test)
    
print
print "Average training score:",trainSum/folds
print "Average test score:",testSum/folds

print
print "Average training score improvement with full dataset:",(trainSum/folds) - avgTrainingScore
print "Average test score improvement with full dataset:",testSum/folds - avgTestScore


      age  job  marital  education  default  housing  loan  contact  month  \
8238   44    0        2          6        0        0     2        1      6   
8239   38    0        2          6        0        2     0        1      6   
8240   40    1        1          0        1        2     0        1      6   
8241   30    6        1          5        0        2     0        1      6   
8242   36    8        2          2        0        2     0        1      6   

      day_of_week  duration  campaign  pdays  previous  poutcome  \
8238            3       143         1    999         0         1   
8239            3       284         1    999         0         1   
8240            3       161         3    999         0         1   
8241            3       123         4    999         0         1   
8242            3       771         1    999         0         1   

      emp.var.rate  cons.price.idx  cons.conf.idx  euribor3m  nr.employed  
8238           1.4          94.465          -4