# Bank data analysis

The goal of your project is to create a robust classifier and use the data, with the help of which you will build a model that will recognize whether specific client will subscribe to a long term bank deposit (Binary classification).  
Make feature engineering but also try differnet models in order to get as much accuracy as possible.


In [140]:
import matplotlib.pyplot as plt
import time
import numpy as np
import pandas as pd
import seaborn as sns
import missingno #find missing data
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, model_selection, tree, preprocessing, metrics, linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from scipy.stats import randint as sp_randint
import datetime
import xgboost as xgb
from xgboost import XGBClassifier

%matplotlib inline

In [141]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

## Read data

In [142]:
#read data
data = pd.read_csv('dataset.csv', sep=';')  

Helper Methods 

In [143]:
#data['y'].replace({"no":0, "yes":1}, inplace=True)

In [144]:
education_map = {
    'basic.4y': 'Hauptschule',
    'high.school': 'Hochschulreife',
    'basic.6y': 'Realschule',
    'basic.9y': 'Abitur',
    'professional.course': 'Berufliche Weiterbildung',
    'unknown': 'unknown',
    'university.degree': 'Universitätsabschluss',
    'illiterate': 'Analphabet'
}
data['education'] = data['education'].map(education_map)

In [145]:
data.job.unique()

array(['housemaid', 'services', 'admin.', 'blue-collar', 'technician',
       'retired', 'management', 'unemployed', 'self-employed', 'unknown',
       'entrepreneur', 'student'], dtype=object)

In [146]:
education_map = {
    'basic.4y': 'Hauptschule',
    'high.school': 'Hochschulreife',
    'basic.6y': 'Realschule',
    'basic.9y': 'Abitur',
    'professional.course': 'Berufliche Weiterbildung',
    'unknown': 'unknown',
    'university.degree': 'Universitätsabschluss',
    'illiterate': 'Analphabet'
}
data['education'] = data['education'].map(education_map)

In [147]:
job_map = {
    'housemaid': 'Service',
    'services': 'Dienstleistungen',
    'admin.': 'Verwaltung',
    'blue-collar': 'Industrie',
    'technician': 'Technik',
    'retired': 'Rentne',
    'management': 'Management',
    'unemployed': 'Arbeitslos',
    'self-employed': 'Selbstständig',
    'unknown': 'Unbekannt',
    'entrepreneur': 'Unternehmung',
    'student': 'Studierend'
}
data['job'] = data['job'].map(job_map)

In [148]:
marital_map = {
    'married': 'Verheiratet',
    'divorced': 'Geschieden',
    'single': 'Single',
    'unknown': 'Unbekannt'
}

data['marital'] = data['marital'].map(marital_map)

In [149]:
contact_map = {
    'cellular': 'Online',
    'telephone': 'Telefon',
    'unknown': 'Unbekannt'
}

data['contact'] = data['contact'].map(contact_map)

In [150]:
for column in data.select_dtypes(include=['object']).columns:
    data[column] = data[column].replace({'yes': 'ja', 'no': 'nein'})

In [151]:
previous_map = {
    'nonexistent': 'Nicht vorhanden',
    'failure': 'Fehlschlag',
    'success': 'Erfolg'
}

data['poutcome'] = data['poutcome'].map(previous_map)

In [152]:
month_map = {
    'jan': 'Januar',
    'feb': 'Februar',
    'mar': 'März',
    'apr': 'April',
    'may': 'Mai',
    'jun': 'Juni',
    'jul': 'Juli',
    'aug': 'August',
    'sep': 'September',
    'oct': 'Oktober',
    'nov': 'November',
    'dec': 'Dezember'
}

data['month'] = data['month'].map(month_map)

In [153]:
day_of_week_map = {
    'mon': 'Montag',
    'tue': 'Dienstag',
    'wed': 'Mittwoch',
    'thu': 'Donnerstag',
    'fri': 'Freitag',
    'sat': 'Samstag',
    'sun': 'Sonntag'
}

data['day_of_week'] = data['day_of_week'].map(day_of_week_map)

In [7]:
data = data.rename(columns={
    'age': 'Alter',
    'job': 'Beruf',
    'marital': 'Familienstand',
    'education': 'Bildung',
    'default': 'Kreditverpflichtung',
    'housing': 'Wohnsituation',
    'loan': 'Kredit',
    'contact': 'Kontakt',
    'month': 'Monat',
    'day_of_week': 'Wochentag',
    'duration': 'Dauer',
    'campaign': 'Kampagne',
    'pdays': 'Kontaktversuche',
    'previous': 'Vorherige Kontaktanzahl',
    'poutcome': 'Vorheriges Ergebnis',
    'emp.var.rate': 'Arbeitslosenquote',
    'cons.price.idx': 'Verbraucherpreisindex',
    'cons.conf.idx': 'Verbraucher-Konfidenzindex',
    'euribor3m': 'Euribor 3M',
    'nr.employed': 'Arbeitnehmerquote'
})

In [8]:
data

Unnamed: 0,Alter,Beruf,Familienstand,Bildung,Kreditverpflichtung,Wohnsituation,Kredit,Kontakt,Monat,Wochentag,...,Kampagne,Kontaktversuche,Vorherige Kontaktanzahl,Vorheriges Ergebnis,Arbeitslosenquote,Verbraucherpreisindex,Verbraucher-Konfidenzindex,Euribor 3M,Arbeitnehmerquote,y
0,56,Service,Verheiratet,,nein,nein,nein,Telefon,Mai,Montag,...,1,999,0,Nicht vorhanden,1.1,93.994,-36.4,4.857,5191.0,nein
1,57,Dienstleistungen,Verheiratet,,unknown,nein,nein,Telefon,Mai,Montag,...,1,999,0,Nicht vorhanden,1.1,93.994,-36.4,4.857,5191.0,nein
2,37,Dienstleistungen,Verheiratet,,nein,ja,nein,Telefon,Mai,Montag,...,1,999,0,Nicht vorhanden,1.1,93.994,-36.4,4.857,5191.0,nein
3,40,Verwaltung,Verheiratet,,nein,nein,nein,Telefon,Mai,Montag,...,1,999,0,Nicht vorhanden,1.1,93.994,-36.4,4.857,5191.0,nein
4,56,Dienstleistungen,Verheiratet,,nein,nein,ja,Telefon,Mai,Montag,...,1,999,0,Nicht vorhanden,1.1,93.994,-36.4,4.857,5191.0,nein
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,Rentne,Verheiratet,,nein,ja,nein,Online,November,Freitag,...,1,999,0,Nicht vorhanden,-1.1,94.767,-50.8,1.028,4963.6,ja
41184,46,Industrie,Verheiratet,,nein,nein,nein,Online,November,Freitag,...,1,999,0,Nicht vorhanden,-1.1,94.767,-50.8,1.028,4963.6,nein
41185,56,Rentne,Verheiratet,,nein,ja,nein,Online,November,Freitag,...,2,999,0,Nicht vorhanden,-1.1,94.767,-50.8,1.028,4963.6,nein
41186,44,Technik,Verheiratet,,nein,nein,nein,Online,November,Freitag,...,1,999,0,Nicht vorhanden,-1.1,94.767,-50.8,1.028,4963.6,ja


In [9]:
data.to_csv('dataset-students.csv', index=False)