In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("train-bank-campaign-data.csv")

In [3]:
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y,id_var
0,56,services,married,high.school,no,no,yes,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,65790
1,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,55643
2,59,admin.,married,professional.course,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,66189
3,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,70807
4,24,technician,single,professional.course,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no,42625


In [4]:
data.shape

(37084, 22)

In [31]:
data.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
target              int32
dtype: object

In [5]:
stats = data.describe().T
stats['unique'] = [len(data[column].unique()) for column in stats.index]
stats['null'] = [data[column].isnull().sum() for column in stats.index]
stats['unknown'] = [(data[column] == "unknown").sum() for column in stats.index]
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,unique,null,unknown
age,37084.0,40.042714,10.432965,17.0,32.0,38.0,47.0,98.0,77,0,0
duration,37084.0,258.237946,258.730909,0.0,102.0,180.0,319.25,4918.0,1509,0,0
campaign,37084.0,2.569545,2.770611,1.0,1.0,2.0,3.0,56.0,42,0,0
pdays,37084.0,962.530849,186.773063,0.0,999.0,999.0,999.0,999.0,27,0,0
previous,37084.0,0.172986,0.495681,0.0,0.0,0.0,0.0,7.0,8,0,0
emp.var.rate,37084.0,0.082669,1.568997,-3.4,-1.8,1.1,1.4,1.4,10,0,0
cons.price.idx,37084.0,93.576076,0.578493,92.201,93.075,93.749,93.994,94.767,26,0,0
cons.conf.idx,37084.0,-40.505183,4.622045,-50.8,-42.7,-41.8,-36.4,-26.9,26,0,0
euribor3m,37084.0,3.621668,1.733972,0.634,1.344,4.857,4.961,5.045,314,0,0
nr.employed,37084.0,5167.058664,72.196605,4963.6,5099.1,5191.0,5228.1,5228.1,11,0,0


In [6]:
cat_stats = data.describe(include=['object']).T
cat_stats['null'] = [data[column].isnull().sum() for column in cat_stats.index]
cat_stats['unknown'] = [(data[column] == "unknown").sum() for column in cat_stats.index]
cat_stats

Unnamed: 0,count,unique,top,freq,null,unknown
job,37084,12,admin.,9420,0,306
marital,37084,4,married,22479,0,72
education,37084,8,university.degree,10971,0,1549
default,37084,3,no,29382,0,7700
housing,37084,3,yes,19433,0,882
loan,37084,3,no,30561,0,882
contact,37084,2,cellular,23522,0,0
month,37084,10,may,12420,0,0
day_of_week,37084,5,thu,7778,0,0
poutcome,37084,3,nonexistent,32023,0,0


What does all this information tell us?

1) There is not null values

2) We can see which columns don't contain useful information for our predictions:
* The id_var column contains a unique identifier for each row and will not be useful for prediction
* The duration column is the last contact duration. This attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

3) The column y is categorical, so we have to transform it to numerical.

4) We should analyze the correlation of numerical features and delete the columns highly correlated.

5) Transform categorical features to numerical using one-hot encode (get_dummmies)

6) There are some categorical columns that contain "unknown" values. One option is after transform these, delete the columns referring to the "unknown".

In [7]:
data.drop(columns = ['id_var', 'duration'], inplace = True)

In [8]:
# Label Encoding
data['target'] = np.where(data['y'] == "yes", 1, 0)
data.drop(columns = 'y', inplace = True)

In [26]:
features = data[[col for col in data if col not in ['target']]].reset_index(drop = True)
features.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
1,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
2,59,admin.,married,professional.course,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
3,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
4,24,technician,single,professional.course,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0


In [10]:
target = data[['target']]

In [11]:
corr_matrix = features.corr().abs()
corr_matrix

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
age,1.0,0.002777,0.035689,0.022897,9.9e-05,0.002773,0.129262,0.010845,0.018226
campaign,0.002777,1.0,0.052624,0.0783,0.15039,0.126598,0.012388,0.13464,0.143609
pdays,0.035689,0.052624,1.0,0.586728,0.271053,0.078072,0.089587,0.297038,0.372449
previous,0.022897,0.0783,0.586728,1.0,0.41888,0.200008,0.052069,0.45322,0.500861
emp.var.rate,9.9e-05,0.15039,0.271053,0.41888,1.0,0.774727,0.199778,0.972243,0.906857
cons.price.idx,0.002773,0.126598,0.078072,0.200008,0.774727,1.0,0.062526,0.687433,0.52101
cons.conf.idx,0.129262,0.012388,0.089587,0.052069,0.199778,0.062526,1.0,0.280493,0.102931
euribor3m,0.010845,0.13464,0.297038,0.45322,0.972243,0.687433,0.280493,1.0,0.945145
nr.employed,0.018226,0.143609,0.372449,0.500861,0.906857,0.52101,0.102931,0.945145,1.0


In [12]:
tri = np.triu(np.ones(corr_matrix.shape), k = 1).astype(np.bool)
upper = corr_matrix.where(tri)
# Find index of feature columns with correlation greater than 90%
to_drop = [col for col in upper.columns if any(upper[col] > 0.9)]
to_drop

['euribor3m', 'nr.employed']

In [13]:
# Drop highly colinear features
features.drop(features[to_drop], axis = 1, inplace = True)
features.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx
0,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4
1,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4
2,59,admin.,married,professional.course,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4
3,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4
4,24,technician,single,professional.course,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4


In [23]:
np.array(features.iloc[0]).reshape(1,-1)

array([[56, 'services', 'married', 'high.school', 'no', 'no', 'yes',
        'telephone', 'may', 'mon', 1, 999, 0, 'nonexistent', 1.1, 93.994,
        -36.4]], dtype=object)

In [21]:
list(features.iloc[0])

[56,
 'services',
 'married',
 'high.school',
 'no',
 'no',
 'yes',
 'telephone',
 'may',
 'mon',
 1,
 999,
 0,
 'nonexistent',
 1.1,
 93.994,
 -36.4]

In [14]:
cat_data = data.select_dtypes(include=['object']).copy()

In [15]:
cat_data.columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

In [29]:
cat_data['job'].unique()

array(['services', 'admin.', 'blue-collar', 'technician', 'housemaid',
       'retired', 'management', 'unknown', 'entrepreneur', 'unemployed',
       'student', 'self-employed'], dtype=object)

In [26]:
cat_data_dummies = pd.get_dummies(data[cat_data.columns])
cat_data_dummies

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37079,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
37080,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
37081,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
37082,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0


In [33]:
df = features.copy()
df.drop(columns = cat_data.columns, inplace = True)
df.head()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0
1,45,1,999,0,1.1,93.994,-36.4,4.857,5191.0
2,59,1,999,0,1.1,93.994,-36.4,4.857,5191.0
3,41,1,999,0,1.1,93.994,-36.4,4.857,5191.0
4,24,1,999,0,1.1,93.994,-36.4,4.857,5191.0


In [41]:
list(df[-1:].values)

[array([ 7.4000e+01,  3.0000e+00,  9.9900e+02,  1.0000e+00, -1.1000e+00,
         9.4767e+01, -5.0800e+01,  1.0280e+00,  4.9636e+03])]

In [47]:
list(df.iloc[-1].values)

[74.0, 3.0, 999.0, 1.0, -1.1, 94.76700000000001, -50.8, 1.028, 4963.6]

In [36]:
features = pd.concat([cat_data_dummies, df], axis=1)
features

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_failure,poutcome_nonexistent,poutcome_success,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx
0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,56,1,999,0,1.1,93.994,-36.4
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,45,1,999,0,1.1,93.994,-36.4
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,59,1,999,0,1.1,93.994,-36.4
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,41,1,999,0,1.1,93.994,-36.4
4,0,0,0,0,0,0,0,0,0,1,...,0,1,0,24,1,999,0,1.1,93.994,-36.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37079,0,0,0,0,0,1,0,0,0,0,...,0,1,0,73,1,999,0,-1.1,94.767,-50.8
37080,0,1,0,0,0,0,0,0,0,0,...,0,1,0,46,1,999,0,-1.1,94.767,-50.8
37081,0,0,0,0,0,1,0,0,0,0,...,0,1,0,56,2,999,0,-1.1,94.767,-50.8
37082,0,0,0,0,0,0,0,0,0,1,...,0,1,0,44,1,999,0,-1.1,94.767,-50.8


In [37]:
unknow_columns = [c for c in features.columns if 'unknow' in c]
unknow_columns

['job_unknown',
 'marital_unknown',
 'education_unknown',
 'default_unknown',
 'housing_unknown',
 'loan_unknown']

In [39]:
features.drop(columns = unknow_columns, inplace = True)

In [40]:
features.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_failure,poutcome_nonexistent,poutcome_success,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx
0,0,0,0,0,0,0,0,1,0,0,...,0,1,0,56,1,999,0,1.1,93.994,-36.4
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,45,1,999,0,1.1,93.994,-36.4
2,1,0,0,0,0,0,0,0,0,0,...,0,1,0,59,1,999,0,1.1,93.994,-36.4
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,41,1,999,0,1.1,93.994,-36.4
4,0,0,0,0,0,0,0,0,0,1,...,0,1,0,24,1,999,0,1.1,93.994,-36.4


In [55]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(
    features,
    target,
    test_size = .2,
    random_state = 42
)

train_X.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_failure,poutcome_nonexistent,poutcome_success,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx
26924,0,1,0,0,0,0,0,0,0,0,...,1,0,0,43,2,999,1,-1.8,93.075,-47.1
36434,1,0,0,0,0,0,0,0,0,0,...,0,0,1,55,2,6,3,-1.7,94.027,-38.3
9444,0,0,0,0,1,0,0,0,0,0,...,0,1,0,49,18,999,0,1.4,94.465,-41.8
25809,0,0,0,0,0,0,0,1,0,0,...,0,1,0,52,2,999,0,-1.8,93.075,-47.1
9641,0,0,0,0,0,1,0,0,0,0,...,0,1,0,55,4,999,0,1.4,94.465,-41.8


In [56]:
from sklearn.linear_model import LogisticRegression

log_regr = LogisticRegression(solver = 'lbfgs')
log_regr.fit(train_X, train_y.values.ravel())
log_regr.score(test_X, test_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8984764729675071

In [58]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier


classifiers = {'Logistic Regression': LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr'),
              'Gaussian NB': GaussianNB(),
              'Bernoulli NB': BernoulliNB(),
              'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
              'KNeighborsClassifier': KNeighborsClassifier(), 
              'Linear SVC': LinearSVC(),
              'Neural Network':  MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)}
              
def compare_models(x, y, model_dict, folds=3):
    results = []
    for name, model in model_dict.items():
        scores = cross_val_score(model, x, y, cv=folds)
        stats = [name, scores.mean(), min(scores), max(scores), scores.std(), pd.Series(scores).mad(), scores]
        results.append(stats)
    df = pd.DataFrame(results, columns = ['Model', 'Mean', 'Min', 'Max','Std', 'Mad', 'Score'])
    df = df.sort_values('Mean', ascending = False)
    return df

In [59]:
compare_models(features, target, classifiers, folds=3)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation f

Unnamed: 0,Model,Mean,Min,Max,Std,Mad,Score
3,Random Forest,0.700135,0.32732,0.886579,0.26362,0.248544,"[0.8865070376961657, 0.8865787557640967, 0.327..."
6,Neural Network,0.628863,0.113502,0.886579,0.364415,0.343574,"[0.8865070376961657, 0.8865787557640967, 0.113..."
5,Linear SVC,0.60764,0.114554,0.886912,0.349687,0.328724,"[0.8869115029930432, 0.8214545748725831, 0.114..."
2,Bernoulli NB,0.535316,0.234447,0.890794,0.270721,0.236986,"[0.8907943698430675, 0.48070544454332176, 0.23..."
4,KNeighborsClassifier,0.502498,0.254834,0.883757,0.273583,0.254173,"[0.8837566736773985, 0.36890219237925737, 0.25..."
0,Logistic Regression,0.469598,0.259283,0.88675,0.294974,0.278101,"[0.8867497168742922, 0.2627619124666289, 0.259..."
1,Gaussian NB,0.448456,0.22304,0.889905,0.312174,0.294299,"[0.8899045461899369, 0.2230402071029852, 0.232..."


## Analysing test and sample csv

In [42]:
test = pd.read_csv('test.csv')

In [43]:
test.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,id_var
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,43935
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,73973
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,72312
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,74889
4,30,unemployed,married,high.school,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,49787


In [53]:
test.shape

(4104, 17)

In [51]:
sample = pd.read_csv('sample-submission.csv')

In [52]:
sample.shape

(4104, 2)

In [44]:
test.drop(columns = ['id_var', 'duration'], inplace = True)

In [46]:
test.drop(test[to_drop], axis = 1, inplace = True)
test.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4
4,30,unemployed,married,high.school,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4


In [47]:
cat_test_dummies = pd.get_dummies(test[cat_data.columns])
cat_test_dummies

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_oct,month_sep,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4099,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4100,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4101,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4102,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [48]:
df1 = test.copy()
df1.drop(columns = cat_data.columns, inplace = True)
df1.head()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx
0,56,1,999,0,1.1,93.994,-36.4
1,57,1,999,0,1.1,93.994,-36.4
2,37,1,999,0,1.1,93.994,-36.4
3,40,1,999,0,1.1,93.994,-36.4
4,30,1,999,0,1.1,93.994,-36.4


In [49]:
test_X = pd.concat([cat_test_dummies, df1], axis=1)
test_X

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_failure,poutcome_nonexistent,poutcome_success,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx
0,0,0,0,1,0,0,0,0,0,0,...,0,1,0,56,1,999,0,1.1,93.994,-36.4
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,57,1,999,0,1.1,93.994,-36.4
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,37,1,999,0,1.1,93.994,-36.4
3,1,0,0,0,0,0,0,0,0,0,...,0,1,0,40,1,999,0,1.1,93.994,-36.4
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,30,1,999,0,1.1,93.994,-36.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4099,1,0,0,0,0,0,0,0,0,0,...,1,0,0,32,2,999,2,-1.1,94.767,-50.8
4100,0,0,0,0,0,0,0,0,0,1,...,0,0,1,34,1,1,4,-1.1,94.767,-50.8
4101,0,0,0,0,0,0,0,0,0,1,...,0,0,1,34,1,6,2,-1.1,94.767,-50.8
4102,1,0,0,0,0,0,0,0,0,0,...,0,1,0,36,2,999,0,-1.1,94.767,-50.8


In [50]:
test_X.drop(columns = unknow_columns, inplace = True)
test_X

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_failure,poutcome_nonexistent,poutcome_success,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx
0,0,0,0,1,0,0,0,0,0,0,...,0,1,0,56,1,999,0,1.1,93.994,-36.4
1,0,0,0,0,0,0,0,1,0,0,...,0,1,0,57,1,999,0,1.1,93.994,-36.4
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,37,1,999,0,1.1,93.994,-36.4
3,1,0,0,0,0,0,0,0,0,0,...,0,1,0,40,1,999,0,1.1,93.994,-36.4
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,30,1,999,0,1.1,93.994,-36.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4099,1,0,0,0,0,0,0,0,0,0,...,1,0,0,32,2,999,2,-1.1,94.767,-50.8
4100,0,0,0,0,0,0,0,0,0,1,...,0,0,1,34,1,1,4,-1.1,94.767,-50.8
4101,0,0,0,0,0,0,0,0,0,1,...,0,0,1,34,1,6,2,-1.1,94.767,-50.8
4102,1,0,0,0,0,0,0,0,0,0,...,0,1,0,36,2,999,0,-1.1,94.767,-50.8


In [0]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

for column in cat_data:
    count = cat_data[column].value_counts()
    sns.set(style="darkgrid")
    f, axes = plt.subplots(1,1, figsize=(15, 4), sharex=True)
    sns.barplot(count.index, count.values, alpha=0.9)
    plt.title(column)
    plt.ylabel('Number of Occurrences', fontsize=12)
    plt.xlabel(column, fontsize=12)
    plt.show()

In [0]:
# import matplotlib.pyplot as plt
# plt.scatter(data['nr.employed'], data['euribor3m'])
# plt.show()

import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
ax = sns.scatterplot(x="nr.employed", y="euribor3m", hue="age" data=data)

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.pairplot(data)
plt.show()

In [0]:
from pandas.plotting import scatter_matrix
scatter_matrix(features, figsize = (15, 15))