In [1]:
# Load libraries
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from pyspark.sql import SparkSession
from sklearn.model_selection import train_test_split
from pyspark import SparkContext, SparkConf
from pandas.tools.plotting import scatter_matrix


In [6]:
# Load dataset
# The CSV is semicolon-delimited, so the separator must be given explicitly
# for the fields to be split into distinct columns.
OpenFile = pd.read_csv('bank-additional-full.csv', sep=';', encoding='utf-8')

# Wrap the parsed table in a DataFrame under the name used by the later cells.
DF = pd.DataFrame(OpenFile)

# Column layout the notebook expects.
# NOTE(review): both lists end with 'result', but DF.info() reports the target
# column as 'y' -- confirm which name later cells rely on before indexing DF
# with these lists.
Column_names = [
    'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
    'cons.conf.idx', 'euribor3m', 'nr.employed', 'result',
]
Categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome', 'result',
]

# Print dtypes and non-null counts as a quick sanity check of the load.
DF.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
age               41188 non-null int64
job               41188 non-null object
marital           41188 non-null object
education         41188 non-null object
default           41188 non-null object
housing           41188 non-null object
loan              41188 non-null object
contact           41188 non-null object
month             41188 non-null object
day_of_week       41188 non-null object
duration          41188 non-null int64
campaign          41188 non-null int64
pdays             41188 non-null int64
previous          41188 non-null int64
poutcome          41188 non-null object
emp.var.rate      41188 non-null float64
cons.price.idx    41188 non-null float64
cons.conf.idx     41188 non-null float64
euribor3m         41188 non-null float64
nr.employed       41188 non-null float64
y                 41188 non-null object
dtypes: float64(5), int64(5), object(11)
memory usa

In [7]:
# Preview the first five rows of the loaded frame.
DF.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
DF.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [9]:
# Check for missing values: number of nulls in each column.
# isnull().sum() is the vectorised equivalent of applying Python's built-in
# sum() to every column and yields the same per-column counts.
DF.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [10]:
# The target 'y' is imbalanced: approximately 88% of the rows are 'no' and 12% are 'yes'.
# Rows per class of the target column, then each class's share of the frame in percent.
count = DF.groupby('y').size()
percent = count.div(len(DF)).mul(100)
print(percent)

y
no     88.734583
yes    11.265417
dtype: float64


In [None]:
# Impute outliers function
def impute_outliers(DF, Column, Minimum, Maximum):
    """Replace out-of-range values of one column with that column's mean.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame to update. The column is overwritten on this object (in place).
    Column : str
        Label of the column to process.
    Minimum, Maximum : scalar
        Bounds of the accepted range. Values strictly below ``Minimum`` or
        strictly above ``Maximum`` are treated as outliers.

    Returns
    -------
    pandas.DataFrame
        The same ``DF`` object that was passed in.

    Notes
    -----
    The replacement value is the mean of *all* values in the column,
    outliers included.
    """
    Column_values = DF[Column].values
    # The bounds must be read from the parameters exactly as spelled in the
    # signature (capitalised); lower-case names are undefined here.
    is_outlier = np.logical_or(Column_values < Minimum, Column_values > Maximum)
    DF[Column] = np.where(is_outlier, Column_values.mean(), Column_values)
    return DF
# impute_outliers is only defined in this cell, not called, so this summary
# describes DF as it stood before the cell ran.
DF.describe(percentiles=[0.25, 0.5, 0.75])

In [4]:
# Divide data into training and testing and display the information.
# random_state pins the shuffle so a fresh kernel run reproduces the same split;
# stratify keeps the roughly 89/11 no/yes ratio of 'y' identical in both parts.
TrainData, TestData = train_test_split(
    DF,
    test_size=0.2,
    random_state=42,
    stratify=DF['y'],
)

# info() prints its own report and returns None, so it is called directly
# rather than wrapped in print(), which would emit a stray "None" line.
TrainData.info()

# Last expression of the cell: shown with the notebook's rich table display.
TrainData.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32950 entries, 9344 to 17768
Data columns (total 21 columns):
age               32950 non-null int64
job               32950 non-null object
marital           32950 non-null object
education         32950 non-null object
default           32950 non-null object
housing           32950 non-null object
loan              32950 non-null object
contact           32950 non-null object
month             32950 non-null object
day_of_week       32950 non-null object
duration          32950 non-null int64
campaign          32950 non-null int64
pdays             32950 non-null int64
previous          32950 non-null int64
poutcome          32950 non-null object
emp.var.rate      32950 non-null float64
cons.price.idx    32950 non-null float64
cons.conf.idx     32950 non-null float64
euribor3m         32950 non-null float64
nr.employed       32950 non-null float64
y                 32950 non-null object
dtypes: float64(5), int64(5), object(11)
memory 

In [None]:
#Visualising the data
# All four distributions are drawn in a single 2x2 figure, rather than opening
# a separate 10x10 figure per column and filling one slot of a 3x3 grid in each.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))

Age = TrainData['age']
Job = TrainData['job']
housing = TrainData['housing']
loan = TrainData['loan']

#Age -- numeric column, so a histogram shows its distribution.
FirstGraph = axes[0, 0]
Age.hist(ax=FirstGraph, alpha=0.7, label='age histogram')
FirstGraph.set_title("Age")

#job, Housing, Loan -- these columns hold string categories, so the number of
# rows per category is drawn as bars instead of binning the raw strings.
categorical_panels = [
    (axes[0, 1], "job", Job),
    (axes[1, 0], "housing", housing),
    (axes[1, 1], "loan", loan),
]
for FirstGraph, panel_title, panel_values in categorical_panels:
    panel_values.value_counts().plot(kind='bar', ax=FirstGraph, alpha=0.7)
    FirstGraph.set_title(panel_title)

# Keep tick labels and titles of neighbouring panels from overlapping.
fig.tight_layout()



