In [None]:
# To write a Python 2/3 compatible codebase, the first step is to add this line to the top of each module
from __future__ import division, print_function, unicode_literals

# Import necessary libraries.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

np.random.seed(42) # Ensure reproducibility: seeds NumPy's global random generator for the rest of the notebook

In [None]:
################################################## Load Data ##################################################

# NOTE: place "bank-additional-full.csv" in the notebook's working directory
# (normally the folder that contains this notebook). A relative path is used so
# the notebook runs on any machine instead of depending on one user's Desktop.
df = pd.read_csv("bank-additional-full.csv")

# List the column names as a quick check that the file was parsed as expected.
print(list(df.columns.values))

In [None]:
# Sanity check on the raw data: print (rows, columns), then preview the first rows.
print(df.shape)
df.head()

In [None]:
# Split the raw frame by position: the first 19 columns are the attributes (X),
# the column at position 19 is the target (y).
X = df.iloc[:, :19]
features = list(X.columns)      # names of the feature columns to use
print("X:")
print(X.head())

# Retrieving the target variable (y)
y = df.iloc[:, 19]
print("y:")
print(y.head())

In [None]:
# Column names of the raw data, grouped by type: numeric features first,
# then the categorical (string-valued) features.
numeric = [
    'age', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]
categorical = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'day_of_week', 'poutcome',
]

In [None]:
# Encode the target as integers: 'yes' -> 1, 'no' -> 0.
# A new Series is built with .map instead of assigning through boolean masks:
#   * y was sliced out of df, and in-place writes on it may propagate back into
#     df on pandas versions without copy-on-write;
#   * masked assignment leaves an object column holding mixed str/int values,
#     whereas this produces a proper integer dtype.
# The identity entries (1 -> 1, 0 -> 0) keep the cell safe to re-run.
y = y.map({'yes': 1, 'no': 0, 1: 1, 0: 0})
if y.isnull().any():
    # Anything other than 'yes'/'no' (including missing values) maps to NaN.
    raise ValueError("target column contains values other than 'yes'/'no'")
y = y.astype(int)
classifications = ['1', '0']

print(y)

In [None]:
# Remove the "default" column when it is present; errors='ignore' makes the
# drop a no-op if the column has already been removed.
X = X.drop(columns=['default'], errors='ignore')

print("New X:")
print(X.shape)          # 18 indicators are expected to remain
print(X.head())

In [None]:
# Collect the index labels of every row that has 'unknown' in at least one of
# the listed categorical columns; these rows are candidates for removal.
cols_with_unknown = ['job', 'marital', 'education', 'housing', 'loan']
has_unknown = X[cols_with_unknown].eq('unknown').any(axis=1)
indices = X.loc[has_unknown].index
print(indices)        # author's run: 2943 observations flagged (7.15% of original data)

In [None]:
# Remove the rows flagged in `indices` from the features ...
X = X.drop(indices, axis=0)
print("New X:")
print(X.shape)         # author's run: 38245 observations left (92.85% of the original data)
print(X.head())
print()

# ... and drop the same index labels from the target so X and y stay aligned.
y = y.drop(indices)
print("New y:")
print(y.shape)
print(y.head())

##### Encoding all categorical variables (string labels to integer codes) using LabelEncoder (dummy variables are not created here yet)

In [None]:
# Label-encode 'job': list its distinct string categories, then map each to an integer code.
job_levels = X['job'].unique()
print("Categories for 'job' now:", job_levels)
print("Number of Categories for 'job' now:", len(job_levels))

lab_encoder = LabelEncoder().fit(X['job'])
job_encoded = lab_encoder.transform(X['job'])

# Show the first 50 codes and the labels they decode to, as a visual check of the mapping.
job_preview = job_encoded[:50]
print("\nCorresponding Encodings Display:")
print(job_preview)
print(lab_encoder.inverse_transform(job_preview))

X['job_encoded'] = job_encoded
# Observed encoding: 0: 'admin.'; 1: 'blue-collar'; 2: 'entrepreneur'; 3: 'housemaid'; 4: 'management';
# 5: 'retired'; 6: 'self-employed'; 7: 'services'; 8: 'student'; 9: 'technician'; 10: 'unemployed'

In [None]:
# Label-encode 'marital': list its distinct string categories, then map each to an integer code.
marital_levels = X['marital'].unique()
print("Categories for 'marital' now:", marital_levels)
print("Number of Categories for 'marital' now:", len(marital_levels))

lab_encoder = LabelEncoder().fit(X['marital'])
marital_encoded = lab_encoder.transform(X['marital'])

# Show the first 20 codes and the labels they decode to, as a visual check of the mapping.
marital_preview = marital_encoded[:20]
print("\nCorresponding Encodings Display:")
print(marital_preview)
print(lab_encoder.inverse_transform(marital_preview))

X['marital_encoded'] = marital_encoded
# Observed encoding: 0: 'divorced'; 1: 'married'; 2: 'single'

In [None]:
# Label-encode 'education': list its distinct string categories, then map each to an integer code.
education_levels = X['education'].unique()
print("Categories for 'education' now:", education_levels)
print("Number of Categories for 'education' now:", len(education_levels))

lab_encoder = LabelEncoder().fit(X['education'])
education_encoded = lab_encoder.transform(X['education'])

# Show the first 30 codes and the labels they decode to, as a visual check of the mapping.
education_preview = education_encoded[:30]
print("\nCorresponding Encodings Display:")
print(education_preview)
print(lab_encoder.inverse_transform(education_preview))

X['education_encoded'] = education_encoded
# Observed encoding: 0: 'basic.4y'; 1: 'basic.6y'; 2: 'basic.9y'; 3: 'high.school'; 4: 'illiterate';
# 5: 'professional.course'; 6: 'university.degree'

In [None]:
# Label-encode 'housing': list its distinct string categories, then map each to an integer code.
housing_levels = X['housing'].unique()
print("Categories for 'housing' now:", housing_levels)
# Fixed copy-paste slip: this message used to say 'education' while counting 'housing'.
print("Number of Categories for 'housing' now:", len(housing_levels))

lab_encoder = LabelEncoder()
housing_encoded = lab_encoder.fit_transform(X['housing'])

# Show the first 30 codes and the labels they decode to, as a visual check of the mapping.
print("\nCorresponding Encodings Display:")
print(housing_encoded[:30])
print(lab_encoder.inverse_transform(housing_encoded[:30]))

X['housing_encoded'] = housing_encoded
# Observed encoding (binary variable "housing_encoded"): 0: 'no'; 1: 'yes'

In [None]:
# Label-encode 'loan': list its distinct string categories, then map each to an integer code.
loan_levels = X['loan'].unique()
print("Categories for 'loan' now:", loan_levels)
print("Number of Categories for 'loan' now:", len(loan_levels))

lab_encoder = LabelEncoder().fit(X['loan'])
loan_encoded = lab_encoder.transform(X['loan'])

# Show the first 30 codes and the labels they decode to, as a visual check of the mapping.
loan_preview = loan_encoded[:30]
print("\nCorresponding Encodings Display:")
print(loan_preview)
print(lab_encoder.inverse_transform(loan_preview))

X['loan_encoded'] = loan_encoded
# Observed encoding (binary variable "loan_encoded"): 0: 'no'; 1: 'yes'

In [None]:
# Label-encode 'contact': list its distinct string categories, then map each to an integer code.
contact_levels = X['contact'].unique()
print("Categories for 'contact' now:", contact_levels)
print("Number of Categories for 'contact' now:", len(contact_levels))

lab_encoder = LabelEncoder().fit(X['contact'])
contact_encoded = lab_encoder.transform(X['contact'])

# Show the first 10 codes and the labels they decode to, as a visual check of the mapping.
contact_preview = contact_encoded[:10]
print("\nCorresponding Encodings Display:")
print(contact_preview)
print(lab_encoder.inverse_transform(contact_preview))

X['contact_encoded'] = contact_encoded
# Observed encoding: 0: 'cellular'; 1: 'telephone'

In [None]:
# Label-encode 'month': list its distinct string categories, then map each to an integer code.
# The codes follow the alphabetical order of the labels, not calendar order (see mapping below).
month_levels = X['month'].unique()
print("Categories for 'month' now:", month_levels)
print("Number of Categories for 'month' now:", len(month_levels))

lab_encoder = LabelEncoder().fit(X['month'])
month_encoded = lab_encoder.transform(X['month'])

# Show the first 10 codes and the labels they decode to, as a visual check of the mapping.
month_preview = month_encoded[:10]
print("\nCorresponding Encodings Display:")
print(month_preview)
print(lab_encoder.inverse_transform(month_preview))

X['month_encoded'] = month_encoded
# Observed encoding: 0: 'apr'; 1: 'aug'; 2: 'dec'; 3: 'jul'; 4: 'jun'; 5: 'mar'; 6: 'may'; 7: 'nov'; 8: 'oct'; 9: 'sep'

In [None]:
# Label-encode 'day_of_week': list its distinct string categories, then map each to an integer code.
# The codes follow the alphabetical order of the labels, not weekday order (see mapping below).
day_levels = X['day_of_week'].unique()
print("Categories for 'day_of_week' now:", day_levels)
print("Number of Categories for 'day_of_week' now:", len(day_levels))

lab_encoder = LabelEncoder().fit(X['day_of_week'])
dayofweek_encoded = lab_encoder.transform(X['day_of_week'])

# Show the first 10 codes and the labels they decode to, as a visual check of the mapping.
day_preview = dayofweek_encoded[:10]
print("\nCorresponding Encodings Display:")
print(day_preview)
print(lab_encoder.inverse_transform(day_preview))

X['dayofweek_encoded'] = dayofweek_encoded
# Observed encoding: 0: 'fri'; 1: 'mon'; 2: 'thu'; 3: 'tue'; 4: 'wed'

In [None]:
# Label-encode 'poutcome': list its distinct string categories, then map each to an integer code.
poutcome_levels = X['poutcome'].unique()
print("Categories for 'poutcome' now:", poutcome_levels)
print("Number of Categories for 'poutcome' now:", len(poutcome_levels))

lab_encoder = LabelEncoder().fit(X['poutcome'])
poutcome_encoded = lab_encoder.transform(X['poutcome'])

# Show the first 10 codes and the labels they decode to, as a visual check of the mapping.
poutcome_preview = poutcome_encoded[:10]
print("\nCorresponding Encodings Display:")
print(poutcome_preview)
print(lab_encoder.inverse_transform(poutcome_preview))

X['poutcome_encoded'] = poutcome_encoded
# Observed encoding: 0: 'failure'; 1: 'nonexistent'; 2: 'success'

In [None]:
# Preview X now that the *_encoded columns sit alongside the original string columns.
X.head()

In [None]:
# Re-select the model inputs: the numeric columns plus the label-encoded
# versions of the categorical columns (the original string columns are left out).
encoded_feature_cols = [
    'age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx',
    'cons.conf.idx', 'euribor3m', 'nr.employed',
    'job_encoded', 'marital_encoded', 'education_encoded', 'housing_encoded', 'loan_encoded',
    'contact_encoded', 'month_encoded', 'dayofweek_encoded', 'poutcome_encoded',
]
# .copy() makes X_encoded an independent frame. Without it, in-place writes to
# X_encoded later on can raise SettingWithCopyWarning and depend on pandas'
# view-versus-copy rules.
X_encoded = X[encoded_feature_cols].copy()
print(X_encoded.shape)
X_encoded.head()

In [None]:
# Print the shapes of the encoded features and the target (their row counts should match),
# then preview the target values.
print(X_encoded.shape)
print(y.shape)
print(y.head())

In [None]:
# Recode the value 999 in 'pdays' as -1: the large placeholder would otherwise
# skew the column, while -1 keeps the same meaning.
# NOTE(review): 999 presumably marks "no previous contact" - confirm against the dataset description.
pdays_is_999 = X_encoded['pdays'] == 999
X_encoded.loc[pdays_is_999, 'pdays'] = -1
X_encoded.head()

In [None]:
# Save the label-encoded features and the target to the notebook's working
# directory. Relative paths replace the former user-specific Desktop paths so
# the export works on any machine. The row index is not written.
X_encoded.to_csv("X_encoded.csv", index=False)
y.to_csv("y.csv", index=False, header=True)

Output files:
- `X_encoded.csv`
- `y.csv`

#### Create dummies for original 'unbalanced' data

In [None]:
# Create dummy (one-hot) columns for the multi-category variables of the
# original "unbalanced" data and append them to the encoded feature frame.
dummy_source_cols = ['job','marital','education','contact','month','day_of_week','poutcome']

dummy_frames = []
for col in dummy_source_cols:
    dummy = pd.get_dummies(X[col])
    # Drop the last category of each variable: it is implied when all the
    # remaining indicator columns of that variable are 0.
    dummy = dummy.iloc[:, :-1]
    # Force 0/1 integers: depending on the pandas version get_dummies returns
    # uint8 or bool columns, and bool would be written to CSV as True/False.
    dummy_frames.append(dummy.astype(int))

# One column-wise concat instead of re-concatenating inside the loop; rows are
# aligned on the index shared by X_encoded and X. No duplicate-label check is
# requested (verify_integrity keeps its default, False).
X_dummy_ori = pd.concat([X_encoded] + dummy_frames, axis=1)

print(X_dummy_ori.head())

In [None]:
# The two binary variables keep their 0/1 label encoding; give them back their plain names.
X_dummy_ori = X_dummy_ori.rename(columns={'housing_encoded': 'housing', 'loan_encoded': 'loan'})

# The multi-category variables are now represented by dummy columns, so their
# integer label-encoded versions are removed.
label_encoded_cols = ['job_encoded', "marital_encoded", "education_encoded", "contact_encoded",
                      "month_encoded", "dayofweek_encoded", "poutcome_encoded"]
X_dummy_ori = X_dummy_ori.drop(label_encoded_cols, axis=1)
print(X_dummy_ori.columns)

X_dummy_ori.head()

In [None]:
# Save the dummy-coded (still unbalanced) feature set to the notebook's working
# directory. A relative path replaces the former user-specific Desktop path so
# the export works on any machine. The row index is not written.
X_dummy_ori.to_csv("X_dummy_unbalanced.csv", index=False)

Output file:
- `X_dummy_unbalanced.csv`