In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("https://raw.githubusercontent.com/tassalor1/Bank-Term-Deposit-Prediction/Virginia's-Branch/Bank-Additional-full.csv")

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,year,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,2008,...,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,2008,...,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,2008,...,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,2008,...,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,2008,...,1,-1,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
#replacing "unknown" with NaN for ease of filtering
df2=df.replace({'Unknown' : np.nan, 'unknown' : np.nan})

In [5]:
df2.isnull().sum()

age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
year                 0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [6]:
# Replacing NaN in 'housing' and 'loan' with mode values
housing=df['housing']
df2['housing'] = df2['housing'].fillna(df['housing'].mode()[0])

loan=df['loan']
df2['loan'] = df2['loan'].fillna(df['loan'].mode()[0])

In [7]:
# For now replacing NaN back with 'unknown' for default and leaving that as it's own category. The reason for this is that their is an extreme 
# imbalance in the 'yes'/'no' cases for this feature, with only three data points indicating 'yes'. Yet the number of "unknown" cases is so large.
# Since classification would be skewed by the imbalance, we are simply keeping it as 'unknown'.

df2['default']=df2['default'].replace({np.nan:'unknown'})

In [8]:
df2.isnull().sum() 

age                  0
job                330
marital             80
education         1731
default              0
housing              0
loan                 0
contact              0
month                0
year                 0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [9]:
# Identify Outlier Dating

import sklearn
from sklearn.datasets import load_boston
# IQR
Q1 = np.percentile(df['duration'], 25, interpolation = 'midpoint')
Q3 = np.percentile(df['duration'], 75,interpolation = 'midpoint')
IQR = Q3 - Q1
 
print("Old Shape: ", df.shape)

# Upper bound
upper=Q3+3*IQR

# Lower bound is not necessary to define

# Removing the outliers
df2.drop(df2[df2['duration']>=upper].index,inplace=True)

Old Shape:  (41188, 22)


Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  Q1 = np.percentile(df['duration'], 25, interpolation = 'midpoint')
Users of the modes 'nearest', 'lower', 'higher', or 'midpoint' are encouraged to review the method they used. (Deprecated NumPy 1.22)
  Q3 = np.percentile(df['duration'], 75,interpolation = 'midpoint')


In [10]:
#Dropping rows where 'job' and 'marital' contain NaN
df2=df2.dropna(subset=['job','marital'])

In [11]:
df2.isnull().sum() 

age                  0
job                  0
marital              0
education         1552
default              0
housing              0
loan                 0
contact              0
month                0
year                 0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [12]:
df2.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
year                int64
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [13]:
convert_dict = {'age': float,'year': float,'duration':float,'campaign': float,'pdays': float,'previous': float,'emp.var.rate': float,
                'cons.price.idx': float,'cons.conf.idx': float,'euribor3m': float, 'nr.employed': float}

df2 = df2.astype(convert_dict)

In [36]:
# Now we must work on the preprocessing ML model to predict the missing values of 'education'. In order to train the model, we must first 
# remove the missing values.

df2_test=df2[df2['education'].isnull()]
X_test=df2_test.drop(['education'], axis=1)
y_test=pd.DataFrame(df2_test['education'])

df2_train=df2[df2['education'].notnull()]
X_train=df2_train.drop(['education'],axis=1)
y_train=pd.DataFrame(df2_train['education'])

X_train.isnull().sum() 
y_train.head(100)
y_test.head()

Unnamed: 0,education
7,
10,
26,
30,
31,


In [37]:
numeric_columns=list(X_train.select_dtypes('float64').columns)
categorical_columns=list(X_train.select_dtypes('object').columns)

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,LabelBinarizer,OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
#import xgboost as xgb

scaler = StandardScaler()
scaler.fit(X_train[numeric_columns])
X_train_scaled = pd.DataFrame(scaler.transform(X_train[numeric_columns]), columns = numeric_columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test[numeric_columns]), columns = numeric_columns) 
X_train_scaled.head()



Unnamed: 0,age,year,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1.566437,-0.65347,0.166192,-0.562507,-0.168557,-0.34976,0.650117,0.732708,0.893725,0.713704,0.330044
1,1.663545,-0.65347,-0.43945,-0.562507,-0.168557,-0.34976,0.650117,0.732708,0.893725,0.713704,0.330044
2,-0.278619,-0.65347,-0.023071,-0.562507,-0.168557,-0.34976,0.650117,0.732708,0.893725,0.713704,0.330044
3,0.012706,-0.65347,-0.428635,-0.562507,-0.168557,-0.34976,0.650117,0.732708,0.893725,0.713704,0.330044
4,1.566437,-0.65347,0.414937,-0.562507,-0.168557,-0.34976,0.650117,0.732708,0.893725,0.713704,0.330044


In [54]:
encoder = OneHotEncoder(sparse=False)

encoder.fit_transform(X_train[categorical_columns])

df_temp=encoder.transform(X_train[categorical_columns])
X_train_encoded=pd.DataFrame(df_temp,columns=encoder.get_feature_names())

df_temp2=encoder.transform(X_test[categorical_columns])
X_test_encoded=pd.DataFrame(df_temp2,columns=encoder.get_feature_names())

encoder.fit_transform(y_train)

df_temp3=encoder.transform(y_train)
y_columns=encoder.get_feature_names()
y_train_encoded=pd.DataFrame(df_temp3,columns=encoder.get_feature_names())

y_train_encoded.head()



Unnamed: 0,x0_basic.4y,x0_basic.6y,x0_basic.9y,x0_high.school,x0_illiterate,x0_professional.course,x0_university.degree
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [46]:
Transformed_X_train= pd.concat([X_train_scaled,X_train_encoded], axis=1)
Transformed_X_test= pd.concat([X_test_scaled,X_test_encoded], axis=1)

In [47]:
clf =RandomForestRegressor()
clf.fit(Transformed_X_train, y_train_encoded)

In [50]:
y_pred = clf.predict(Transformed_X_test)

In [59]:
y_pred=pd.DataFrame(y_pred,columns=y_columns)
y_pred

Unnamed: 0,x0_basic.4y,x0_basic.6y,x0_basic.9y,x0_high.school,x0_illiterate,x0_professional.course,x0_university.degree
0,0.29,0.07,0.63,0.00,0.0,0.01,0.00
1,0.13,0.01,0.85,0.00,0.0,0.01,0.00
2,0.27,0.03,0.17,0.05,0.0,0.22,0.26
3,0.02,0.02,0.10,0.34,0.0,0.00,0.52
4,0.16,0.02,0.12,0.03,0.0,0.36,0.31
...,...,...,...,...,...,...,...
1547,0.00,0.00,0.01,0.75,0.0,0.07,0.17
1548,0.14,0.00,0.06,0.07,0.0,0.24,0.49
1549,0.00,0.00,0.01,0.80,0.0,0.01,0.18
1550,0.02,0.04,0.14,0.16,0.0,0.40,0.24


In [None]:
#Now that I have the predicted values for education, I can plug them back into the original dataset.



Connor's Comments:
The EDA process in the provided code is well-executed in several aspects. The Virginia demonstrates a comprehensive approach to data preprocessing, including handling missing values, outlier detection, and data type conversions, ensuring a clean dataset for analysis. Points to improve: Commenting: Would be helpful to provide more comments at each step for readability / Formatting: Code contains Inconsistent formatting and spacing / Model Choice: Education is a categorial variable so random forrest regressor isn’t the best choice of model - use a classifier / Model Evaluation: The model has no evaluation of performance - use something like a classification report to assess this

In [None]:
# Vincent's Comment: The process of removing certain rows based on features that had a small amount of missing values were executed well. Virginia also used the method of replacing missing values with the mode with two featues, which is a strategy that we agreed upon. Furthermore, Virginia used a random forest along with a regressor, which the model mistakenly predicted continuous numerical values for default. However, default is a binary classification. Thus, one suggestion that we discussed at our meeting was changing the regressor to a different ML method to ensure that it predicts either 1 or 0 for default. Overall, the data cleansing and transformation was job really well.

Kefan's comments: Virgina's way of dropping nan values in "jobs" and "marital" columns and replacing with mode in "housing" and "loan" columns is the same as my approach, and I have the same opinion as Connor that regressor is not suitable in this case but it only works for continuous variables, and it will return decimals with categorical encoding.