In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import KFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [2]:
df=pd.read_csv('/content/drive/MyDrive/CE888/Lab 3/bank-additional/bank-additional-full.csv',sep=';')

In [3]:
df.shape

(41188, 21)

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [5]:
df.drop('duration', inplace=True, axis=1)

In [6]:
df.shape

(41188, 20)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32950 entries, 33479 to 14000
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32950 non-null  int64  
 1   job             32950 non-null  object 
 2   marital         32950 non-null  object 
 3   education       32950 non-null  object 
 4   default         32950 non-null  object 
 5   housing         32950 non-null  object 
 6   loan            32950 non-null  object 
 7   contact         32950 non-null  object 
 8   month           32950 non-null  object 
 9   day_of_week     32950 non-null  object 
 10  campaign        32950 non-null  int64  
 11  pdays           32950 non-null  int64  
 12  previous        32950 non-null  int64  
 13  poutcome        32950 non-null  object 
 14  emp.var.rate    32950 non-null  float64
 15  cons.price.idx  32950 non-null  float64
 16  cons.conf.idx   32950 non-null  float64
 17  euribor3m       32950 non-n

In [7]:
# Check class imbalance 
df['y'].value_counts()

no     36548
yes     4640
Name: y, dtype: int64

In [8]:
le1 = LabelEncoder()
df['label'] = le1.fit_transform(df['y'])
df.drop('y', inplace=True, axis=1)

In [9]:
cat_features = ["job", "marital", "education", "default", "housing", "loan", "contact", "month", "day_of_week", "poutcome"]
df[cat_features].head()

# Question 3 : How many categorical features does the Bank dataset have?
#. Answer : 10

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent


In [10]:
# Let's also look at standardising some of the numerical features
num_features = ['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'  ]
df[num_features].head()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0


In [35]:
# Most algorithms cannot work with missing values. Three main ways to deal with them:
# 1. Drop rows
# 2. Drop features with NaNs
# 3. Fill missing values
# As we have very few missing values, let's drop the rows
print("Number of rows before dropping NaNs: %d" % len(df))
df = df.dropna()
print("Number of rows after dropping NaNs: %d" % len(df))
# Separate features from outcomes
df_y = df['label'].copy()
df_X = df.iloc[:,:-1]

Number of rows before dropping NaNs: 32950
Number of rows after dropping NaNs: 32950


In [11]:
class OneHotEncoderCategoricalFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, cat_features):
        self.cat_features = cat_features
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.get_dummies(X, columns=self.cat_features)

In [36]:
# We can use a ColumnTransformer to transform only some types of columns in different ways
colTransformer = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(), cat_features)
    ],
    remainder='passthrough'  # i.e., leave the rest of the columns untouched -- otherwise, they're dropped
    )
X_processed = colTransformer.fit_transform(df_X)
X_processed.shape

(32950, 62)

In [37]:
X_train,X_test,y_train,y_test = train_test_split(X_processed,df_y,random_state=42,stratify=df_y,test_size=0.3)

In [41]:
dc = DummyClassifier()
dc.fit(X_train, y_train)

dc_y_pred = dc.predict(X_test)

In [42]:
rf = RandomForestClassifier(n_estimators = 100 , max_depth = 3)
rf.fit(X_train, y_train)

rf_y_pred = rf.predict(X_test)

In [43]:
dc_f1_score = f1_score(y_test,dc_y_pred)
rf_f1_score = f1_score(y_test,rf_y_pred)
print('Dummy Classifier:', dc_f1_score)
print('Random Forest:', rf_f1_score)

Dummy Classifier: 0.0
Random Forest: 0.2529411764705882


In [None]:

# Question 4: In the bank dataset, the dummy (stratified) classifier is better than a random forest classifier 
# with 100 estimators and a maximum depth of 3, according to the F1 score.
# Answer : False

# Question 5: In the bank dataset, a random forest classifier with 100 estimators and a maximum depth of 3 is better
# than a dummy (stratified) classifier, according to the F1 score.
# Answer : True

In [56]:
# Divide your training and test set using train_test_split and keeping your test set to 20% of the full dataset, using random_state=50 when calling the function.
# Perform 10-fold cross-validation using as a classifier a random forest with 100 estimators and a maximum depth of 8. 
# Report the average F1 score with two numbers after the floating point (e.g., 0.56)

cv = KFold(n_splits=10)
rf_model = RandomForestClassifier(n_estimators=100,max_depth=8,random_state=60)
f1_scores = cross_val_score(rf_model, X_processed, df_y, scoring='f1', cv=cv)
f1_scores

array([0.30463576, 0.26724138, 0.29977629, 0.32985386, 0.28633406,
       0.31364562, 0.3059867 , 0.3190184 , 0.2804878 , 0.32911392])

In [57]:
print(np.average(f1_scores))

0.3036093797113811


In [58]:
#Divide your training and test set using train_test_split and keeping your test set to 20% of the full dataset, using random_state=50 when calling the function.
#Perform 10-fold cross-validation using as a classifier a random forest with 200 estimators and a maximum depth of 8. Report the average F1 score 
#with two numbers after the floating point (e.g., 0.56)

In [59]:
cv = KFold(n_splits=10)
rf_model = RandomForestClassifier(n_estimators=200,max_depth=8,random_state=60)
f1_scores = cross_val_score(rf_model, X_processed, df_y, scoring='f1', cv=cv)
f1_scores

array([0.30088496, 0.26552463, 0.29977629, 0.33954451, 0.28884026,
       0.32048682, 0.30666667, 0.31492843, 0.27586207, 0.33542977])

In [60]:
print(np.average(f1_scores))

0.30479443892108715


In [61]:
import pickle as pkl

outp = open('model.pkl', 'wb')
pkl.dump(clf, outp, -1)  # where clf is your classifier
outp.close()