In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv("C:\\D\\datasets\\bank.csv", sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [3]:
df.shape

(4521, 17)

In [4]:
df.duplicated().sum()

0

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [9]:
df = df.replace('unknown', np.nan)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,Oct,79,1,-1,0,,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,May,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,Apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,,3,Jun,199,4,-1,0,,no
4,59,blue-collar,married,secondary,no,0,yes,no,,5,May,226,1,-1,0,,no


In [None]:
df['month'] = pd.to_datetime(df['month'], format='%b')
df['month'] = df['month'].dt.month

In [10]:
df.isna().sum()

age             0
job            38
marital         0
education     187
default         0
balance         0
housing         0
loan            0
contact      1324
day             0
month           0
duration        0
campaign        0
pdays           0
previous        0
poutcome     3705
y               0
dtype: int64

In [None]:
df = df.drop(['contact', 'poutcome', 'day'], axis=1)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,y
0,30,unemployed,married,primary,no,1787,no,no,19,Oct,79,1,-1,0,no
1,33,services,married,secondary,no,4789,yes,yes,11,May,220,1,339,4,no
2,35,management,single,tertiary,no,1350,yes,no,16,Apr,185,1,330,1,no
3,30,management,married,tertiary,no,1476,yes,yes,3,Jun,199,4,-1,0,no
4,59,blue-collar,married,secondary,no,0,yes,no,5,May,226,1,-1,0,no


In [12]:
cols_with_nan = [col for col in df if df[col].isnull().sum() > 0]
cols_with_nan

['job', 'education']

In [None]:
data = df.copy()
is_nan = np.where(data['job'].isna())[0]
data['is_nan'] = 0
data.loc[is_nan, 'is_nan'] = 1

for col in data.select_dtypes(include='object'):
    data[col] = pd.factorize(data[col])[0]

train_data = data[data['is_nan' == 0]]
test_data = data[data['is_nan' == 1]]

X_train = train_data.drop(['job', 'is_nan'], axis=1)
y_train = train_data['job']
X_test = test_data.drop(['job', 'is_nan'], axis=1)



Unnamed: 0,age,job,marital,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,y,is_nan
0,30,unemployed,married,primary,no,1787,no,no,19,Oct,79,1,-1,0,no,0
1,33,services,married,secondary,no,4789,yes,yes,11,May,220,1,339,4,no,0
2,35,management,single,tertiary,no,1350,yes,no,16,Apr,185,1,330,1,no,0
3,30,management,married,tertiary,no,1476,yes,yes,3,Jun,199,4,-1,0,no,0
4,59,blue-collar,married,secondary,no,0,yes,no,5,May,226,1,-1,0,no,0


In [14]:
# extract categorical coumns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)

categorical_columns_encoded = encoder.fit_transform(df[categorical_columns])
# convert the categorical variables into a data frame, then conctenate the data frame with the og and drop the categorical columns
one_hot_df = pd.DataFrame(categorical_columns_encoded, columns=encoder.get_feature_names_out(categorical_columns))

df_encoded = pd.concat([df, one_hot_df], axis=1)

df_encoded = df_encoded.drop(categorical_columns, axis=1)
df_encoded

Unnamed: 0,age,balance,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep,y_no,y_yes
0,30,1787,79,1,-1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,33,4789,220,1,339,4,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,35,1350,185,1,330,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,30,1476,199,4,-1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,59,0,226,1,-1,0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,-333,329,5,-1,0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4517,57,-3313,153,1,-1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4518,57,295,151,11,-1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4519,28,1137,129,4,211,3,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [15]:
# seperate the data into 2 data frames to predict the missing values based on the present values
df_na = df[df['job'].isna()]
df_notna = df[df['job'].notna()]

In [16]:
# train a decision tree model to predict the missing value
X = df_notna.drop('job', axis=1)
y = df_notna['job']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)
tree = DecisionTreeClassifier()

param_grid = {
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)

grid_search.fit(X_train, y_train)

ValueError: 
All the 90 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\pc\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\pc\anaconda3\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\anaconda3\Lib\site-packages\sklearn\tree\_classes.py", line 1019, in fit
    super()._fit(
  File "c:\Users\pc\anaconda3\Lib\site-packages\sklearn\tree\_classes.py", line 252, in _fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "c:\Users\pc\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 2956, in validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 832, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pc\anaconda3\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'married'
