In [81]:
import pandas as pd      
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from scipy.stats import skew

from sklearn.model_selection import cross_validate, cross_val_score
import warnings
warnings.filterwarnings('ignore')
plt.rcParams["figure.figsize"] = (10,6)
pd.set_option('display.max_columns', 500) 
pd.set_option('display.max_rows', 500)

In [82]:
data = pd.read_csv('bank-additional-full.csv',sep=';')
data

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,334,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,383,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,189,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,442,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [83]:
df = data.copy()

In [84]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [85]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,41188.0,40.02406,10.42125,17.0,32.0,38.0,47.0,98.0
duration,41188.0,258.28501,259.279249,0.0,102.0,180.0,319.0,4918.0
campaign,41188.0,2.567593,2.770014,1.0,1.0,2.0,3.0,56.0
pdays,41188.0,962.475454,186.910907,0.0,999.0,999.0,999.0,999.0
previous,41188.0,0.172963,0.494901,0.0,0.0,0.0,0.0,7.0
emp.var.rate,41188.0,0.081886,1.57096,-3.4,-1.8,1.1,1.4,1.4
cons.price.idx,41188.0,93.575664,0.57884,92.201,93.075,93.749,93.994,94.767
cons.conf.idx,41188.0,-40.5026,4.628198,-50.8,-42.7,-41.8,-36.4,-26.9
euribor3m,41188.0,3.621291,1.734447,0.634,1.344,4.857,4.961,5.045
nr.employed,41188.0,5167.035911,72.251528,4963.6,5099.1,5191.0,5228.1,5228.1


In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [87]:
object_col=df.columns[df.dtypes=="object"]
object_col 

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome', 'y'],
      dtype='object')

In [88]:
for col in object_col:   #There are 'unknown'. that ist not suitable to work
    print(col)
    print("--"*8)
    print(df[col].value_counts(dropna=False))
    print("--"*20)

job
----------------
admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64
----------------------------------------
marital
----------------
married     24928
single      11568
divorced     4612
unknown        80
Name: marital, dtype: int64
----------------------------------------
education
----------------
university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: education, dtype: int64
----------------------------------------
default
----------------
no         32588
unknown     8597
yes            3
Name: default, dtype: int64
--------------------------------------

In [108]:
df.replace(['unknown'],np.nan, inplace=True)  # I replaced the 'unknown' to np.none

In [90]:
object_col=df.columns[df.dtypes=="object"] #I want to see all value_counts together

for col in object_col:
    print(col)
    print("--"*8)
    print(df[col].value_counts(dropna=False))
    print("--"*20)

job
----------------
admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
NaN                330
Name: job, dtype: int64
----------------------------------------
marital
----------------
married     24928
single      11568
divorced     4612
NaN            80
Name: marital, dtype: int64
----------------------------------------
education
----------------
university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
NaN                     1731
illiterate                18
Name: education, dtype: int64
----------------------------------------
default
----------------
no     32588
NaN     8597
yes        3
Name: default, dtype: int64
----------------------------------------
housing
-

In [92]:
df.isnull().sum()

age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [93]:
df.shape

(41188, 21)

In [96]:
data_after_drop = df.dropna()

In [97]:
data_after_drop.shape #I think ,dropna is not a godd idea,because is a lot of data lost

(30488, 21)

In [100]:
df.y.value_counts(dropna=False)  #that means, how many people got a credit. yes is about 1/8

no     36548
yes     4640
Name: y, dtype: int64

In [101]:
df.default.value_counts(dropna=False) 
#that is relevant with credit ,whether he has a credit or not
#the ratio can be like y columns
#because of that I'll change the NaN value to yes. with that can be the ratio like y columns

no     32588
NaN     8597
yes        3
Name: default, dtype: int64

In [109]:
df.default.replace([np.nan], 'yes',inplace=True) #I changed the NaN to yes

In [111]:
df.default.value_counts()

no     32588
yes     8600
Name: default, dtype: int64

In [112]:
df.isnull().sum()

age                  0
job                330
marital             80
education         1731
default              0
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [116]:
df.education.value_counts(dropna=False)  #I see the NaN value

university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
NaN                     1731
illiterate                18
Name: education, dtype: int64

In [131]:
df.groupby('education',dropna=False)['y'].describe().T 

 #the top of frequency  is 'no',I think, the most of the NaN group won't get a credit


education,basic.4y,basic.6y,basic.9y,high.school,illiterate,professional.course,university.degree,NaN
count,4176,2292,6045,9515,18,5243,12168,1731
unique,2,2,2,2,2,2,2,2
top,no,no,no,no,no,no,no,no
freq,3748,2104,5572,8484,14,4648,10498,1480


In [138]:
print((3748/4176))  #basic.4y
print(2104/2292)    #basic.6y
print(5572/6045)    #basic.9y
print(8484/9515)    #high.school
print(14/18)        #illiterate
print(4648/5243)    #professional.course
print(10498/12168)  #university.degree
print(1480/1731)    #NaN


#the NaN ist nearest to university.degree
# because of that, I decided to change NaN value to university.degree

0.8975095785440613
0.9179755671902269
0.9217535153019024
0.8916447714135576
0.7777777777777778
0.8865153538050734
0.8627547666009204
0.8549971114962449


s = pd.Series(['a', 'a', 'b', 'c']) \
s.describe() \
count     4 \
unique    3 \
top       a \
freq      2 \
dtype: object

In [142]:
df.education.replace([np.nan],'university.degree',inplace=True)  #I change the NaN to university.degree

In [144]:
df.education.value_counts(dropna=False)  #There is no NAN

university.degree      13899
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
illiterate                18
Name: education, dtype: int64

In [145]:
df.isnull().sum()

age                 0
job               330
marital            80
education           0
default             0
housing           990
loan              990
contact             0
month               0
day_of_week         0
duration            0
campaign            0
pdays               0
previous            0
poutcome            0
emp.var.rate        0
cons.price.idx      0
cons.conf.idx       0
euribor3m           0
nr.employed         0
y                   0
dtype: int64

In [152]:
df.housing.value_counts(dropna=False)

yes    21576
no     18622
NaN      990
Name: housing, dtype: int64

# KNN ile missing leri dolduralim

In [None]:
data = pd.read_csv('bank-additional-full.csv',sep=';')
data

In [153]:
df = data.copy()

In [155]:
df.replace(['unknown'],np.nan, inplace=True)  # I replaced the 'unknown' to np.none

In [156]:
df.isnull().sum()

age                  0
job                330
marital             80
education         1731
default           8597
housing            990
loan               990
contact              0
month                0
day_of_week          0
duration             0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
y                    0
dtype: int64

In [157]:
var_names = list(df)

In [158]:
import numpy as np
n_df = np.array(df)  #veriyi nparray'e cevirdik

In [160]:
n_df

array([[56, 'housemaid', 'married', ..., 4.857, 5191.0, 'no'],
       [57, 'services', 'married', ..., 4.857, 5191.0, 'no'],
       [37, 'services', 'married', ..., 4.857, 5191.0, 'no'],
       ...,
       [56, 'retired', 'married', ..., 1.028, 4963.6, 'no'],
       [44, 'technician', 'married', ..., 1.028, 4963.6, 'yes'],
       [74, 'retired', 'married', ..., 1.028, 4963.6, 'no']], dtype=object)

In [162]:
!pip install ycimpute

Collecting ycimpute
  Downloading ycimpute-0.2-py3-none-any.whl (35 kB)
Collecting torch>=1.1.0
  Downloading torch-1.12.1-cp39-none-macosx_10_9_x86_64.whl (133.8 MB)
[K     |████████████████████████████████| 133.8 MB 6.3 MB/s eta 0:00:01    |█████▌                          | 23.2 MB 5.8 MB/s eta 0:00:20     |██████████████▋                 | 61.3 MB 7.5 MB/s eta 0:00:10
Installing collected packages: torch, ycimpute
Successfully installed torch-1.12.1 ycimpute-0.2


In [163]:
from ycimpute.imputer import knnimput

dff = knnimput.KNN(k = 5).complete(n_df) #doldurma islemini yapacak

#5 komsuluk degeri,5 komsuya göre yapiyor

ValueError: could not convert string to float: 'housemaid'