In [278]:
#importing library
import numpy as np
import pandas as pd

#importing Scaling library for scaling the dataset
from sklearn.preprocessing import StandardScaler
#importing label encoder
from sklearn.preprocessing import LabelEncoder
#importing train_test_split
from sklearn.model_selection import train_test_split

#importing LogisticRegression to train the model
from sklearn.linear_model import LogisticRegression

In [279]:
#loading the dataset
df=pd.read_csv('/kaggle/input/bank-direct-marketing/bank-full.csv',delimiter=';')
#showing the dataset
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


# #Getting Preliminary Information

In [280]:
#getting information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [281]:
df=df.replace('unknown',np.NaN)

# Preprocessing

In [282]:
y=df['y']
x=df.drop('y',axis=1)

# Creating function to get categorical features

In [283]:
def get_categorical_features(df):
    return [feature for feature in df.columns if df[feature].dtype=='object']

In [284]:
#implmenting the function to the dataset
get_categorical_features(df)

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

# Creating Function to get unique value in each  column

In [285]:
def get_uniques(df,columns):
    return {column:list(df[column].unique()) for column in columns if df[column].dtypes=='object'}

In [286]:
#implementing the function
get_uniques(x,x.columns)

{'job': ['management',
  'technician',
  'entrepreneur',
  'blue-collar',
  nan,
  'retired',
  'admin.',
  'services',
  'self-employed',
  'unemployed',
  'housemaid',
  'student'],
 'marital': ['married', 'single', 'divorced'],
 'education': ['tertiary', 'secondary', nan, 'primary'],
 'default': ['no', 'yes'],
 'housing': ['yes', 'no'],
 'loan': ['no', 'yes'],
 'contact': [nan, 'cellular', 'telephone'],
 'month': ['may',
  'jun',
  'jul',
  'aug',
  'oct',
  'nov',
  'dec',
  'jan',
  'feb',
  'mar',
  'apr',
  'sep'],
 'poutcome': [nan, 'failure', 'other', 'success']}

# # Replacing unknown to the Nan

In [287]:
#replacing the unknown to NaN
x=x.replace('unknown',np.NaN)

# Checking for Null value

In [288]:
x.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
dtype: int64

# Sorting different features in different list

In [289]:
binary_features=['default','housing','loan']
ordinal_features=['education','month']
nominal_features=['job','marital','contact']

# Creating Binary Function

In [290]:
def binary_encode(df,columns,positive_label):
    df=df.copy()
    for column in columns:
        df[column]=df[column].apply(lambda x:1 if x==positive_label else 0)
    return df

# Applying binary function to the dataset

In [291]:
x=binary_encode(x,binary_features,'yes')

In [292]:
x['education'].unique()

array(['tertiary', 'secondary', nan, 'primary'], dtype=object)

# Creating list of unique features in order

In [293]:
education_ordering=['primary','secondary','tertiary']
month_ordering=['jan','feb','mar','apr','may','jun','jul','aug',
               'sep','oct','nov','dec']

ordinal=[education_ordering,month_ordering]

ordinal

[['primary', 'secondary', 'tertiary'],
 ['jan',
  'feb',
  'mar',
  'apr',
  'may',
  'jun',
  'jul',
  'aug',
  'sep',
  'oct',
  'nov',
  'dec']]

In [294]:
month=sorted(x['month'].unique())
month

['apr',
 'aug',
 'dec',
 'feb',
 'jan',
 'jul',
 'jun',
 'mar',
 'may',
 'nov',
 'oct',
 'sep']

# Creating Ordinal Function

In [295]:

def ordinal_encode(df,columns,orderings):
    df=df.copy()
    for column,ordering in zip(columns,orderings):
        df[column]=df[column].apply(lambda x:ordering.index(x) if str(x)!='nan' else x)
    return df
        
        
    

In [296]:
x=ordinal_encode(x,ordinal_features,ordinal)

In [297]:
x

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,2.0,0,2143,1,0,,5,4,261,1,-1,0,
1,44,technician,single,1.0,0,29,1,0,,5,4,151,1,-1,0,
2,33,entrepreneur,married,1.0,0,2,1,1,,5,4,76,1,-1,0,
3,47,blue-collar,married,,0,1506,1,0,,5,4,92,1,-1,0,
4,33,,single,,0,1,0,0,,5,4,198,1,-1,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,2.0,0,825,0,0,cellular,17,10,977,3,-1,0,
45207,71,retired,divorced,0.0,0,1729,0,0,cellular,17,10,456,2,-1,0,
45208,72,retired,married,1.0,0,5715,0,0,cellular,17,10,1127,5,184,3,success
45209,57,blue-collar,married,1.0,0,668,0,0,telephone,17,10,508,4,-1,0,


# #Creating Onehot Encode Function

In [298]:
def onehot_encode(df,columns):
    for  column in columns:
        dummies=pd.get_dummies(df[column])
        #concating the dummies to the original function 
        df=pd.concat([df,dummies],axis=1)
        df=df.drop(column,axis=1)
    return df

In [299]:
x=onehot_encode(x,nominal_features)

In [300]:
x.isna().sum()

age                  0
education         1857
default              0
balance              0
housing              0
loan                 0
day                  0
month                0
duration             0
campaign             0
pdays                0
previous             0
poutcome         36959
admin.               0
blue-collar          0
entrepreneur         0
housemaid            0
management           0
retired              0
self-employed        0
services             0
student              0
technician           0
unemployed           0
divorced             0
married              0
single               0
cellular             0
telephone            0
dtype: int64

In [301]:
x['education']=x['education'].fillna(x['education'].mean())

In [302]:
x=x.drop('poutcome',axis=1)

In [303]:
x

Unnamed: 0,age,education,default,balance,housing,loan,day,month,duration,campaign,...,self-employed,services,student,technician,unemployed,divorced,married,single,cellular,telephone
0,58,2.000000,0,2143,1,0,5,4,261,1,...,0,0,0,0,0,0,1,0,0,0
1,44,1.000000,0,29,1,0,5,4,151,1,...,0,0,0,1,0,0,0,1,0,0
2,33,1.000000,0,2,1,1,5,4,76,1,...,0,0,0,0,0,0,1,0,0,0
3,47,1.148775,0,1506,1,0,5,4,92,1,...,0,0,0,0,0,0,1,0,0,0
4,33,1.148775,0,1,0,0,5,4,198,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,2.000000,0,825,0,0,17,10,977,3,...,0,0,0,1,0,0,1,0,1,0
45207,71,0.000000,0,1729,0,0,17,10,456,2,...,0,0,0,0,0,1,0,0,1,0
45208,72,1.000000,0,5715,0,0,17,10,1127,5,...,0,0,0,0,0,0,1,0,1,0
45209,57,1.000000,0,668,0,0,17,10,508,4,...,0,0,0,0,0,0,1,0,0,1


# Scaling the dataset

In [304]:
scaler=StandardScaler()
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns)

In [305]:
x

Unnamed: 0,age,education,default,balance,housing,loan,day,month,duration,campaign,...,self-employed,services,student,technician,unemployed,divorced,married,single,cellular,telephone
0,1.606965,1.306477,-0.13549,0.256419,0.893915,-0.436803,-1.298476,-0.475354,0.011016,-0.569351,...,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,-1.356030,-0.262091
1,0.288529,-0.228343,-0.13549,-0.437895,0.893915,-0.436803,-1.298476,-0.475354,-0.416127,-0.569351,...,-0.190234,-0.318082,-0.145557,2.225121,-0.172266,-0.360780,-1.229691,1.592128,-1.356030,-0.262091
2,-0.747384,-0.228343,-0.13549,-0.446762,0.893915,2.289359,-1.298476,-0.475354,-0.707361,-0.569351,...,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,-1.356030,-0.262091
3,0.571051,0.000000,-0.13549,0.047205,0.893915,-0.436803,-1.298476,-0.475354,-0.645231,-0.569351,...,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,-1.356030,-0.262091
4,-0.747384,0.000000,-0.13549,-0.447091,-1.118674,-0.436803,-1.298476,-0.475354,-0.233620,-0.569351,...,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,-1.229691,1.592128,-1.356030,-0.262091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.947747,1.306477,-0.13549,-0.176460,-1.118674,-0.436803,0.143418,2.016333,2.791329,0.076230,...,-0.190234,-0.318082,-0.145557,2.225121,-0.172266,-0.360780,0.813212,-0.628090,0.737447,-0.262091
45207,2.831227,-1.763163,-0.13549,0.120447,-1.118674,-0.436803,0.143418,2.016333,0.768224,-0.246560,...,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,2.771775,-1.229691,-0.628090,0.737447,-0.262091
45208,2.925401,-0.228343,-0.13549,1.429593,-1.118674,-0.436803,0.143418,2.016333,3.373797,0.721811,...,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,0.737447,-0.262091
45209,1.512791,-0.228343,-0.13549,-0.228024,-1.118674,-0.436803,0.143418,2.016333,0.970146,0.399020,...,-0.190234,-0.318082,-0.145557,-0.449414,-0.172266,-0.360780,0.813212,-0.628090,-1.356030,3.815470


In [307]:
y.value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [310]:
label=LabelEncoder()
y=label.fit_transform(y)

# Train Test Split

In [312]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)

In [313]:
print(x_train.shape)
print(x_test.shape)

(31647, 28)
(13564, 28)


# Training the Model

In [314]:
model=LogisticRegression()
model.fit(x_train,y_train)

LogisticRegression()

# Checking the R Score

In [315]:
model.score(x_test,y_test)

0.8881598348569744

In [319]:
model.predict(x_test).sum()

561

In [321]:
len(model.predict(x_test))

13564

In [320]:
len(x_test)

13564

AttributeError: 'numpy.ndarray' object has no attribute 'unique'