In [1]:
# import Data Manipulation Libraries
import numpy as np
import pandas as pd

#Import Data Visualization Libraries 
import matplotlib.pyplot as plt
import seaborn as sns

#Import Filter Warning Libraries
import warnings
warnings.filterwarnings('ignore')

#Import Logging Files
import logging
logging.basicConfig(level= logging.INFO,
                    filemode= 'w',
                    filename= 'model.log',
                    format= '%(asctime)s - %(levelname)s-%(message)s', force = True)

#Import Stats Scipy Library
import scipy.stats as stats 

In [2]:
#Data Import Using Pandas Function

url = 'https://raw.githubusercontent.com/vinnithakur/BANKTELEMARKETING_MLMODEL/refs/heads/main/BankTelemarketing.csv'

df =  pd.read_csv(url,sep = ';')

df.sample(frac =1)  #Shuffle Dataset

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
29908,60,retired,married,secondary,no,6132,no,no,cellular,4,feb,266,2,-1,0,unknown,no
20156,45,technician,married,secondary,no,1395,yes,no,cellular,11,aug,243,4,-1,0,unknown,no
17177,39,management,single,tertiary,no,168,no,no,cellular,28,jul,56,9,-1,0,unknown,no
15066,39,management,married,primary,no,3760,no,yes,cellular,17,jul,262,1,-1,0,unknown,no
13975,36,blue-collar,married,secondary,no,576,yes,no,cellular,10,jul,290,2,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36752,47,services,married,secondary,no,367,yes,no,cellular,12,may,309,1,306,4,success,yes
8989,38,admin.,divorced,secondary,no,0,no,no,unknown,5,jun,56,1,-1,0,unknown,no
8191,47,management,divorced,secondary,no,1208,yes,no,unknown,2,jun,339,7,-1,0,unknown,no
14795,35,management,single,tertiary,no,-353,yes,no,cellular,16,jul,70,1,-1,0,unknown,no


In [3]:
#Lock file
logging.info('Dataset Uploaded Successfully.....')

In [4]:
#Checking Data Shape and Data Information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
logging.info('Dataset Uploaded Successfully.')

In [6]:
# Split the Dataset into Numerical_Data and Categorical_Data
Numerical_Data = df.select_dtypes(exclude = 'object')

Categorical_Data = df.select_dtypes(include = 'object')

In [7]:
# Checking Numerical_Data
Numerical_Data

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0
...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0
45207,71,1729,17,456,2,-1,0
45208,72,5715,17,1127,5,184,3
45209,57,668,17,508,4,-1,0


In [8]:
#Checking Categorical_Data
Categorical_Data

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome,y
0,management,married,tertiary,no,yes,no,unknown,may,unknown,no
1,technician,single,secondary,no,yes,no,unknown,may,unknown,no
2,entrepreneur,married,secondary,no,yes,yes,unknown,may,unknown,no
3,blue-collar,married,unknown,no,yes,no,unknown,may,unknown,no
4,unknown,single,unknown,no,no,no,unknown,may,unknown,no
...,...,...,...,...,...,...,...,...,...,...
45206,technician,married,tertiary,no,no,no,cellular,nov,unknown,yes
45207,retired,divorced,primary,no,no,no,cellular,nov,unknown,yes
45208,retired,married,secondary,no,no,no,cellular,nov,success,yes
45209,blue-collar,married,secondary,no,no,no,telephone,nov,unknown,no


In [9]:
# Checking Descriptive Stats : Numerical_Data and Categorical_Data
from collections import OrderedDict

stats = []

for i in Numerical_Data:
    
    numerical_stats = OrderedDict({
        'Feature': i,
        'Maximum': Numerical_Data[i].max(),
        'Minimum': Numerical_Data[i].min(),
        '25%': Numerical_Data[i].quantile(0.25),
        'Median' : Numerical_Data[i].quantile(0.50),
        '75%': Numerical_Data[i].quantile(0.75),
        'Kurtosis': Numerical_Data[i].kurt(),
        'Skewness': Numerical_Data[i].skew(),
        'Standard Deviation': Numerical_Data[i].std()
    })
    
    stats.append(numerical_stats)
    
    report = pd.DataFrame(stats)
    
report

Unnamed: 0,Feature,Maximum,Minimum,25%,Median,75%,Kurtosis,Skewness,Standard Deviation
0,age,95,18,33.0,39.0,48.0,0.31957,0.684818,10.618762
1,balance,102127,-8019,72.0,448.0,1428.0,140.751547,8.360308,3044.765829
2,day,31,1,8.0,16.0,21.0,-1.059897,0.093079,8.322476
3,duration,4918,0,103.0,180.0,319.0,18.153915,3.144318,257.527812
4,campaign,63,1,1.0,2.0,3.0,39.249651,4.89865,3.098021
5,pdays,871,-1,-1.0,-1.0,-1.0,6.935195,2.615715,100.128746
6,previous,275,0,0.0,0.0,0.0,4506.86066,41.846454,2.303441


In [10]:
logging.info('The Above Dataset is Non Normal Distributed..')

In [11]:
#  Checking Categorical Dataset Stats
for i in Categorical_Data:
    print(Categorical_Data[i].value_counts())
    print('*'*40)

job
blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: count, dtype: int64
****************************************
marital
married     27214
single      12790
divorced     5207
Name: count, dtype: int64
****************************************
education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64
****************************************
default
no     44396
yes      815
Name: count, dtype: int64
****************************************
housing
yes    25130
no     20081
Name: count, dtype: int64
****************************************
loan
no     37967
yes     7244
Name: count, dtype: int64
****************************************
contact
cellular     29285
unknown      13020
telephone     2906
Name: count, dtype

In [12]:
# As the Categorical columns not showing any specific order so Label Encoding Technique is Recommended.
# The Numerical Columns Shows Non Normal Distribution, Hence Tree Based Algorithms are Suggested.

In [13]:
# Checking missing information if any using Graphical Method

df.isnull().sum().plot(kind = 'barh')
9plt.show()

SyntaxError: invalid decimal literal (565405262.py, line 4)

The Dataset Contains No missing Values

In [None]:
#Checking Output i.e. Target Column for Data Distribution
df['y'].value_counts()

In [None]:
df['y'] = df['y'].replace({'no':0,'yes':1})

In [None]:
df['y'].value_counts()

In [None]:
df

In [None]:
Categorical_Data

In [None]:
Categorical_Data.default.value_counts()

In [14]:
#Encoding Loan  Column
# No:0 and Yes: 1
df['loan'] = df['loan'].replace({'no':0,'yes':1})

In [None]:
#Encoding Default Column
# No:0 and Yes: 1
df['default'] = df['default'].replace({'no':0,'yes':1})

In [None]:
df.shape

In [None]:
df = pd.get_dummies(df,dtype = int)

In [None]:
df.shape

In [15]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,0,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,0,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,1,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,0,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,0,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,0,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,0,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,0,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,0,telephone,17,nov,508,4,-1,0,unknown,no


In [None]:
df.housing.value_counts()