# Data modelling libraries

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier ,RandomForestClassifier ,GradientBoostingClassifier
from xgboost import XGBClassifier 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge,Lasso
from sklearn.metrics import roc_auc_score ,mean_squared_error,accuracy_score,classification_report,roc_curve,confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from scipy.stats.mstats import winsorize
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns',None)

# <b>Data importing and pre-processing</b>

### Importing the train.csv dataset


In [13]:
# Location of the training data, relative to this notebook
path = '../data/train.csv'

# Read the CSV and discard the Id column in a single chained expression
dataframe = pd.read_csv(path, delimiter=',').drop('Id', axis=1)

# Report the (rows, columns) of the loaded frame
print('Shape of the data is: ',dataframe.shape)

# Last expression of the cell: display the first rows
dataframe.head()

Shape of the data is:  (32950, 21)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,49,blue-collar,married,basic.9y,unknown,no,no,cellular,nov,wed,227,4,999,0,nonexistent,-0.1,93.2,-42.0,4.12,5195.8,no
1,37,entrepreneur,married,university.degree,no,no,no,telephone,nov,wed,202,2,999,1,failure,-0.1,93.2,-42.0,4.12,5195.8,no
2,78,retired,married,basic.4y,no,no,no,cellular,jul,mon,1148,1,999,0,nonexistent,-1.7,94.215,-40.3,0.87,4991.6,yes
3,36,admin.,married,university.degree,no,yes,no,telephone,may,mon,120,2,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,59,retired,divorced,university.degree,no,no,no,cellular,jun,tue,368,2,999,0,nonexistent,-2.9,92.963,-40.8,1.262,5076.2,no


### <b>Check Numeric and Categorical Records</b>

The dataset includes both numerical and categorical columns. It is important to verify the datatype of each feature, because some numerical values may be stored as strings and some categorical values may be stored with a non-string datatype.

In [14]:
# Function to identify numeric features
def numeric_features(dataset):
    """Return the first five rows of the numeric columns of ``dataset``.

    Columns are selected with ``select_dtypes(include=np.number)``; the
    result is a preview frame, not a list of column names.
    """
    numeric_only = dataset.select_dtypes(include=np.number)
    selected = list(numeric_only.columns)
    preview = dataset[selected].head()
    return preview

# Preview of the numeric columns, printed under a heading and followed by a divider line
numeric_columns = numeric_features(dataframe)
print("Numeric Features:", numeric_columns, "===="*20, sep="\n")



# Function to identify categorical features
def categorical_features(dataset):
    """Return the first five rows of the non-numeric columns of ``dataset``.

    Columns are selected with ``select_dtypes(exclude=np.number)``; the
    result is a preview frame, not a list of column names.
    """
    non_numeric = dataset.select_dtypes(exclude=np.number)
    selected = list(non_numeric.columns)
    preview = dataset[selected].head()
    return preview

# Preview of the non-numeric columns, printed under a heading
categorical_columns = categorical_features(dataframe)
print("Categorical Features:", categorical_columns, sep="\n")


# Function to check the datatypes of all the columns:
def check_datatypes(dataset):
    """Return the dtype of every column of ``dataset`` (its ``dtypes`` attribute)."""
    column_types = dataset.dtypes
    return column_types


# Last expression in the cell, so the per-column dtypes are shown as the cell output.
check_datatypes(dataframe)

Numeric Features:
   age  duration  campaign  pdays  previous  emp.var.rate  cons.price.idx  \
0   49       227         4    999         0          -0.1          93.200   
1   37       202         2    999         1          -0.1          93.200   
2   78      1148         1    999         0          -1.7          94.215   
3   36       120         2    999         0           1.1          93.994   
4   59       368         2    999         0          -2.9          92.963   

   cons.conf.idx  euribor3m  nr.employed  
0          -42.0      4.120       5195.8  
1          -42.0      4.120       5195.8  
2          -40.3      0.870       4991.6  
3          -36.4      4.857       5191.0  
4          -40.8      1.262       5076.2  
Categorical Features:
            job   marital          education  default housing loan    contact  \
0   blue-collar   married           basic.9y  unknown      no   no   cellular   
1  entrepreneur   married  university.degree       no      no   no  telephone

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

### <b>Check for Missing Data</b>
<ul>
<li>In the function below, we calculate the total number of missing values and the fraction of missing values (between 0 and 1) in every feature of the dataset.</li>
<li>The function returns a dataframe with the feature names as its index and two columns, <code>Total</code> and <code>Percent</code>, holding the count and the fraction of missing values in each feature.</li>
</ul>

In [15]:
# Function to identify the number of missing values in every feature
def missing_data(dataset):
    """Summarise missing values per column.

    Returns a DataFrame indexed by column name with two columns:
    'Total' (number of nulls) and 'Percent' (nulls divided by row count,
    i.e. a fraction, not multiplied by 100). Each column is sorted in
    descending order before the two are concatenated.
    """
    null_mask = dataset.isnull()
    null_counts = null_mask.sum()
    row_counts = null_mask.count()

    total = null_counts.sort_values(ascending=False)
    percent = (null_counts / row_counts).sort_values(ascending=False)

    summary = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    return summary
    
    
# NOTE(review): this assignment rebinds the name `missing_data` from the function
# to the summary DataFrame it returns, so the function cannot be called again
# until its `def` is re-executed. The DataFrame is kept under this name because
# a later cell passes `missing_data` to `drop_missing`.
missing_data = missing_data(dataframe)
print(missing_data)

                Total  Percent
age                 0      0.0
campaign            0      0.0
nr.employed         0      0.0
euribor3m           0      0.0
cons.conf.idx       0      0.0
cons.price.idx      0      0.0
emp.var.rate        0      0.0
poutcome            0      0.0
previous            0      0.0
pdays               0      0.0
duration            0      0.0
job                 0      0.0
day_of_week         0      0.0
month               0      0.0
contact             0      0.0
loan                0      0.0
housing             0      0.0
default             0      0.0
education           0      0.0
marital             0      0.0
y                   0      0.0


### <b>If there were missing values</b>
<ul>
<li>Now you have the number and percentage of missing values in every feature, from the previous function.</li>
<li>Using this information, you can decide how large a share of missing values is acceptable before a feature should be removed.</li>
<li>The function below takes a threshold of your choice and removes the features whose share of missing values is greater than this threshold. It takes three parameters: the dataframe, the missing-data dataframe and the threshold value (a fraction, e.g. 0.60 for 60%).</li>
</ul>

In [16]:
# Function to drop missing values
def drop_missing(dataset, missing,value):
    """Drop the features whose share of missing values exceeds a threshold.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to prune. It is not modified in place; a new frame is returned.
    missing : pandas.DataFrame
        Per-feature summary indexed by feature name, with a 'Percent' column.
    value : float
        Threshold compared against ``missing['Percent']``; features whose
        value is strictly greater are dropped.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` without the features above the threshold.

    Side effects
    ------------
    Prints the per-column null counts of the result, sorted descending.
    """
    over_threshold = missing.loc[missing['Percent'] > value].index
    # errors='ignore': the summary may list columns that were already removed
    # (e.g. when the calling cell is re-run on the pruned frame); skipping
    # them keeps this step idempotent instead of raising KeyError.
    dataset = dataset.drop(columns=over_threshold, errors='ignore')
    print(dataset.isnull().sum().sort_values(ascending = False))
    return dataset
    
# Drop every feature whose 'Percent' value in the missing-data summary exceeds 0.60,
# and rebind `dataframe` to the pruned frame.
dataframe = drop_missing(dataframe,missing_data,0.60)

age               0
campaign          0
nr.employed       0
euribor3m         0
cons.conf.idx     0
cons.price.idx    0
emp.var.rate      0
poutcome          0
previous          0
pdays             0
duration          0
job               0
day_of_week       0
month             0
contact           0
loan              0
housing           0
default           0
education         0
marital           0
y                 0
dtype: int64


### <b>Check for class Imbalance</b>

Machine learning algorithms are often biased towards the majority class and struggle to classify the minority class when the data is imbalanced, because many of them implicitly assume that the classes are roughly equally represented. To avoid this issue, it's crucial to identify and address class imbalance.

In [21]:
def class_imbalance(target):
    """Return each class's share of ``target`` as a percentage.

    The counts come from ``value_counts()`` and are divided by their own
    sum before being multiplied by 100.
    """
    counts = target.value_counts()
    shares = counts / counts.sum()
    return shares * 100

# Percentage share of each class in the target column 'y'; displayed as the cell output.
class_imbalance(dataframe['y'])

no     88.734446
yes    11.265554
Name: y, dtype: float64

### <b>Detect Outliers</b>