# Libraries used

Running Kernel3.9.13 base anaconda

pip install squarify
pip install yellowbrick
pip install plotly
pip install seaborn
pip install lazypredict
pip install pandas_profiling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
import plotly.io as pio
import squarify #treemap
import os
import matplotlib
import warnings

#to enable the inline plotting
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

In [2]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler


from scipy.stats import normaltest

from pandas_profiling import ProfileReport

from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.style.palettes import PALETTES, SEQUENCES, color_palette

import lazypredict

warnings.simplefilter(action='ignore', category=FutureWarning)

# Importing previous dataset

In [3]:
#point to the folder where the data is stored
os.chdir(r"C:\Users\pedro\datathon")

# Loading combined_mod dataset
df_combined_mod = pd.read_csv('df_combined_mod.csv')


In [4]:

#show all columns in pandas
pd.set_option('display.max_columns', None)


In [5]:
df_combined_mod.head(1)

Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Incidence,Province,Town,YearBuilt,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,InspectionDay,gas_natural,Material_Acrylonitrile-Butadiene-Styrene,Material_Copper,Material_Fiberglass-Reinforced Plastic,Material_Polyethylene,Material_Polypropylene,Relative_Thickness,Age_of_Pipe_upon_inspection,Severity_0,Severity_low,Severity_medium,Severity_high
0,189311802,ZRV-00001972,2010,2010-10-01,24,0,Barcelona,Sentmenat,2001,160.0,117.831,4.0,0,0,0,Friday,1,0,0,0,1,0,40.0,9,1,0,0,0


In [6]:
#creating a subset for altering the dataset after initial EDA
df_baseline = df_combined_mod.copy()

# Data Pre-Processing

Machine Learning algorithms don't perform well when the input numerical attributes have very different scales. Since we have very skewed data, __standardization__ is much less affected by outliers and it is the one we will choose __instead of normalization__.


In [7]:
df_baseline.columns

Index(['PipeId', 'MaintenanceId', 'InspectionYear', 'InspectionDate',
       'MonthsLastRev', 'Incidence', 'Province', 'Town', 'YearBuilt',
       'Diameter', 'Length', 'Pressure', 'NumConnections',
       'NumConnectionsUnder', 'BoolBridle', 'InspectionDay', 'gas_natural',
       'Material_Acrylonitrile-Butadiene-Styrene', 'Material_Copper',
       'Material_Fiberglass-Reinforced Plastic', 'Material_Polyethylene',
       'Material_Polypropylene', 'Relative_Thickness',
       'Age_of_Pipe_upon_inspection', 'Severity_0', 'Severity_low',
       'Severity_medium', 'Severity_high'],
      dtype='object')

In [8]:
df_baseline.head(1)

Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Incidence,Province,Town,YearBuilt,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,InspectionDay,gas_natural,Material_Acrylonitrile-Butadiene-Styrene,Material_Copper,Material_Fiberglass-Reinforced Plastic,Material_Polyethylene,Material_Polypropylene,Relative_Thickness,Age_of_Pipe_upon_inspection,Severity_0,Severity_low,Severity_medium,Severity_high
0,189311802,ZRV-00001972,2010,2010-10-01,24,0,Barcelona,Sentmenat,2001,160.0,117.831,4.0,0,0,0,Friday,1,0,0,0,1,0,40.0,9,1,0,0,0


## Using Standard Scaler

In [9]:
#Scaling data in df_baseline dataset except object columns and boolean values
cols_to_scale = ['Diameter', 'Length', 'Pressure', 'NumConnections','NumConnectionsUnder','Relative_Thickness', 'Age_of_Pipe_upon_inspection']
df_baseline[cols_to_scale] = StandardScaler().fit_transform(df_baseline[cols_to_scale])

df_baseline.head()


Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Incidence,Province,Town,YearBuilt,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,InspectionDay,gas_natural,Material_Acrylonitrile-Butadiene-Styrene,Material_Copper,Material_Fiberglass-Reinforced Plastic,Material_Polyethylene,Material_Polypropylene,Relative_Thickness,Age_of_Pipe_upon_inspection,Severity_0,Severity_low,Severity_medium,Severity_high
0,189311802,ZRV-00001972,2010,2010-10-01,24,0,Barcelona,Sentmenat,2001,0.790815,1.169736,0.226834,-0.464026,-0.013887,0,Friday,1,0,0,0,1,0,-0.59848,-0.536658,1,0,0,0
1,189311802,ZRV-00001972,2012,2012-10-01,24,0,Barcelona,Sentmenat,2001,0.790815,1.169736,0.226834,-0.464026,-0.013887,0,Monday,1,0,0,0,1,0,-0.59848,-0.300222,1,0,0,0
2,189311802,ZRV-00001972,2014,2014-10-08,24,0,Barcelona,Sentmenat,2001,0.790815,1.169736,0.226834,-0.464026,-0.013887,0,Wednesday,1,0,0,0,1,0,-0.59848,-0.063787,1,0,0,0
3,189311802,ZRV-00001972,2016,2016-10-14,24,0,Barcelona,Sentmenat,2001,0.790815,1.169736,0.226834,-0.464026,-0.013887,0,Friday,1,0,0,0,1,0,-0.59848,0.172649,1,0,0,0
4,189311802,ZRV-00001972,2018,2018-10-09,24,0,Barcelona,Sentmenat,2001,0.790815,1.169736,0.226834,-0.464026,-0.013887,0,Tuesday,1,0,0,0,1,0,-0.59848,0.409085,1,0,0,0


In [10]:
#converting to absolute values 'NumConnections','NumConnectionsUnder','Relative_Thickness', 'Age_of_Pipe_upon_inspection'
df_baseline['NumConnections'] = df_baseline['NumConnections'].abs()
df_baseline['NumConnectionsUnder'] = df_baseline['NumConnectionsUnder'].abs()
df_baseline['Relative_Thickness'] = df_baseline['Relative_Thickness'].abs()
df_baseline['Age_of_Pipe_upon_inspection'] = df_baseline['Age_of_Pipe_upon_inspection'].abs()

df_baseline.head(1)

Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Incidence,Province,Town,YearBuilt,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,InspectionDay,gas_natural,Material_Acrylonitrile-Butadiene-Styrene,Material_Copper,Material_Fiberglass-Reinforced Plastic,Material_Polyethylene,Material_Polypropylene,Relative_Thickness,Age_of_Pipe_upon_inspection,Severity_0,Severity_low,Severity_medium,Severity_high
0,189311802,ZRV-00001972,2010,2010-10-01,24,0,Barcelona,Sentmenat,2001,0.790815,1.169736,0.226834,0.464026,0.013887,0,Friday,1,0,0,0,1,0,0.59848,0.536658,1,0,0,0


# SMOTE to tackle the unbalanced dataset problem

In [11]:
#sow how many records does Incident have on df_baseline
df_baseline['Incidence'].value_counts()


0    6103397
1      11521
Name: Incidence, dtype: int64

In [12]:
#using SMOTE to balance the dataset creating a subset dataset with target y = 'Incidence' and removing Incidence','MonthsLastRev','InspectionDay','PipeId', 'MaintenanceId', 'InspectionYear', 'InspectionDate',  'Province', 'Town', 'YearBuilt' for x
from imblearn.over_sampling import SMOTE
X = df_baseline.drop(['Incidence','MonthsLastRev','InspectionDay','PipeId', 'MaintenanceId', 'InspectionYear', 'InspectionDate',  'Province', 'Town', 'YearBuilt'], axis=1)
y = df_baseline['Incidence']
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
X_res.shape,y_res.shape

#create a pandas dataframe with the new balanced dataset
df_baseline_balanced = pd.DataFrame(X_res, columns=X.columns)
df_baseline_balanced['Incidence'] = y_res
df_baseline_balanced.head()


Unnamed: 0,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Material_Acrylonitrile-Butadiene-Styrene,Material_Copper,Material_Fiberglass-Reinforced Plastic,Material_Polyethylene,Material_Polypropylene,Relative_Thickness,Age_of_Pipe_upon_inspection,Severity_0,Severity_low,Severity_medium,Severity_high,Incidence
0,0.790815,1.169736,0.226834,0.464026,0.013887,0,1,0,0,0,1,0,0.59848,0.536658,1,0,0,0,0
1,0.790815,1.169736,0.226834,0.464026,0.013887,0,1,0,0,0,1,0,0.59848,0.300222,1,0,0,0,0
2,0.790815,1.169736,0.226834,0.464026,0.013887,0,1,0,0,0,1,0,0.59848,0.063787,1,0,0,0,0
3,0.790815,1.169736,0.226834,0.464026,0.013887,0,1,0,0,0,1,0,0.59848,0.172649,1,0,0,0,0
4,0.790815,1.169736,0.226834,0.464026,0.013887,0,1,0,0,0,1,0,0.59848,0.409085,1,0,0,0,0


In [13]:
df_baseline_balanced['Incidence'].value_counts()

0    6103397
1    6103397
Name: Incidence, dtype: int64

In [14]:
df_baseline_balanced.shape

(12206794, 19)

In [15]:
df_baseline.shape

(6114918, 28)

We lose some features with this method

In [16]:
df_baseline_balanced.head(1)

Unnamed: 0,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Material_Acrylonitrile-Butadiene-Styrene,Material_Copper,Material_Fiberglass-Reinforced Plastic,Material_Polyethylene,Material_Polypropylene,Relative_Thickness,Age_of_Pipe_upon_inspection,Severity_0,Severity_low,Severity_medium,Severity_high,Incidence
0,0.790815,1.169736,0.226834,0.464026,0.013887,0,1,0,0,0,1,0,0.59848,0.536658,1,0,0,0,0


In [17]:
#create polinomial features in df_baseline_balanced dataset for diameter, length and pressure
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
df_baseline_balanced['Diameter2'] = poly.fit_transform(df_baseline_balanced[['Diameter']])[:,2]
df_baseline_balanced['Length2'] = poly.fit_transform(df_baseline_balanced[['Length']])[:,2]
df_baseline_balanced['Pressure2'] = poly.fit_transform(df_baseline_balanced[['Pressure']])[:,2]
df_baseline_balanced.head(1)

Unnamed: 0,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Material_Acrylonitrile-Butadiene-Styrene,Material_Copper,Material_Fiberglass-Reinforced Plastic,Material_Polyethylene,Material_Polypropylene,Relative_Thickness,Age_of_Pipe_upon_inspection,Severity_0,Severity_low,Severity_medium,Severity_high,Incidence,Diameter2,Length2,Pressure2
0,0.790815,1.169736,0.226834,0.464026,0.013887,0,1,0,0,0,1,0,0.59848,0.536658,1,0,0,0,0,0.625388,1.368283,0.051454


In [19]:
#use stratified shuffle split to split df_baseline_balanced into train and validation sets in a balanced way
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(df_baseline_balanced, df_baseline_balanced['Incidence']):
    strat_train_set = df_baseline_balanced.loc[train_index]
    strat_validation_set = df_baseline_balanced.loc[test_index]
strat_validation_set['Incidence'].value_counts()/len(strat_train_set)

strat_validation_set['Incidence'].value_counts()/len(strat_validation_set)

0    0.5
1    0.5
Name: Incidence, dtype: float64

In [20]:
#show the number of records for Incidence in each set
strat_train_set['Incidence'].value_counts()


1    4882718
0    4882717
Name: Incidence, dtype: int64

In [22]:
strat_validation_set['Incidence'].value_counts()


0    1220680
1    1220679
Name: Incidence, dtype: int64

We take out the validation set for now and NEVER LOOK AT IT!!

Let's recap, by now we have done:

- Data standardization for non boolean values

- SMOTE for balancing the dataset

    - Removed all categortical variables in the process

- Created polinomial features for length, diameter and pressure

-Used Stratified Shuffle Split to divide our huge now balanced dataset (at expense of duplicating rows)

    -We now have a subset to use for training and validation
    
        -The training subset (strat_train_set) is going to be subdivided with a normal split

            -train
            -test

        -Idea is to iterate models between train and test subsplits and final validate against validation


In [23]:
#split the strat_train_set into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(strat_train_set.drop(['Incidence'], axis=1), strat_train_set['Incidence'], test_size=0.20, random_state=42)
#show shape of train and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((7812348, 21), (1953087, 21), (7812348,), (1953087,))

# Baseline