# Libraries used

Running Kernel3.9.13 base anaconda

pip install squarify
pip install yellowbrick
pip install plotly
pip install seaborn
pip install lazypredict
pip install pandas_profiling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
import plotly.io as pio
import squarify #treemap
import os
import matplotlib
import warnings

#to enable the inline plotting
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

In [2]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler


from scipy.stats import normaltest

from pandas_profiling import ProfileReport

from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.style.palettes import PALETTES, SEQUENCES, color_palette

import lazypredict

warnings.simplefilter(action='ignore', category=FutureWarning)
#pandas show all columns
pd.set_option('display.max_columns', None)

# Functions


In [3]:
# Function for EDA. Using the display() function to have  well-formatted tables. We are mainly using pandas to explore the datasets

def dataset_description(df_target):

    print('This is the Dataset shape: %s\n' % (df_target.shape, ))
    print('Dataset columns: %s\n' % df_target.columns)

    print('\nColumns description:\n')
    display(df_target.info())
    display(df_target.describe())  # describe the dataset

    print('\nNull values:\n')
    display(df_target.isnull().sum())  # Identify null values

#function performing a quick check on df_inspection with a function to have best of pandas functions separated by a line
def quick_check(dataframe):
    print('First 5 rows %s\n')
    print(dataframe.head(2))
    print("=====================================")
    print('Dataframe shape %s\n')
    print(dataframe.shape)
    print("=====================================")
    print('Dataframe describe categorical %s\n')
    print(dataframe.describe(include=['O']))
    print("=====================================")
    print('Dataframe null values %s\n')
    print(dataframe.isnull().sum())
    print("=====================================")
    print('Dataframe value counts %s\n')
    print(dataframe.value_counts())
    print("=====================================")

def stats(dataframe):
    print('Dataframe correlation %s\n')
    print(dataframe.corr())
    print("=====================================")
    print('Dataframe covariance %s\n')
    print(dataframe.cov())
    print("=====================================")
    print('Dataframe skew %s\n')
    print(dataframe.skew())
    print("=====================================")
    print('Dataframe kurtosis %s\n')
    print(dataframe.kurt())
    print("=====================================")

#create a function to normalize characters from a dataset's column in Spanish
def normalize_characters(df, column):
    df[column] = df[column].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    df[column] = df[column].str.lower()
    df[column] = df[column].str.replace('á', 'a')
    df[column] = df[column].str.replace('é', 'e')
    df[column] = df[column].str.replace('í', 'i')
    df[column] = df[column].str.replace('ó', 'o')
    df[column] = df[column].str.replace('ú', 'u')
    df[column] = df[column].str.replace('ñ', 'n')
    df[column] = df[column].str.replace('ü', 'u')
    df[column] = df[column].str.replace('ç', 'c')
    df[column] = df[column].str.replace('(', '')
    df[column] = df[column].str.replace(')', '')
    df[column] = df[column].str.replace('\'', '')
    df[column] = df[column].str.replace('´', '')
    df[column] = df[column].str.replace('`', '')
    df[column] = df[column].str.replace('’', '')
    return df

#create function to change detypes in64 to int32 in a df
def change_dtypes(df):
    for col in df.columns:
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
        elif df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
    return df

#make a function for replacing the null values in df_combined_mod dataframe with a string of my choice
def replace_nulls(df, column, string):
    df[column] = df[column].fillna(string)
    return df

In [4]:
#show all print outputs when using a function
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# Importing df from previous notebook

In [5]:
#point to the folder where the data is stored
os.chdir(r"C:\Users\pedro\datathon")

# Loading data from previous notebook
df_combined_mod = pd.read_csv('df_combined_mod.csv', index_col=0)
df_combined_mod.head(2)


Unnamed: 0_level_0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,Province,Town,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural
PipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
56922465,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,Valencia,Betera,1993,PE,63.0,1.778,4.0,0,0,0,1
188341482,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,Barcelona,Sabadell,1995,PE,200.0,34.96,0.025,0,0,0,1


In [6]:
#how much RAM is being used
import psutil
psutil.virtual_memory()

svmem(total=34272403456, available=28319248384, percent=17.4, used=5953155072, free=28319248384)

# Adding a new dataset
We want to extract value of two categorical variables, Town and Province but the way we have them now they are useless.

We will add a new dataset to join and extract the surface of each town as well as the comunidad autonoma to group by accordingly in another column the number of towns and afterwards hot encode


In [7]:
#count towns populating Town column
df_combined_mod['Town'].value_counts()
df_combined_mod['Town'].unique()

Madrid             75616
Barcelona          58408
Valencia           25405
Sevilla            22491
Terrassa           16573
                   ...  
Villarmayor            1
Alcaraz                1
Rodeiro                1
Vedra                  1
Ituero de Azaba        1
Name: Town, Length: 1960, dtype: int64

array(['Betera', 'Sabadell', 'Valencia', ..., 'Villabrazaro',
       'Arenzana de Abajo', 'Camprovin'], dtype=object)

In [8]:
#point to the folder where the data is stored
os.chdir(r"C:\Users\pedro\datathon\base\complementary_datasets")

# Loading combined_mod dataset
df_mun = pd.read_excel('list-mun-2012.xls' , sheet_name='list-mun')

#df_population= pd.read_excel('pobmun20.xls')


In [9]:
df_mun.head(1)

Unnamed: 0,codine,Municipio,Superficie,Capitalidad,Año,CA,Autonomía,CP,Provincia,CPJ,Partido_Judicial
0,1001,Alegría-Dulantzi,19.95,Alegría-Dulantzi,,16,País Vasco,1,Araba/Álava,102,Vitoria-Gasteiz


In [10]:
normalize_characters(df_mun, 'Municipio')

Unnamed: 0,codine,Municipio,Superficie,Capitalidad,Año,CA,Autonomía,CP,Provincia,CPJ,Partido_Judicial
0,1001,alegria-dulantzi,19.95,Alegría-Dulantzi,,16,País Vasco,1,Araba/Álava,102,Vitoria-Gasteiz
1,1002,amurrio,96.36,Amurrio,,16,País Vasco,1,Araba/Álava,101,Amurrio
2,1003,aramaio,73.27,Ibarra,,16,País Vasco,1,Araba/Álava,102,Vitoria-Gasteiz
3,1004,artziniega,27.45,Artziniega,,16,País Vasco,1,Araba/Álava,101,Amurrio
4,1006,arminon,12.94,Armiñón,,16,País Vasco,1,Araba/Álava,102,Vitoria-Gasteiz
...,...,...,...,...,...,...,...,...,...,...,...
8111,50901,biel,130.73,Biel,,2,Aragón,50,Zaragoza,5005,Ejea de los Caballeros
8112,50902,marracos,16.92,Marracos,1998.0,2,Aragón,50,Zaragoza,5005,Ejea de los Caballeros
8113,50903,villamayor de gallego,89.36,Villamayor ...,2006.0,2,Aragón,50,Zaragoza,5003,Zaragoza
8114,51001,ceuta,19.48,Ceuta,,18,Ceuta (Ciudad de),51,Ceuta,5112,Ceuta


In [11]:
normalize_characters(df_combined_mod, 'Town')

Unnamed: 0_level_0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,Province,Town,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural
PipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
56922465,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,Valencia,betera,1993,PE,63.0,1.778,4.000,0,0,0,1
188341482,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,Barcelona,sabadell,1995,PE,200.0,34.960,0.025,0,0,0,1
189485681,6,0,0.0,0.0,Thursday,2020,2020-12-31,23,0.0,4,0,Valencia,betera,1950,AO,50.8,16.423,4.000,0,0,0,1
189485654,6,0,0.0,0.0,Thursday,2020,2020-12-31,23,0.0,4,0,Valencia,betera,1950,AO,50.8,11.443,4.000,0,0,0,1
274990283,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,Barcelona,sabadell,2005,PE,160.0,10.377,0.025,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189142507,1,0,0.0,0.0,Wednesday,2010,2010-10-06,24,0.0,4,0,Tarragona,amposta,2000,PE,110.0,0.694,0.150,0,0,0,1
189141476,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,Tarragona,calafell,2000,PE,110.0,1.188,0.150,0,0,0,1
324551020,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,Barcelona,sentmenat,2008,PE,110.0,0.802,0.100,0,0,0,1
190908195,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,Alicante,alicante/alacant,2004,PE,200.0,0.999,0.150,0,0,0,1


In [12]:
normalize_characters(df_combined_mod, 'Province')


Unnamed: 0_level_0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,Province,Town,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural
PipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
56922465,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,valencia,betera,1993,PE,63.0,1.778,4.000,0,0,0,1
188341482,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,barcelona,sabadell,1995,PE,200.0,34.960,0.025,0,0,0,1
189485681,6,0,0.0,0.0,Thursday,2020,2020-12-31,23,0.0,4,0,valencia,betera,1950,AO,50.8,16.423,4.000,0,0,0,1
189485654,6,0,0.0,0.0,Thursday,2020,2020-12-31,23,0.0,4,0,valencia,betera,1950,AO,50.8,11.443,4.000,0,0,0,1
274990283,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,barcelona,sabadell,2005,PE,160.0,10.377,0.025,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189142507,1,0,0.0,0.0,Wednesday,2010,2010-10-06,24,0.0,4,0,tarragona,amposta,2000,PE,110.0,0.694,0.150,0,0,0,1
189141476,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,tarragona,calafell,2000,PE,110.0,1.188,0.150,0,0,0,1
324551020,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,barcelona,sentmenat,2008,PE,110.0,0.802,0.100,0,0,0,1
190908195,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,alicante,alicante/alacant,2004,PE,200.0,0.999,0.150,0,0,0,1


In [13]:
#join df_combined_mod and df_mun on Town column and Municipio column
df_combined_mod = pd.merge(df_combined_mod, df_mun, how='left', left_on='Town', right_on='Municipio')
df_combined_mod.head(2)

Unnamed: 0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,Province,Town,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,codine,Municipio,Superficie,Capitalidad,Año,CA,Autonomía,CP,Provincia,CPJ,Partido_Judicial
0,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,valencia,betera,1993,PE,63.0,1.778,4.0,0,0,0,1,46070.0,betera,75.1,Bétera,,10.0,Comunidad Valenciana,46.0,Valencia/València,4601.0,Llíria
1,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,barcelona,sabadell,1995,PE,200.0,34.96,0.025,0,0,0,1,8187.0,sabadell,37.53,Sabadell,,9.0,Cataluña,8.0,Barcelona,813.0,Sabadell


In [14]:
#place Province and Town Columns at the end of the dataframe
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('Province'))
cols.pop(cols.index('Town'))
df_combined_mod = df_combined_mod[cols+['Province','Town']]
df_combined_mod.tail(2)

'Province'

'Town'

Unnamed: 0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,codine,Municipio,Superficie,Capitalidad,Año,CA,Autonomía,CP,Provincia,CPJ,Partido_Judicial,Province,Town
1436390,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,2004,PE,200.0,0.999,0.15,0,0,0,1,,,,,,,,,,,,alicante,alicante/alacant
1436391,1,0,0.0,0.0,Friday,2010,2010-10-01,21,0.0,4,0,2009,PE,90.0,1.101,0.15,0,0,0,1,43037.0,calafell,20.18,Calafell,,9.0,Cataluña,43.0,Tarragona,4301.0,"Vendrell, El",tarragona,calafell


In [15]:
df_mun.columns

Index(['codine', 'Municipio', 'Superficie', 'Capitalidad', 'Año', 'CA',
       'Autonomía', 'CP', 'Provincia', 'CPJ', 'Partido_Judicial'],
      dtype='object')

In [16]:
#deleting 'CP', 'Provincia', 'CPJ', 'Partido_Judicial', 'Capitalidad', 'Año', 'CA', 'codine', 'Municipio' from df_combined_mod dataframe
df_combined_mod = df_combined_mod.drop(['CP', 'Provincia', 'CPJ', 'Partido_Judicial', 'Capitalidad', 'Año', 'CA', 'codine', 'Municipio'], axis=1)
df_combined_mod.head(2)


Unnamed: 0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Superficie,Autonomía,Province,Town
0,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,1993,PE,63.0,1.778,4.0,0,0,0,1,75.1,Comunidad Valenciana,valencia,betera
1,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,1995,PE,200.0,34.96,0.025,0,0,0,1,37.53,Cataluña,barcelona,sabadell


In [17]:
#count nulls in each column for df_combined_mod dataframe
df_combined_mod.isnull().sum()


Inspections                     0
No_Incidents                    0
Risk_S*I/Inspections            0
leakage_estimate_factor         0
InspectionDay                   0
InspectionYear                  0
InspectionDate                  0
MonthsLastRev                   0
Risk_S*I                        0
Severity                        0
Incidence                       0
YearBuilt                       0
Material                        0
Diameter                        0
Length                          0
Pressure                        0
NumConnections                  0
NumConnectionsUnder             0
BoolBridle                      0
gas_natural                     0
Superficie                 179644
Autonomía                  179644
Province                        0
Town                            0
dtype: int64

In [18]:
#delete column "Superficie" from df_combined_mod dataframe
df_combined_mod = df_combined_mod.drop(['Superficie'], axis=1)

In [19]:
#for Province with value tarragona, replace null values in Autonomia with Cataluña
df_combined_mod.loc[df_combined_mod['Province'] == 'barcelona', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'barcelona', 'Autonomía'].fillna('Cataluña')
df_combined_mod.loc[df_combined_mod['Province'] == 'tarragona', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'tarragona', 'Autonomía'].fillna('Cataluña')
df_combined_mod.loc[df_combined_mod['Province'] == 'lleida', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'lleida', 'Autonomía'].fillna('Cataluña')
df_combined_mod.loc[df_combined_mod['Province'] == 'girona', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'girona', 'Autonomía'].fillna('Cataluña')



In [20]:
#count nulls in Autonomia column in df_combined_mod dataframe
df_combined_mod['Autonomía'].isnull().sum()

86507

In [21]:
#for Province with value "La Rioja", replace null values in Autonomía with "Rioja (La)"
df_combined_mod.loc[df_combined_mod['Province'] == 'la rioja', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'la rioja', 'Autonomía'].fillna('Rioja (La)')


In [22]:
#for Province with value Orense, replace null values in Autonomía with "Galicia"
df_combined_mod.loc[df_combined_mod['Province'] == 'orense', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'orense', 'Autonomía'].fillna('Galicia')
df_combined_mod.loc[df_combined_mod['Province'] == 'la coruna', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'la coruna', 'Autonomía'].fillna('Galicia')
df_combined_mod.loc[df_combined_mod['Province'] == 'pontevedra', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'pontevedra', 'Autonomía'].fillna('Galicia')
df_combined_mod.loc[df_combined_mod['Province'] == 'lugo', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'lugo', 'Autonomía'].fillna('Galicia')


In [23]:
#for Province with value Zaragoza, replace null values in Autonomía with "Aragón"
df_combined_mod.loc[df_combined_mod['Province'] == 'zaragoza', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'zaragoza', 'Autonomía'].fillna('Aragón')
df_combined_mod.loc[df_combined_mod['Province'] == 'huesca', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'huesca', 'Autonomía'].fillna('Aragón')

In [24]:
#for Province with value Toledo, replace null values in Autonomía with "Castilla-La Mancha"
df_combined_mod.loc[df_combined_mod['Province'] == 'toledo', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'toledo', 'Autonomía'].fillna('Castilla-La Mancha')
df_combined_mod.loc[df_combined_mod['Province'] == 'albacete', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'albacete', 'Autonomía'].fillna('Castilla-La Mancha')
df_combined_mod.loc[df_combined_mod['Province'] == 'ciudad real', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'ciudad real', 'Autonomía'].fillna('Castilla-La Mancha')
df_combined_mod.loc[df_combined_mod['Province'] == 'cuenca', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'cuenca', 'Autonomía'].fillna('Castilla-La Mancha')


In [25]:
#for Province with value Almeria, replace null values in Autonomía with "Andalucía"
df_combined_mod.loc[df_combined_mod['Province'] == 'almeria', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'almeria', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'sevilla', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'sevilla', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'jaen', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'jaen', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'cordoba', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'cordoba', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'cadiz', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'cadiz', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'granada', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'granada', 'Autonomía'].fillna('Andalucía')



In [26]:
#for Province with value Zamora, replace null values in Autonomía with "Castilla y León"
df_combined_mod.loc[df_combined_mod['Province'] == 'zamora', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'zamora', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'segovia', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'segovia', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'valladolid', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'valladolid', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'leon', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'leon', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'salamanca', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'salamanca', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'burgos', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'burgos', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'soria', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'soria', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'palencia', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'palencia', 'Autonomía'].fillna('Castilla y León')


In [27]:
#for Province with value Madrid, replace null values in Autonomía with "Madrid (Comunidad de)"
df_combined_mod.loc[df_combined_mod['Province'] == 'madrid', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'madrid', 'Autonomía'].fillna('Madrid (Comunidad de)')


In [28]:
#for Province with value Alicante, replace null values in Autonomía with Comunidad Valenciana
df_combined_mod.loc[df_combined_mod['Province'] == 'alicante', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'alicante', 'Autonomía'].fillna('Comunidad Valenciana')
df_combined_mod.loc[df_combined_mod['Province'] == 'castellon', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'castellon', 'Autonomía'].fillna('Comunidad Valenciana')
df_combined_mod.loc[df_combined_mod['Province'] == 'valencia', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'valencia', 'Autonomía'].fillna('Comunidad Valenciana')


In [29]:
#count nulls in Autonomia column in df_combined_mod dataframe
df_combined_mod['Autonomía'].isnull().sum()

0

In [30]:
#droping df_mun from memory to free RAM
del df_mun

In [31]:
#change df_population POB32 to int32
change_dtypes(df_combined_mod)



Unnamed: 0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Autonomía,Province,Town
0,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,1993,PE,63.000000,1.778000,4.000,0,0,0,1,Comunidad Valenciana,valencia,betera
1,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,1995,PE,200.000000,34.959999,0.025,0,0,0,1,Cataluña,barcelona,sabadell
2,6,0,0.0,0.0,Thursday,2020,2020-12-31,23,0.0,4,0,1950,AO,50.799999,16.423000,4.000,0,0,0,1,Comunidad Valenciana,valencia,betera
3,6,0,0.0,0.0,Thursday,2020,2020-12-31,23,0.0,4,0,1950,AO,50.799999,11.443000,4.000,0,0,0,1,Comunidad Valenciana,valencia,betera
4,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,2005,PE,160.000000,10.377000,0.025,0,0,0,1,Cataluña,barcelona,sabadell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1436387,1,0,0.0,0.0,Wednesday,2010,2010-10-06,24,0.0,4,0,2000,PE,110.000000,0.694000,0.150,0,0,0,1,Cataluña,tarragona,amposta
1436388,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,2000,PE,110.000000,1.188000,0.150,0,0,0,1,Cataluña,tarragona,calafell
1436389,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,2008,PE,110.000000,0.802000,0.100,0,0,0,1,Cataluña,barcelona,sentmenat
1436390,1,0,0.0,0.0,Tuesday,2010,2010-10-05,24,0.0,4,0,2004,PE,200.000000,0.999000,0.150,0,0,0,1,Comunidad Valenciana,alicante,alicante/alacant


In [32]:
df_combined_mod.dtypes

Inspections                  int32
No_Incidents                 int32
Risk_S*I/Inspections       float32
leakage_estimate_factor    float32
InspectionDay               object
InspectionYear               int32
InspectionDate              object
MonthsLastRev                int32
Risk_S*I                   float32
Severity                     int32
Incidence                    int32
YearBuilt                    int32
Material                    object
Diameter                   float32
Length                     float32
Pressure                   float32
NumConnections               int32
NumConnectionsUnder          int32
BoolBridle                   int32
gas_natural                  int32
Autonomía                   object
Province                    object
Town                        object
dtype: object

In [33]:
#change df_combined_mod int64 columns to int8 to save memory
df_combined_mod['YearBuilt'] = df_combined_mod['YearBuilt'].astype('int8')
df_combined_mod['Severity'] = df_combined_mod['Severity'].astype('int8')
df_combined_mod['Incidence'] = df_combined_mod['Incidence'].astype('int8')
df_combined_mod['Inspections'] = df_combined_mod['Inspections'].astype('int8')



In [34]:
#count the number of Town per Autonomía and create a new column in df_combined_mod dataframe
df_combined_mod['TownCount'] = df_combined_mod.groupby('Town')['Town'].transform('count')
df_combined_mod.head()

Unnamed: 0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Autonomía,Province,Town,TownCount
0,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,-55,PE,63.0,1.778,4.0,0,0,0,1,Comunidad Valenciana,valencia,betera,2250
1,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,-53,PE,200.0,34.959999,0.025,0,0,0,1,Cataluña,barcelona,sabadell,15887
2,6,0,0.0,0.0,Thursday,2020,2020-12-31,23,0.0,4,0,-98,AO,50.799999,16.423,4.0,0,0,0,1,Comunidad Valenciana,valencia,betera,2250
3,6,0,0.0,0.0,Thursday,2020,2020-12-31,23,0.0,4,0,-98,AO,50.799999,11.443,4.0,0,0,0,1,Comunidad Valenciana,valencia,betera,2250
4,6,0,0.0,0.0,Thursday,2021,2020-12-31,23,0.0,4,0,-43,PE,160.0,10.377,0.025,0,0,0,1,Cataluña,barcelona,sabadell,15887


In [35]:
#hot encoding of categorical column Autonomía
df_combined_mod = pd.get_dummies(df_combined_mod, columns=['Autonomía'])
df_combined_mod.head(1)

Unnamed: 0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Province,Town,TownCount,Autonomía_Andalucía,Autonomía_Aragón,Autonomía_Balears (Illes),Autonomía_Castilla y León,Autonomía_Castilla-La Mancha,Autonomía_Cataluña,Autonomía_Comunidad Valenciana,Autonomía_Extremadura,Autonomía_Galicia,Autonomía_Madrid (Comunidad de),Autonomía_Navarra (Comunidad Foral de),Autonomía_Rioja (La)
0,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,-55,PE,63.0,1.778,4.0,0,0,0,1,valencia,betera,2250,0,0,0,0,0,0,1,0,0,0,0,0


In [36]:
#count nulls
df_combined_mod.isnull().sum()

Inspections                               0
No_Incidents                              0
Risk_S*I/Inspections                      0
leakage_estimate_factor                   0
InspectionDay                             0
InspectionYear                            0
InspectionDate                            0
MonthsLastRev                             0
Risk_S*I                                  0
Severity                                  0
Incidence                                 0
YearBuilt                                 0
Material                                  0
Diameter                                  0
Length                                    0
Pressure                                  0
NumConnections                            0
NumConnectionsUnder                       0
BoolBridle                                0
gas_natural                               0
Province                                  0
Town                                      0
TownCount                       

# Further feature engineering

We want to also see materials available for the pipes as a hot encoded feature

In [37]:
print (df_combined_mod['Material'].unique())

['PE' 'AO' 'FD' 'PN' 'CU' 'PA' 'ZD' 'FG' 'PV' 'FI' 'FO']


In [38]:
#Map df_combined Material column to the names of materials
df_combined_mod['Material'] = df_combined_mod['Material'].map({'PE': 'Polyethylene', 'AO': 'Acrylonitrile-Butadiene-Styrene', 'FD': 'Fiberglass-Reinforced Plastic', 
    'FG': 'Fiberglass', 'PN': 'Polypropylene', 'PA': 'Polyamide', 'FO': 'Flexible Polyolefin', 'FI': 'Flexible Polyvinyl Chloride', 'CU': 'Copper', 
    'PV': 'Polyvinylidene Fluoride', 'ZD': 'Zinc-Coated Steel', 'ZA': 'Zinc-Aluminum', 'CP': 'Cast Iron', 'CS': 'Cast Steel', 
    'ZC': 'Zinc-Coated Steel', 'ZM': 'Zinc-Magnesium','ZN': 'Zinc', 'AL': 'Aluminum', 'ZP': 'Zinc-Coated Steel', 'ZF': 'Zinc-Aluminum-Magnesium'})


In [39]:
df_combined_mod.tail(1)

Unnamed: 0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,YearBuilt,Material,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Province,Town,TownCount,Autonomía_Andalucía,Autonomía_Aragón,Autonomía_Balears (Illes),Autonomía_Castilla y León,Autonomía_Castilla-La Mancha,Autonomía_Cataluña,Autonomía_Comunidad Valenciana,Autonomía_Extremadura,Autonomía_Galicia,Autonomía_Madrid (Comunidad de),Autonomía_Navarra (Comunidad Foral de),Autonomía_Rioja (La)
1436391,1,0,0.0,0.0,Friday,2010,2010-10-01,21,0.0,4,0,-39,Polyethylene,90.0,1.101,0.15,0,0,0,1,tarragona,calafell,3687,0,0,0,0,0,1,0,0,0,0,0,0


In [40]:
#Hot enconde Material column in the df_combined_mod dataframe subset
df_combined_mod = pd.get_dummies(df_combined_mod, columns=['Material'], prefix = ['Material'])

In [43]:
#deleting columns Material_Fiberglass, Material_Zinc_coated_steel, Material_Polyvinylidene_fluoride
df_combined_mod = df_combined_mod.drop(['Material_Fiberglass', 'Material_Zinc-Coated Steel', 'Material_Polyvinylidene Fluoride','Material_Flexible Polyolefin', 'Material_Flexible Polyvinyl Chloride', 'Material_Polyamide'], axis=1)
df_combined_mod.head(1)

Unnamed: 0,Inspections,No_Incidents,Risk_S*I/Inspections,leakage_estimate_factor,InspectionDay,InspectionYear,InspectionDate,MonthsLastRev,Risk_S*I,Severity,Incidence,YearBuilt,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle,gas_natural,Province,Town,TownCount,Autonomía_Andalucía,Autonomía_Aragón,Autonomía_Balears (Illes),Autonomía_Castilla y León,Autonomía_Castilla-La Mancha,Autonomía_Cataluña,Autonomía_Comunidad Valenciana,Autonomía_Extremadura,Autonomía_Galicia,Autonomía_Madrid (Comunidad de),Autonomía_Navarra (Comunidad Foral de),Autonomía_Rioja (La),Material_Acrylonitrile-Butadiene-Styrene,Material_Copper,Material_Fiberglass-Reinforced Plastic,Material_Polyethylene,Material_Polypropylene
0,1,0,0.0,0.0,Thursday,2020,2020-12-31,24,0.0,4,0,-55,63.0,1.778,4.0,0,0,0,1,valencia,betera,2250,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [None]:
#delete outliers in year_built column to keep values only higher than 1960
df_combined_mod = df_combined_mod[df_combined_mod['YearBuilt'] > 1960]


In [None]:
df_combined_mod.dtypes

In [None]:
#eliminate values higher than 40 in MonthsLastRev column
df_combined_mod = df_combined_mod[df_combined_mod['MonthsLastRev'] < 40]

In [None]:
#graph histogram of MonthsLastRev column
df_combined_mod['MonthsLastRev'].hist()

In [None]:
#eliminate values higher than 400 in diameter column
df_combined_mod = df_combined_mod[df_combined_mod['Diameter'] < 400]

In [None]:
#show histogram of diameter column
df_combined_mod['Diameter'].hist()

In [None]:
#plot column Length
df_combined_mod['Length'].value_counts()

In [None]:
#plot histogram of Length column
df_combined_mod['Length'].hist()

# Feature Engineering

In [None]:
#create a new column that divides the diameter by the pressure and name it Relative Thickness
df_combined_mod['Relative_Thickness'] = df_combined_mod['Diameter'] / df_combined_mod['Pressure']

In [None]:
#Create a new column that takes the inspecion year and substracts the YearBuilt column naming it Age of Pipe
df_combined_mod['Age_of_Pipe_upon_inspection'] = df_combined_mod['InspectionYear'] - df_combined_mod['YearBuilt']
df_combined_mod.head(1)

In [None]:
#create a column that multiplies severity by incidence and name it severity_incidence
df_combined_mod['Severity_Incidence'] = df_combined_mod['Severity'] * df_combined_mod['Incidence']

In [None]:
#delete column severity
df_combined_mod = df_combined_mod.drop(['Severity'], axis=1)
df_combined_mod.head(1)


In [None]:
df_combined_mod.dtypes

In [None]:
#hot encode severity_incidence column in the df_combined_mod dataframe
df_combined_mod = pd.get_dummies(df_combined_mod, columns=['Severity_Incidence'], prefix = ['Severity_Incidence'])
df_combined_mod.head(1)

In [None]:
#change name of Severity_Incidence_0 column to Severity_0
df_combined_mod = df_combined_mod.rename(columns={'Severity_Incidence_0': 'Severity_0'})
#change name of Severity_Incidence_1 column to Severity_low
df_combined_mod = df_combined_mod.rename(columns={'Severity_Incidence_1': 'Severity_low'})
#change name of Severity_Incidence_2 column to Severity_medium
df_combined_mod = df_combined_mod.rename(columns={'Severity_Incidence_2': 'Severity_medium'})
#change name of Severity_Incidence_3 column to Severity_high
df_combined_mod = df_combined_mod.rename(columns={'Severity_Incidence_3': 'Severity_high'})
df_combined_mod.head(1)


In [None]:
df_combined_mod.shape

In [None]:
#General number of high severity per year
df_combined_mod.groupby('InspectionYear')['Severity_high'].value_counts()

In [None]:
df_combined_mod.dtypes

# Exporting the new dataset for a backup

In [None]:
#point to the folder where the data is stored
os.chdir(r"C:\Users\pedro\datathon")
#export the dataframe to a csv file
df_combined_mod.to_csv('df_combined_mod.csv', index=False)

# Plotting EDA for new dataset

In [None]:
#pandas profiling on the df_combined_mod dataframe
profile = ProfileReport(df_combined_mod, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile

# WOULD BE NICE TO KNOW

- what do we do with categoricals
    province and town, are they relevant?
- how toi interpret the length?
- an extra dataset would be great to create more features 
- should we create polinomial features

In [None]:
#create polinomial features 
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
poly_data = poly.fit_transform(df_combined_mod[['Relative_Thickness', 'Age_of_Pipe_upon_inspection']])
poly_columns = poly.get_feature_names(['Relative_Thickness', 'Age_of_Pipe_upon_inspection'])
df_poly = pd.DataFrame(poly_data, columns=poly_columns)
df_poly = df_poly.drop(['1'], axis=1)
df_combined_mod = pd.concat([df_combined_mod, df_poly], axis=1)
df_combined_mod.head(1)