# Libraries used

Running Kernel3.9.13 base anaconda

pip install squarify
pip install yellowbrick
pip install plotly
pip install seaborn
pip install lazypredict
pip install pandas_profiling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns  
import matplotlib.pyplot as plt
import plotly.io as pio
import squarify #treemap
import os
import matplotlib
import warnings

#to enable the inline plotting
%matplotlib inline 

import warnings
warnings.filterwarnings('ignore')

sns.set_style("darkgrid")

In [2]:
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import classification_report

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler


from scipy.stats import normaltest

from pandas_profiling import ProfileReport

from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import ClassPredictionError
from yellowbrick.style.palettes import PALETTES, SEQUENCES, color_palette

import lazypredict

warnings.simplefilter(action='ignore', category=FutureWarning)

# Functions


In [3]:
# Function for EDA. Using the display() function to have  well-formatted tables. We are mainly using pandas to explore the datasets

def dataset_description(df_target):

    print('This is the Dataset shape: %s\n' % (df_target.shape, ))
    print('Dataset columns: %s\n' % df_target.columns)

    print('\nColumns description:\n')
    display(df_target.info())
    display(df_target.describe())  # describe the dataset

    print('\nNull values:\n')
    display(df_target.isnull().sum())  # Identify null values

#function performing a quick check on df_inspection with a function to have best of pandas functions separated by a line
def quick_check(dataframe):
    print('First 5 rows %s\n')
    print(dataframe.head(2))
    print("=====================================")
    print('Dataframe shape %s\n')
    print(dataframe.shape)
    print("=====================================")
    print('Dataframe describe categorical %s\n')
    print(dataframe.describe(include=['O']))
    print("=====================================")
    print('Dataframe null values %s\n')
    print(dataframe.isnull().sum())
    print("=====================================")
    print('Dataframe value counts %s\n')
    print(dataframe.value_counts())
    print("=====================================")

def stats(dataframe):
    print('Dataframe correlation %s\n')
    print(dataframe.corr())
    print("=====================================")
    print('Dataframe covariance %s\n')
    print(dataframe.cov())
    print("=====================================")
    print('Dataframe skew %s\n')
    print(dataframe.skew())
    print("=====================================")
    print('Dataframe kurtosis %s\n')
    print(dataframe.kurt())
    print("=====================================")

#create a function to normalize characters from a dataset's column in Spanish
def normalize_characters(df, column):
    df[column] = df[column].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
    df[column] = df[column].str.lower()
    df[column] = df[column].str.replace('á', 'a')
    df[column] = df[column].str.replace('é', 'e')
    df[column] = df[column].str.replace('í', 'i')
    df[column] = df[column].str.replace('ó', 'o')
    df[column] = df[column].str.replace('ú', 'u')
    df[column] = df[column].str.replace('ñ', 'n')
    df[column] = df[column].str.replace('ü', 'u')
    df[column] = df[column].str.replace('ç', 'c')
    df[column] = df[column].str.replace('(', '')
    df[column] = df[column].str.replace(')', '')
    df[column] = df[column].str.replace('\'', '')
    df[column] = df[column].str.replace('´', '')
    df[column] = df[column].str.replace('`', '')
    df[column] = df[column].str.replace('’', '')
    return df

In [4]:
#show all print outputs when using a function
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


# 2. Data Collection and Understanding 

## Network file

The network.csv file contains a static picture of the gas pipeline network. Every row corresponds to a pipe and has a unique PipeId identifier. The table has 1.446.529 pipes. 

The columns describe relevant features of each pipe. The complete list is: 

PipeId - unique identifier for the pipe 

Province - Spanish province where the pipe is located 

Town - Town or city where the pipe is located 

YearBuilt - Year in which the pipe was built and installed 

Material - Material in which the pipe is built 

GasType - Type of gas that runs through the pipe 

Diameter - diameter of the pipe 

Length - Length of the pipe 

Pressure - Pressure of the gas that runs through the pipe (bar) 

NumConnections - Number of connections (external). 

NumConnectionsUnder - Number of connections (internal and buried) 

BoolBridle  - Whether the pipe is bridled (True) or welded (False) 

## Inspection file

MaintenanceId - unique identifier for the inspection operation 

InspectionYear - year in which the inspection took place 

InspectionDate - date in which the inspection took place 

MonthsLastRev - number of months elapsed since the last previous inspection. 

Severity - Severity of the damage found (1: most severe, 3: least severe) 

Incidence - Boolean whether an incident was found on the revision (1) or not (0). 

# Merging datasets

In [5]:
#point to the folder where the data is stored
os.chdir(r"C:\Users\pedro\datathon")

# Loading inspection data
df_inspection = pd.read_csv('inspections.csv')

# Loading network data
df_network = pd.read_csv('network.csv')

In [6]:
quick_check(df_inspection)


First 5 rows %s

      PipeId MaintenanceId  InspectionYear InspectionDate  MonthsLastRev  \
0  189311802  ZRV-00001972            2010     2010-10-01             24   
1  188889914  ZRV-00001406            2010     2010-10-01             22   

   Severity  Incidence  
0         4          0  
1         4          0  
Dataframe shape %s

(6345344, 7)
Dataframe describe categorical %s

       MaintenanceId InspectionDate
count        6345344        6345344
unique          4179           3565
top     ZRV-00002121     2014-05-05
freq           21773           5832
Dataframe null values %s

PipeId            0
MaintenanceId     0
InspectionYear    0
InspectionDate    0
MonthsLastRev     0
Severity          0
Incidence         0
dtype: int64
Dataframe value counts %s

PipeId     MaintenanceId  InspectionYear  InspectionDate  MonthsLastRev  Severity  Incidence
489616     ZRV-00003385   2013            2013-05-08      16             4         0            1
256852211  ZRV-00002129   2016    

In [7]:
quick_check(df_network)

First 5 rows %s

   PipeId     Province                   Town  YearBuilt Material  \
0  489616  Ciudad Real  Torralba de Calatrava       2012       PE   
1  489645  Ciudad Real  Torralba de Calatrava       2012       PE   

       GasType  Diameter  Length  Pressure  NumConnections  \
0  Gas natural      63.0   7.792       0.4               0   
1  Gas natural      90.0   2.080       0.4               0   

   NumConnectionsUnder  BoolBridle  
0                    0       False  
1                    0       False  
Dataframe shape %s

(1446539, 12)
Dataframe describe categorical %s

         Province     Town Material      GasType
count     1446539  1446539  1446539      1446539
unique         38     1972       11            2
top     Barcelona   Madrid       PE  Gas natural
freq       389307    76666  1242845      1388397
Dataframe null values %s

PipeId                 0
Province               0
Town                   0
YearBuilt              0
Material               0
GasType     

In [8]:
stats(df_inspection)

Dataframe correlation %s

                  PipeId  InspectionYear  MonthsLastRev  Severity  Incidence
PipeId          1.000000       -0.123868       0.048799  0.017614  -0.021449
InspectionYear -0.123868        1.000000       0.117585  0.001188  -0.000359
MonthsLastRev   0.048799        0.117585       1.000000  0.011268  -0.013749
Severity        0.017614        0.001188       0.011268  1.000000  -0.904174
Incidence      -0.021449       -0.000359      -0.013749 -0.904174   1.000000
Dataframe covariance %s

                      PipeId  InspectionYear  MonthsLastRev       Severity  \
PipeId          1.294358e+16   -4.099991e+07   2.754554e+07  129283.543032   
InspectionYear -4.099991e+07    8.464272e+00   1.697315e+00       0.000223   
MonthsLastRev   2.754554e+07    1.697315e+00   2.461659e+01       0.003607   
Severity        1.292835e+05    2.230601e-04   3.606757e-03       0.004162   
Incidence      -1.074542e+05   -4.603098e-05  -3.003968e-03      -0.002569   

                  

In [9]:
stats(df_network)

Dataframe correlation %s

                       PipeId  YearBuilt  Diameter    Length  Pressure  \
PipeId               1.000000  -0.052882  0.133925 -0.013087  0.044928   
YearBuilt           -0.052882   1.000000 -0.155437  0.028727  0.047748   
Diameter             0.133925  -0.155437  1.000000  0.073956  0.194073   
Length              -0.013087   0.028727  0.073956  1.000000  0.105697   
Pressure             0.044928   0.047748  0.194073  0.105697  1.000000   
NumConnections      -0.045010  -0.028777 -0.112723  0.307837 -0.102855   
NumConnectionsUnder -0.010956   0.011157 -0.007065  0.017895 -0.002610   
BoolBridle          -0.091219  -0.176955  0.089100 -0.022166 -0.065808   

                     NumConnections  NumConnectionsUnder  BoolBridle  
PipeId                    -0.045010            -0.010956   -0.091219  
YearBuilt                 -0.028777             0.011157   -0.176955  
Diameter                  -0.112723            -0.007065    0.089100  
Length                 

In [10]:
#merge both dataframes
df_combined = pd.merge(df_inspection, df_network, on="PipeId")

In [11]:
df_combined.head(1)

Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,189311802,ZRV-00001972,2010,2010-10-01,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.0,0,0,False


In [12]:
#creating a subset for altering the dataset after initial EDA
df_combined_mod = df_combined.copy()

In [13]:
df_combined_mod.head(1)


Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,189311802,ZRV-00001972,2010,2010-10-01,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.0,0,0,False


In [14]:
df_combined_mod.columns

Index(['PipeId', 'MaintenanceId', 'InspectionYear', 'InspectionDate',
       'MonthsLastRev', 'Severity', 'Incidence', 'Province', 'Town',
       'YearBuilt', 'Material', 'GasType', 'Diameter', 'Length', 'Pressure',
       'NumConnections', 'NumConnectionsUnder', 'BoolBridle'],
      dtype='object')

# Data Wrangling on combined dataset

As a result of merging both datasets we now have pipeline duplicates per each maintenace_id operation. Before getrtng rid of the duplicates, we want to engineer some metrics interesting to the model such as number of operations, number of incidents and average risk based on severity*incidence

In [15]:
#show all columns in pandas
pd.set_option('display.max_columns', None)

In [16]:
#create a new column counting the number of inspections (MaintenanceId) per PipeId
df_combined_mod['Inspections'] = df_combined_mod.groupby('PipeId')['MaintenanceId'].transform('count')
#aggregate the number of Incidents per pipe in a new column and place it in the fourth position
df_combined_mod['No_Incidents'] = df_combined_mod.groupby('PipeId')['Incidence'].transform('sum')
#place the new columns in the third position
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('No_Incidents'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'No_Incidents', 'InspectionYear', 'InspectionDate',
       'MonthsLastRev', 'Severity', 'Incidence', 'Province', 'Town',
       'YearBuilt', 'Material', 'GasType', 'Diameter', 'Length', 'Pressure',
       'NumConnections', 'NumConnectionsUnder', 'BoolBridle']]
#show head of rows only where No_Incidents is greater than 0
df_combined_mod[df_combined_mod['No_Incidents'] > 2].head(10)


'No_Incidents'

Unnamed: 0,PipeId,MaintenanceId,Inspections,No_Incidents,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
140753,191522404,ZRV-00003686,5,3,2010,2010-10-19,19,4,0,Barcelona,Igualada,1988,AO,Gas natural,304.8,220.382828,0.025,0,0,True
140754,191522404,ZRV-00003686,5,3,2012,2012-11-02,25,1,1,Barcelona,Igualada,1988,AO,Gas natural,304.8,220.382828,0.025,0,0,True
140755,191522404,ZRV-00003686,5,3,2014,2014-10-01,23,4,0,Barcelona,Igualada,1988,AO,Gas natural,304.8,220.382828,0.025,0,0,True
140756,191522404,ZRV-00003686,5,3,2016,2016-11-11,25,2,1,Barcelona,Igualada,1988,AO,Gas natural,304.8,220.382828,0.025,0,0,True
140757,191522404,ZRV-00003686,5,3,2018,2018-11-05,24,2,1,Barcelona,Igualada,1988,AO,Gas natural,304.8,220.382828,0.025,0,0,True
272048,287737413,ZRV-00003836,6,3,2010,2010-11-05,18,4,0,Madrid,Madrid,1988,FD,Gas natural,200.0,107.866,0.025,8,0,True
272049,287737413,ZRV-00003836,6,3,2012,2012-09-17,22,3,1,Madrid,Madrid,1988,FD,Gas natural,200.0,107.866,0.025,8,0,True
272050,287737413,ZRV-00003836,6,3,2014,2014-06-10,21,4,0,Madrid,Madrid,1988,FD,Gas natural,200.0,107.866,0.025,8,0,True
272051,287737413,ZRV-00003836,6,3,2016,2016-03-29,21,4,0,Madrid,Madrid,1988,FD,Gas natural,200.0,107.866,0.025,8,0,True
272052,287737413,ZRV-00003836,6,3,2018,2018-02-02,23,2,1,Madrid,Madrid,1988,FD,Gas natural,200.0,107.866,0.025,8,0,True


In [17]:
df_combined_mod.columns

Index(['PipeId', 'MaintenanceId', 'Inspections', 'No_Incidents',
       'InspectionYear', 'InspectionDate', 'MonthsLastRev', 'Severity',
       'Incidence', 'Province', 'Town', 'YearBuilt', 'Material', 'GasType',
       'Diameter', 'Length', 'Pressure', 'NumConnections',
       'NumConnectionsUnder', 'BoolBridle'],
      dtype='object')

In [21]:
#create a column named average_severity that calculates the average severity per pipe 
df_combined_mod['average_severity'] = df_combined_mod.groupby('PipeId')['Severity'].transform('mean')
#show head of rows only where mean has a decimal value
df_combined_mod[df_combined_mod['average_severity'] % 1 != 0].head(10)
#place the new column in 7th position
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('average_severity'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'No_Incidents', 
    'InspectionYear', 'InspectionDate', 'average_severity', 'MonthsLastRev', 'Severity', 'Incidence', 'Province',
    'Town','YearBuilt', 'Material', 'GasType', 'Diameter', 'Length', 'Pressure', 'NumConnections',
    'NumConnectionsUnder', 'BoolBridle']]


Unnamed: 0,PipeId,MaintenanceId,Inspections,No_Incidents,Risk_S*I,InspectionYear,InspectionDate,average_severity,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
108,188889938,ZRV-00001406,6,1,3.833333,2010,2010-10-01,3.833333,22,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
109,188889938,ZRV-00001406,6,1,3.833333,2011,2011-11-17,3.833333,13,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
110,188889938,ZRV-00001406,6,1,3.833333,2013,2013-11-12,3.833333,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
111,188889938,ZRV-00001406,6,1,3.833333,2015,2015-11-06,3.833333,24,3,1,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
112,188889938,ZRV-00001406,6,1,3.833333,2017,2017-11-23,3.833333,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
113,188889938,ZRV-00001406,6,1,3.833333,2019,2019-11-21,3.833333,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
186,188909811,ZRV-00001406,6,1,3.5,2010,2010-10-01,3.5,22,4,0,Barcelona,Barcelona,1999,PE,Gas natural,200.0,34.436,0.025,4,0,False
187,188909811,ZRV-00001406,6,1,3.5,2011,2011-11-17,3.5,13,4,0,Barcelona,Barcelona,1999,PE,Gas natural,200.0,34.436,0.025,4,0,False
188,188909811,ZRV-00001406,6,1,3.5,2013,2013-11-11,3.5,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,200.0,34.436,0.025,4,0,False
189,188909811,ZRV-00001406,6,1,3.5,2015,2015-11-03,3.5,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,200.0,34.436,0.025,4,0,False


'average_severity'

In [22]:
#create a column taking average severity and number of total incidences per pipe multiplying them and naming it as risk_(s*i)
df_combined_mod['Risk_S*I'] = df_combined_mod['average_severity'] * df_combined_mod['No_Incidents'] 
#position the new column in the 7th position
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('Risk_S*I'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'No_Incidents', 'Risk_S*I',
    'InspectionYear', 'InspectionDate', 'average_severity', 'MonthsLastRev', 'Severity', 'Incidence', 'Province',
    'Town','YearBuilt', 'Material', 'GasType', 'Diameter', 'Length', 'Pressure', 'NumConnections',
    'NumConnectionsUnder', 'BoolBridle']]

'Risk_S*I'

In [23]:
#show head of rows only where Risk_S*I is greater than 0
df_combined_mod[df_combined_mod['Risk_S*I'] > 0].head(10)

Unnamed: 0,PipeId,MaintenanceId,Inspections,No_Incidents,Risk_S*I,InspectionYear,InspectionDate,average_severity,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
108,188889938,ZRV-00001406,6,1,3.833333,2010,2010-10-01,3.833333,22,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
109,188889938,ZRV-00001406,6,1,3.833333,2011,2011-11-17,3.833333,13,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
110,188889938,ZRV-00001406,6,1,3.833333,2013,2013-11-12,3.833333,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
111,188889938,ZRV-00001406,6,1,3.833333,2015,2015-11-06,3.833333,24,3,1,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
112,188889938,ZRV-00001406,6,1,3.833333,2017,2017-11-23,3.833333,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
113,188889938,ZRV-00001406,6,1,3.833333,2019,2019-11-21,3.833333,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,160.0,77.024,0.025,5,0,False
186,188909811,ZRV-00001406,6,1,3.5,2010,2010-10-01,3.5,22,4,0,Barcelona,Barcelona,1999,PE,Gas natural,200.0,34.436,0.025,4,0,False
187,188909811,ZRV-00001406,6,1,3.5,2011,2011-11-17,3.5,13,4,0,Barcelona,Barcelona,1999,PE,Gas natural,200.0,34.436,0.025,4,0,False
188,188909811,ZRV-00001406,6,1,3.5,2013,2013-11-11,3.5,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,200.0,34.436,0.025,4,0,False
189,188909811,ZRV-00001406,6,1,3.5,2015,2015-11-03,3.5,24,4,0,Barcelona,Barcelona,1999,PE,Gas natural,200.0,34.436,0.025,4,0,False


In [24]:
#create a new column taking Risk_S*I and dividing it by No_Inspections naming it as Risk_S*I/Inspections and placing it in 6th position
df_combined_mod['Risk_S*I/Inspections'] = df_combined_mod['Risk_S*I'] / df_combined_mod['Inspections']
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('Risk_S*I/Inspections'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'Risk_S*I/Inspections', 'No_Incidents', 
    'Risk_S*I', 'average_severity', 'MonthsLastRev', 'Severity', 'Incidence', 'Province',
    'Town','YearBuilt', 'InspectionYear', 'InspectionDate','Material', 'GasType', 'Diameter', 'Length', 'Pressure', 'NumConnections',
    'NumConnectionsUnder', 'BoolBridle']]

#show head of rows only where Risk_S*I/Inspections is greater than 0
df_combined_mod[df_combined_mod['Risk_S*I/Inspections'] > 0].head(5)

'Risk_S*I/Inspections'

Unnamed: 0,PipeId,MaintenanceId,Inspections,Risk_S*I/Inspections,No_Incidents,Risk_S*I,average_severity,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,InspectionYear,InspectionDate,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
108,188889938,ZRV-00001406,6,0.638889,1,3.833333,3.833333,22,4,0,Barcelona,Barcelona,1999,2010,2010-10-01,PE,Gas natural,160.0,77.024,0.025,5,0,False
109,188889938,ZRV-00001406,6,0.638889,1,3.833333,3.833333,13,4,0,Barcelona,Barcelona,1999,2011,2011-11-17,PE,Gas natural,160.0,77.024,0.025,5,0,False
110,188889938,ZRV-00001406,6,0.638889,1,3.833333,3.833333,24,4,0,Barcelona,Barcelona,1999,2013,2013-11-12,PE,Gas natural,160.0,77.024,0.025,5,0,False
111,188889938,ZRV-00001406,6,0.638889,1,3.833333,3.833333,24,3,1,Barcelona,Barcelona,1999,2015,2015-11-06,PE,Gas natural,160.0,77.024,0.025,5,0,False
112,188889938,ZRV-00001406,6,0.638889,1,3.833333,3.833333,24,4,0,Barcelona,Barcelona,1999,2017,2017-11-23,PE,Gas natural,160.0,77.024,0.025,5,0,False


In [25]:
#creating a new column named probability with No_Incidents divided by Inspections
df_combined_mod['Probability_incidence'] = df_combined_mod['No_Incidents'] / df_combined_mod['Inspections']
#place column in 3rd position
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('Probability_incidence'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'Probability_incidence', 'Risk_S*I/Inspections', 
    'No_Incidents', 'Risk_S*I', 'average_severity', 'MonthsLastRev', 'Severity', 'Incidence', 'Province',
    'Town','YearBuilt', 'InspectionYear', 'InspectionDate','Material', 'GasType', 'Diameter', 'Length', 'Pressure',
    'NumConnections', 'NumConnectionsUnder', 'BoolBridle']]
#show head of rows only where Probability is greater than 0
df_combined_mod[df_combined_mod['Probability_incidence'] > 0].head(5)

'Probability_incidence'

Unnamed: 0,PipeId,MaintenanceId,Inspections,Probability_incidence,Risk_S*I/Inspections,No_Incidents,Risk_S*I,average_severity,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,InspectionYear,InspectionDate,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
108,188889938,ZRV-00001406,6,0.166667,0.638889,1,3.833333,3.833333,22,4,0,Barcelona,Barcelona,1999,2010,2010-10-01,PE,Gas natural,160.0,77.024,0.025,5,0,False
109,188889938,ZRV-00001406,6,0.166667,0.638889,1,3.833333,3.833333,13,4,0,Barcelona,Barcelona,1999,2011,2011-11-17,PE,Gas natural,160.0,77.024,0.025,5,0,False
110,188889938,ZRV-00001406,6,0.166667,0.638889,1,3.833333,3.833333,24,4,0,Barcelona,Barcelona,1999,2013,2013-11-12,PE,Gas natural,160.0,77.024,0.025,5,0,False
111,188889938,ZRV-00001406,6,0.166667,0.638889,1,3.833333,3.833333,24,3,1,Barcelona,Barcelona,1999,2015,2015-11-06,PE,Gas natural,160.0,77.024,0.025,5,0,False
112,188889938,ZRV-00001406,6,0.166667,0.638889,1,3.833333,3.833333,24,4,0,Barcelona,Barcelona,1999,2017,2017-11-23,PE,Gas natural,160.0,77.024,0.025,5,0,False


In [27]:
#creating a column with the Average of MonthsLastRev grouping per PipeId and placing it in 7th position
df_combined_mod['Average_MonthsLastRev'] = df_combined_mod.groupby('PipeId')['MonthsLastRev'].transform('mean')
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('Average_MonthsLastRev'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'Probability_incidence', 'Risk_S*I/Inspections', 'Average_MonthsLastRev',
    'No_Incidents', 'Risk_S*I', 'average_severity', 'MonthsLastRev', 'Severity', 'Incidence', 'Province',
    'Town','YearBuilt', 'InspectionYear', 'InspectionDate','Material', 'GasType', 'Diameter', 'Length', 'Pressure',
    'NumConnections', 'NumConnectionsUnder', 'BoolBridle']]
df_combined_mod.head(1)

'Average_MonthsLastRev'

Unnamed: 0,PipeId,MaintenanceId,Inspections,Probability_incidence,Risk_S*I/Inspections,Average_MonthsLastRev,No_Incidents,Risk_S*I,average_severity,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,InspectionYear,InspectionDate,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,189311802,ZRV-00001972,6,0.0,0.0,24.0,0,0.0,4.0,24,4,0,Barcelona,Sentmenat,2001,2010,2010-10-01,PE,Gas natural,160.0,117.831,4.0,0,0,False


In [37]:
#creating a column named age_pipe_inspection with the difference between InspectionYear and YearBuilt and placing it in 10 th position
df_combined_mod['Age_pipe_inspection'] = df_combined_mod['InspectionYear'] - df_combined_mod['YearBuilt']
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('Age_pipe_inspection'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'Probability_incidence', 'Risk_S*I/Inspections', 'Average_MonthsLastRev', 
    'Age_pipe_inspection', 'No_Incidents', 'Risk_S*I', 'average_severity', 'MonthsLastRev', 'Severity', 'Incidence', 'Province',
    'Town','YearBuilt', 'InspectionYear', 'InspectionDate','Material', 'GasType', 'Diameter', 'Length', 'Pressure',
    'NumConnections', 'NumConnectionsUnder', 'BoolBridle']]
df_combined_mod.head(1)

'Age_pipe_inspection'

Unnamed: 0,PipeId,MaintenanceId,Inspections,Probability_incidence,Risk_S*I/Inspections,Average_MonthsLastRev,Age_pipe_inspection,No_Incidents,Risk_S*I,average_severity,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,InspectionYear,InspectionDate,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,189311802,ZRV-00001972,6,0.0,0.0,24.0,9,0,0.0,4.0,24,4,0,Barcelona,Sentmenat,2001,2010,2010-10-01,PE,Gas natural,160.0,117.831,4.0,0,0,False


In [43]:
#create a column named age_pipe_incidence with the aggregate of Age_pipe_inspection per PipeId divided by No_Incidents and in case No_Incidents is 0, substitute by 1
df_combined_mod['Age_pipe_incidence'] = df_combined_mod.groupby('PipeId')['Age_pipe_inspection'].transform('sum') / df_combined_mod['No_Incidents']
#place it in 10th position
cols = list(df_combined_mod_incidence.columns.values)
cols.pop(cols.index('Average_age_pipe_when_incidence'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'Probability_incidence', 'Risk_S*I/Inspections', 'Average_MonthsLastRev',
    'Average_age_pipe_when_incidence', 'Age_pipe_inspection', 'No_Incidents', 'Risk_S*I', 'average_severity', 'MonthsLastRev', 'Severity', 'Incidence', 'Province',
    'Town','YearBuilt', 'InspectionYear', 'InspectionDate','Material', 'GasType', 'Diameter', 'Length', 'Pressure',
    'NumConnections', 'NumConnectionsUnder', 'BoolBridle']]
#show head of rows where Average_age_pipe_when_incidence is 1 and No_Incidents is greater than 0 
df_combined_mod[(df_combined_mod['Average_age_pipe_when_incidence'] == 1) & (df_combined_mod['No_Incidents'] == 0)].head(5)


'Average_age_pipe_when_incidence'

Unnamed: 0,PipeId,MaintenanceId,Inspections,Probability_incidence,Risk_S*I/Inspections,Average_MonthsLastRev,Average_age_pipe_when_incidence,Age_pipe_inspection,No_Incidents,Risk_S*I,average_severity,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,InspectionYear,InspectionDate,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
730,355107556,ZRV-00001915,6,0.0,0.0,23.5,1.0,1,0,0.0,4.0,21,4,0,Barcelona,Montcada I Reixach,2009,2010,2010-10-01,PE,Gas natural,110.0,10.441,5.0,1,0,False
736,355107547,ZRV-00001915,6,0.0,0.0,23.5,1.0,1,0,0.0,4.0,21,4,0,Barcelona,Montcada I Reixach,2009,2010,2010-10-01,PE,Gas natural,110.0,1.422,5.0,0,0,False
742,355107529,ZRV-00001915,6,0.0,0.0,23.5,1.0,1,0,0.0,4.0,21,4,0,Barcelona,Montcada I Reixach,2009,2010,2010-10-01,PE,Gas natural,110.0,7.761,5.0,0,0,False
748,355107511,ZRV-00001915,4,0.0,0.0,35.25,1.0,1,0,0.0,4.0,21,4,0,Barcelona,Montcada I Reixach,2009,2010,2010-10-01,PE,Gas natural,90.0,0.558,5.0,0,0,False
752,355107391,ZRV-00001915,4,0.0,0.0,35.25,1.0,1,0,0.0,4.0,21,4,0,Barcelona,Montcada I Reixach,2009,2010,2010-10-01,PE,Gas natural,63.0,0.388,5.0,0,0,False


In [None]:
#place it in 10th position
cols = list(df_combined_mod_incidence.columns.values)
cols.pop(cols.index('Average_age_pipe_when_incidence'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'Probability_incidence', 'Risk_S*I/Inspections', 'Average_MonthsLastRev',
    'Average_age_pipe_when_incidence', 'Age_pipe_inspection', 'No_Incidents', 'Risk_S*I', 'average_severity', 'MonthsLastRev', 'Severity', 'Incidence', 'Province',
    'Town','YearBuilt', 'InspectionYear', 'InspectionDate','Material', 'GasType', 'Diameter', 'Length', 'Pressure',
    'NumConnections', 'NumConnectionsUnder', 'BoolBridle']]
#show head of rows where Average_age_pipe_when_incidence is 1 and No_Incidents is greater than 0 and Age_pipe_inspection is different than No_Incidents
df_combined_mod[(df_combined_mod['Average_age_pipe_when_incidence'] == 1) & (df_combined_mod['No_Incidents'] > 0) & (df_combined_mod['Age_pipe_inspection'] != df_combined_mod['No_Incidents'])].head(5)

In [None]:
#transform Inspection Date to datetime format
df_combined_mod['InspectionDate'] = pd.to_datetime(df_combined_mod['InspectionDate'])


# Calculating aprox gas leakage for case study
## Revisit whenever team has come down with a factor!

### JUAN: Risk Matrix
    Alto riesgo alta probabilidad Bajo riesgo baja probabilidad

Before narrowing down the dataset, we want to take an ssumption in the case there has been an incidence, to take half of the months since last inspection and multiply by a factor to estimate losses and hence potential savings

In [None]:
#create a new column named aprox_gas leakage taking PipeId with Incidence = 1 and multyplyig a factor of to MonthsLastRev/2
df_combined_mod['leakage_estimate_factor'] = df_combined_mod['Risk_S*I/Inspections'] * (df_combined_mod['MonthsLastRev']/2)
#position the new column in the 6th position
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('leakage_estimate_factor'))
df_combined_mod = df_combined_mod[['PipeId', 'MaintenanceId', 'Inspections', 'No_Incidents', 'Risk_S*I/Inspections','leakage_estimate_factor','InspectionYear', 'InspectionDate', 'MonthsLastRev', 'Risk_S*I',
    'Severity','Incidence', 'Province', 'Town', 'YearBuilt', 'Material', 'GasType','Diameter', 'Length', 'Pressure', 'NumConnections', 'NumConnectionsUnder', 'BoolBridle']]
df_combined_mod.head(10)

In [None]:
df_combined.shape

# Narrowing combined dataset

Now that we have extracted the interesting information from combining the dataset, we will further transform it to only include 1 pipeID and proceed with further data wrangling. We eliminate the maintenaceId for now

In [None]:
#keep only one PipeId per row
df_combined_mod = df_combined_mod.drop_duplicates(subset=['PipeId'], keep='first')
df_combined_mod.head()

In [None]:
df_combined_mod.shape

In [None]:
#removing MaintenanceId column
df_combined_mod = df_combined_mod.drop(['MaintenanceId'], axis=1)

# Data Wrangling
Start of cleaning and organizing a dataset

In [None]:
df_combined['BoolBridle'].describe()

In [None]:
# Converting Boolbride into  boolean variable
def boolbridle(x):
    return 1 if x == 'True' else 0

In [None]:
# Apply function on dataset
df_combined['BoolBridle'] = df_combined['BoolBridle'].apply(lambda x: boolbridle(x))

In [None]:
#using custom made function to describe the dataset
dataset_description(df_combined_mod)

In [None]:
#showing head
df_combined_mod.head(1)

In [None]:
#Extract the day of the week from InspectionDate and place it in 5th position
df_combined_mod['InspectionDay'] = df_combined_mod['InspectionDate'].dt.day_name()
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('InspectionDay'))
df_combined_mod = df_combined_mod[['PipeId', 'Inspections', 'No_Incidents', 'Risk_S*I/Inspections','leakage_estimate_factor','InspectionDay',
    'InspectionYear', 'InspectionDate', 'MonthsLastRev', 'Risk_S*I','Severity','Incidence', 'Province', 'Town', 'YearBuilt', 'Material', 'GasType',
    'Diameter', 'Length', 'Pressure', 'NumConnections', 'NumConnectionsUnder', 'BoolBridle']]
df_combined_mod.head(1)


In [None]:
#display possible values for week_day column and how many times they appear
df_combined_mod['InspectionDay'].value_counts()

In [None]:
#hot encode GasType column in the df_combined_mod dataframe subset
df_combined_mod = pd.get_dummies(df_combined_mod, columns=['GasType'], prefix = ['GasType'])

In [None]:
#delete GasType_Gas propano column
df_combined_mod = df_combined_mod.drop(['GasType_Gas propano'], axis=1)

In [None]:
#change name of GasType_Gas natural column to gas_natural
df_combined_mod = df_combined_mod.rename(columns={'GasType_Gas natural': 'gas_natural'})
df_combined_mod.head(1)

# Adding a new dataset
We want to extract value of two categorical variables, Town and Province but the way we have them now they are useless.

We will add a new dataset to join and extract the surface of each town as well as the comunidad autonoma to group by accordingly in another column the number of towns and afterwards hot encode


In [None]:
#count towns populating Town column
df_combined_mod['Town'].value_counts()
df_combined_mod['Town'].unique()

In [None]:
#point to the folder where the data is stored
os.chdir(r"C:\Users\pedro\datathon\base\complementary_datasets")

# Loading combined_mod dataset
df_mun = pd.read_excel('list-mun-2012.xls' , sheet_name='list-mun')

df_population= pd.read_excel('pobmun20.xls')


In [None]:
df_mun.head(1)

In [None]:
df_population.head(1)

In [None]:
normalize_characters(df_population, 'PROVINCIA')

In [None]:
normalize_characters(df_mun, 'Municipio')

In [None]:
normalize_characters(df_combined_mod, 'Town')

In [None]:
normalize_characters(df_combined_mod, 'Province')


In [None]:
#join df_combined_mod and df_mun on Town column and Municipio column
df_combined_mod = pd.merge(df_combined_mod, df_mun, how='left', left_on='Town', right_on='Municipio')
df_combined_mod.head(2)

In [None]:
#place Province and Town Columns at the end of the dataframe
cols = list(df_combined_mod.columns.values)
cols.pop(cols.index('Province'))
cols.pop(cols.index('Town'))
df_combined_mod = df_combined_mod[cols+['Province','Town']]
df_combined_mod.tail(2)

In [None]:
df_mun.columns

In [None]:
#deleting 'CP', 'Provincia', 'CPJ', 'Partido_Judicial', 'Capitalidad', 'Año', 'CA', 'codine', 'Municipio' from df_combined_mod dataframe
df_combined_mod = df_combined_mod.drop(['CP', 'Provincia', 'CPJ', 'Partido_Judicial', 'Capitalidad', 'Año', 'CA', 'codine', 'Municipio'], axis=1)
df_combined_mod.head(2)


In [None]:
#count nulls in each column for df_combined_mod dataframe
df_combined_mod.isnull().sum()


In [None]:
#delete column "Superficie" from df_combined_mod dataframe
df_combined_mod = df_combined_mod.drop(['Superficie'], axis=1)

In [None]:
#make a function for replacing the null values in df_combined_mod dataframe with a string of my choice
def replace_nulls(df, column, string):
    df[column] = df[column].fillna(string)
    return df

In [None]:
#for Province with value tarragona, replace null values in Autonomia with Cataluña

df_combined_mod.loc[df_combined_mod['Province'] == 'barcelona', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'barcelona', 'Autonomía'].fillna('Cataluña')
df_combined_mod.loc[df_combined_mod['Province'] == 'tarragona', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'tarragona', 'Autonomía'].fillna('Cataluña')
df_combined_mod.loc[df_combined_mod['Province'] == 'lleida', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'lleida', 'Autonomía'].fillna('Cataluña')
df_combined_mod.loc[df_combined_mod['Province'] == 'girona', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'girona', 'Autonomía'].fillna('Cataluña')



In [None]:
#count nulls in Autonomia column in df_combined_mod dataframe
df_combined_mod['Autonomía'].isnull().sum()

In [None]:
#for Province with value "La Rioja", replace null values in Autonomía with "Rioja (La)"
df_combined_mod.loc[df_combined_mod['Province'] == 'la rioja', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'la rioja', 'Autonomía'].fillna('Rioja (La)')


In [None]:
#for Province with value Orense, replace null values in Autonomía with "Galicia"
df_combined_mod.loc[df_combined_mod['Province'] == 'orense', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'orense', 'Autonomía'].fillna('Galicia')
df_combined_mod.loc[df_combined_mod['Province'] == 'la coruna', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'la coruna', 'Autonomía'].fillna('Galicia')
df_combined_mod.loc[df_combined_mod['Province'] == 'pontevedra', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'pontevedra', 'Autonomía'].fillna('Galicia')
df_combined_mod.loc[df_combined_mod['Province'] == 'lugo', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'lugo', 'Autonomía'].fillna('Galicia')


In [None]:
#for Province with value Zaragoza, replace null values in Autonomía with "Aragón"
df_combined_mod.loc[df_combined_mod['Province'] == 'zaragoza', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'zaragoza', 'Autonomía'].fillna('Aragón')
df_combined_mod.loc[df_combined_mod['Province'] == 'huesca', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'huesca', 'Autonomía'].fillna('Aragón')

In [None]:
#for Province with value Toledo, replace null values in Autonomía with "Castilla-La Mancha"
df_combined_mod.loc[df_combined_mod['Province'] == 'toledo', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'toledo', 'Autonomía'].fillna('Castilla-La Mancha')
df_combined_mod.loc[df_combined_mod['Province'] == 'albacete', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'albacete', 'Autonomía'].fillna('Castilla-La Mancha')
df_combined_mod.loc[df_combined_mod['Province'] == 'ciudad real', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'ciudad real', 'Autonomía'].fillna('Castilla-La Mancha')
df_combined_mod.loc[df_combined_mod['Province'] == 'cuenca', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'cuenca', 'Autonomía'].fillna('Castilla-La Mancha')


In [None]:
#for Province with value Almeria, replace null values in Autonomía with "Andalucía"
df_combined_mod.loc[df_combined_mod['Province'] == 'almeria', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'almeria', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'sevilla', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'sevilla', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'jaen', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'jaen', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'cordoba', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'cordoba', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'cadiz', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'cadiz', 'Autonomía'].fillna('Andalucía')
df_combined_mod.loc[df_combined_mod['Province'] == 'granada', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'granada', 'Autonomía'].fillna('Andalucía')



In [None]:
#for Province with value Zamora, replace null values in Autonomía with "Castilla y León"
df_combined_mod.loc[df_combined_mod['Province'] == 'zamora', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'zamora', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'segovia', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'segovia', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'valladolid', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'valladolid', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'leon', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'leon', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'salamanca', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'salamanca', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'burgos', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'burgos', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'soria', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'soria', 'Autonomía'].fillna('Castilla y León')
df_combined_mod.loc[df_combined_mod['Province'] == 'palencia', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'palencia', 'Autonomía'].fillna('Castilla y León')


In [None]:
#for Province with value Madrid, replace null values in Autonomía with "Madrid (Comunidad de)"
df_combined_mod.loc[df_combined_mod['Province'] == 'madrid', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'madrid', 'Autonomía'].fillna('Madrid (Comunidad de)')


In [None]:
#for Province with value Alicante, replace null values in Autonomía with Comunidad Valenciana
df_combined_mod.loc[df_combined_mod['Province'] == 'alicante', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'alicante', 'Autonomía'].fillna('Comunidad Valenciana')
df_combined_mod.loc[df_combined_mod['Province'] == 'castellon', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'castellon', 'Autonomía'].fillna('Comunidad Valenciana')
df_combined_mod.loc[df_combined_mod['Province'] == 'valencia', 'Autonomía'] = df_combined_mod.loc[df_combined_mod['Province'] == 'valencia', 'Autonomía'].fillna('Comunidad Valenciana')


In [None]:
#count nulls in Autonomia column in df_combined_mod dataframe
df_combined_mod['Autonomía'].isnull().sum()

In [None]:
#droping df_mun from memory to free RAM
del df_mun
del df_inspection
del df_network

In [None]:
#change df_population POB32 to int32
df_population['POB20'] = df_population['POB20'].astype('int32')
df_population.dtypes

In [None]:
df_combined_mod.dtypes

In [None]:
#create function to change detypes in64 to int32 in a df
def change_dtypes(df):
    for col in df.columns:
        if df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')
        elif df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
    return df

In [None]:
change_dtypes(df_combined_mod)

In [None]:
#change df_combined_mod int64 columns to int32 to save memory
df_combined_mod['PipeId'] = df_combined_mod['PipeId'].astype('int32')
df_combined_mod['YearBuilt'] = df_combined_mod['YearBuilt'].astype('int8')
df_combined_mod['NumConnections'] = df_combined_mod['NumConnections'].astype('int32')
df_combined_mod['InspectionYear'] = df_combined_mod['InspectionYear'].astype('int32')
df_combined_mod['Severity'] = df_combined_mod['Severity'].astype('int8')
df_combined_mod['Incidence'] = df_combined_mod['Incidence'].astype('int8')
df_combined_mod['Inspections'] = df_combined_mod['Inspections'].astype('int8')



In [None]:
#delete columns from df_population dataframe
df_population.drop(['NOMBRE', 'HOMBRES', 'MUJERES'], axis=1, inplace=True)

In [None]:
#joining df_combined_mod and df_population on Province columns adding only POB20 column from df_population
df_combined_mod = pd.merge(df_combined_mod, df_population[['PROVINCIA','POB20']], how='left', left_on='Province', right_on='PROVINCIA')

In [None]:
df_combined_mod.columns

In [None]:
#eda pandas profiling

# Further feature engineering

We want to also see materials available for the pipes as a hot encoded feature

In [None]:
print (df_combined['Material'].unique())

In [None]:
#Map df_combined Material column to the names of materials
df_combined_mod['Material'] = df_combined_mod['Material'].map({'PE': 'Polyethylene', 'AO': 'Acrylonitrile-Butadiene-Styrene', 'FD': 'Fiberglass-Reinforced Plastic', 
    'FG': 'Fiberglass', 'PN': 'Polypropylene', 'PA': 'Polyamide', 'FO': 'Flexible Polyolefin', 'FI': 'Flexible Polyvinyl Chloride', 'CU': 'Copper', 
    'PV': 'Polyvinylidene Fluoride', 'ZD': 'Zinc-Coated Steel', 'ZA': 'Zinc-Aluminum', 'CP': 'Cast Iron', 'CS': 'Cast Steel', 
    'ZC': 'Zinc-Coated Steel', 'ZM': 'Zinc-Magnesium','ZN': 'Zinc', 'AL': 'Aluminum', 'ZP': 'Zinc-Coated Steel', 'ZF': 'Zinc-Aluminum-Magnesium'})


In [None]:
df_combined_mod.tail(1)

In [None]:
#Hot enconde Material column in the df_combined_mod dataframe subset
df_combined_mod = pd.get_dummies(df_combined_mod, columns=['Material'], prefix = ['Material'])

In [None]:
#deleting columns Material_Fiberglass, Material_Zinc_coated_steel, Material_Polyvinylidene_fluoride
df_combined_mod = df_combined_mod.drop(['Material_Fiberglass', 'Material_Zinc-Coated Steel', 'Material_Polyvinylidene Fluoride'], axis=1)

In [None]:
#deleting columns Material_Flexible Polyolefin,Material_Flexible Polyvinyl Chloride, Material_Polyamide
df_combined_mod = df_combined_mod.drop(['Material_Flexible Polyolefin', 'Material_Flexible Polyvinyl Chloride', 'Material_Polyamide'], axis=1)
df_combined_mod.head(1)

In [None]:
#delete outliers in year_built column to keep values only higher than 1960
df_combined_mod = df_combined_mod[df_combined_mod['YearBuilt'] > 1960]


In [None]:
df_combined_mod.dtypes

In [None]:
#eliminate values higher than 40 in MonthsLastRev column
df_combined_mod = df_combined_mod[df_combined_mod['MonthsLastRev'] < 40]

In [None]:
#graph histogram of MonthsLastRev column
df_combined_mod['MonthsLastRev'].hist()

In [None]:
#eliminate values higher than 400 in diameter column
df_combined_mod = df_combined_mod[df_combined_mod['Diameter'] < 400]

In [None]:
#show histogram of diameter column
df_combined_mod['Diameter'].hist()

In [None]:
#plot column Length
df_combined_mod['Length'].value_counts()

In [None]:
#plot histogram of Length column
df_combined_mod['Length'].hist()

# Feature Engineering

In [None]:
#create a new column that divides the diameter by the pressure and name it Relative Thickness
df_combined_mod['Relative_Thickness'] = df_combined_mod['Diameter'] / df_combined_mod['Pressure']

In [None]:
#Create a new column that takes the inspecion year and substracts the YearBuilt column naming it Age of Pipe
df_combined_mod['Age_of_Pipe_upon_inspection'] = df_combined_mod['InspectionYear'] - df_combined_mod['YearBuilt']
df_combined_mod.head(1)

In [None]:
#create a column that multiplies severity by incidence and name it severity_incidence
df_combined_mod['Severity_Incidence'] = df_combined_mod['Severity'] * df_combined_mod['Incidence']

In [None]:
#delete column severity
df_combined_mod = df_combined_mod.drop(['Severity'], axis=1)
df_combined_mod.head(1)


In [None]:
df_combined_mod.dtypes

In [None]:
#hot encode severity_incidence column in the df_combined_mod dataframe
df_combined_mod = pd.get_dummies(df_combined_mod, columns=['Severity_Incidence'], prefix = ['Severity_Incidence'])
df_combined_mod.head(1)

In [None]:
#change name of Severity_Incidence_0 column to Severity_0
df_combined_mod = df_combined_mod.rename(columns={'Severity_Incidence_0': 'Severity_0'})
#change name of Severity_Incidence_1 column to Severity_low
df_combined_mod = df_combined_mod.rename(columns={'Severity_Incidence_1': 'Severity_low'})
#change name of Severity_Incidence_2 column to Severity_medium
df_combined_mod = df_combined_mod.rename(columns={'Severity_Incidence_2': 'Severity_medium'})
#change name of Severity_Incidence_3 column to Severity_high
df_combined_mod = df_combined_mod.rename(columns={'Severity_Incidence_3': 'Severity_high'})
df_combined_mod.head(1)


In [None]:
df_combined_mod.shape

In [None]:
#General number of high severity per year
df_combined_mod.groupby('InspectionYear')['Severity_high'].value_counts()

In [None]:
df_combined_mod.dtypes

# Exporting the new dataset for a backup

In [None]:
#point to the folder where the data is stored
os.chdir(r"C:\Users\pedro\datathon")
#export the dataframe to a csv file
df_combined_mod.to_csv('df_combined_mod.csv', index=False)

# Plotting EDA for new dataset

In [None]:
#pandas profiling on the df_combined_mod dataframe
profile = ProfileReport(df_combined_mod, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile

# WOULD BE NICE TO KNOW

- what do we do with categoricals
    province and town, are they relevant?
- how toi interpret the length?
- an extra dataset would be great to create more features 
- should we create polinomial features

In [None]:
#create polinomial features 
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
poly_data = poly.fit_transform(df_combined_mod[['Relative_Thickness', 'Age_of_Pipe_upon_inspection']])
poly_columns = poly.get_feature_names(['Relative_Thickness', 'Age_of_Pipe_upon_inspection'])
df_poly = pd.DataFrame(poly_data, columns=poly_columns)
df_poly = df_poly.drop(['1'], axis=1)
df_combined_mod = pd.concat([df_combined_mod, df_poly], axis=1)
df_combined_mod.head(1)