# Predicting Building Damage

In [12]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#from category_encoders import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted

## Read datasets

The earthquake data is split across several tables, e.g. `building_structure`, `building_damage`, `mapping`, etc. There are many ways to look at this data and **frame** the machine-learning problem, but here we are interested in predicting building damage from the structure of the building. To prepare the building-damage predictive model, we need to combine two datasets: `building_damage` and `building_structure`.

In [6]:
# Load the building-structure table. low_memory=False asks pandas to parse the
# file in a single pass instead of in chunks, which avoids mixed-dtype
# inference on large CSVs.
structure = pd.read_csv(
    filepath_or_buffer='./nepal_earthquke_full_dataset/csv_building_structure.csv',
    low_memory=False,
)

In [7]:
# Load the building damage-assessment table. low_memory=False asks pandas to
# parse the file in a single pass instead of in chunks, which avoids
# mixed-dtype inference on large CSVs.
damage = pd.read_csv(
    filepath_or_buffer='./nepal_earthquke_full_dataset/csv_building_damage_assessment.csv',
    low_memory=False,
)

# Exploratory Data Analysis (EDA)

## Inspect the Data: Structure and Characteristics

In [8]:
# Preview the first five rows of the structure table.
structure.head(n=5)

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,count_floors_pre_eq,count_floors_post_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,height_ft_post_eq,...,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,condition_post_eq,damage_grade,technical_solution_proposed
0,120101000011,12,1207,120703,1,1,9,288,9,9,...,0,0,0,1,0,0,0,Damaged-Used in risk,Grade 3,Major repair
1,120101000021,12,1207,120703,1,1,15,364,9,9,...,0,0,0,1,0,0,0,Damaged-Repaired and used,Grade 5,Reconstruction
2,120101000031,12,1207,120703,1,1,20,384,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 2,Minor repair
3,120101000041,12,1207,120703,1,1,20,312,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 2,Minor repair
4,120101000051,12,1207,120703,1,1,30,308,9,9,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 1,Minor repair


In [9]:
# Summarise the structure table: row count, column names, non-null counts and dtypes.
structure.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762106 entries, 0 to 762105
Data columns (total 31 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             762106 non-null  int64 
 1   district_id                             762106 non-null  int64 
 2   vdcmun_id                               762106 non-null  int64 
 3   ward_id                                 762106 non-null  int64 
 4   count_floors_pre_eq                     762106 non-null  int64 
 5   count_floors_post_eq                    762106 non-null  int64 
 6   age_building                            762106 non-null  int64 
 7   plinth_area_sq_ft                       762106 non-null  int64 
 8   height_ft_pre_eq                        762106 non-null  int64 
 9   height_ft_post_eq                       762106 non-null  int64 
 10  land_surface_condition                  762106 non-null 

In [10]:
# Preview the first five rows of the damage-assessment table.
damage.head(n=5)

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,damage_overall_collapse,damage_overall_leaning,damage_overall_adjacent_building_risk,damage_foundation_severe,damage_foundation_moderate,damage_foundation_insignificant,...,has_damage_parapet,has_damage_cladding_glazing,has_geotechnical_risk,has_geotechnical_risk_land_settlement,has_geotechnical_risk_fault_crack,has_geotechnical_risk_liquefaction,has_geotechnical_risk_landslide,has_geotechnical_risk_rock_fall,has_geotechnical_risk_flood,has_geotechnical_risk_other
0,120101000011,12,1207,120703,Moderate-Heavy,Insignificant/light,,,Moderate-Heavy-(<1/3),Insignificant/light-(<1/3),...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,120101000021,12,1207,120703,Severe-Extreme,Severe-Extreme,Insignificant/light,Severe-Extreme-(>2/3),,,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,120101000031,12,1207,120703,Moderate-Heavy,Moderate-Heavy,Moderate-Heavy,,Moderate-Heavy-(>2/3),,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,120101000041,12,1207,120703,Moderate-Heavy,Moderate-Heavy,Moderate-Heavy,,Moderate-Heavy-(>2/3),,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,120101000051,12,1207,120703,Insignificant/light,,,,,Insignificant/light-(<1/3),...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [11]:
# Summarise the damage table: row count, column names, non-null counts and dtypes.
# The non-null counts show which damage_* columns contain missing values.
damage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762106 entries, 0 to 762105
Data columns (total 79 columns):
 #   Column                                                Non-Null Count   Dtype  
---  ------                                                --------------   -----  
 0   building_id                                           762106 non-null  int64  
 1   district_id                                           762106 non-null  int64  
 2   vdcmun_id                                             762106 non-null  int64  
 3   ward_id                                               762106 non-null  int64  
 4   damage_overall_collapse                               500743 non-null  object 
 5   damage_overall_leaning                                500742 non-null  object 
 6   damage_overall_adjacent_building_risk                 500742 non-null  object 
 7   damage_foundation_severe                              254544 non-null  object 
 8   damage_foundation_moderate                  

## Merge dataframes 

In [17]:
# Combine the damage assessment with the structural attributes, keeping only
# buildings that appear in both tables (inner join on the building identifier).
# NOTE(review): the two tables share column names besides the join key
# (e.g. district_id, damage_grade), so pandas suffixes the duplicates with
# _x (damage) / _y (structure) -- confirm which copy later cells should use.
df = damage.merge(structure, how='inner', on='building_id')
df.head(n=5)

Unnamed: 0,building_id,district_id_x,vdcmun_id_x,ward_id_x,damage_overall_collapse,damage_overall_leaning,damage_overall_adjacent_building_risk,damage_foundation_severe,damage_foundation_moderate,damage_foundation_insignificant,...,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,condition_post_eq,damage_grade_y,technical_solution_proposed_y
0,120101000011,12,1207,120703,Moderate-Heavy,Insignificant/light,,,Moderate-Heavy-(<1/3),Insignificant/light-(<1/3),...,0,0,0,1,0,0,0,Damaged-Used in risk,Grade 3,Major repair
1,120101000021,12,1207,120703,Severe-Extreme,Severe-Extreme,Insignificant/light,Severe-Extreme-(>2/3),,,...,0,0,0,1,0,0,0,Damaged-Repaired and used,Grade 5,Reconstruction
2,120101000031,12,1207,120703,Moderate-Heavy,Moderate-Heavy,Moderate-Heavy,,Moderate-Heavy-(>2/3),,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 2,Minor repair
3,120101000041,12,1207,120703,Moderate-Heavy,Moderate-Heavy,Moderate-Heavy,,Moderate-Heavy-(>2/3),,...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 2,Minor repair
4,120101000051,12,1207,120703,Insignificant/light,,,,,Insignificant/light-(<1/3),...,0,0,0,0,0,0,0,Damaged-Repaired and used,Grade 1,Minor repair


## Data Preparation and Cleaning

## Remove unnecessary and leaky features (columns)

In [16]:
# Collect every column recorded after the earthquake ("post_eq" in its name).
# These are gathered as leaky features: they describe the building's state
# after the event the model is meant to predict.
drop_cols = [
    column_name
    for column_name in df.columns
    if 'post_eq' in column_name
]
drop_cols

['count_floors_post_eq', 'height_ft_post_eq', 'condition_post_eq']

- remove null/missing values 


## Inspect and Correct data types

In [13]:
# List the dtype pandas inferred for each column of the damage table.
damage.dtypes

building_id                            int64
district_id                            int64
vdcmun_id                              int64
ward_id                                int64
damage_overall_collapse               object
                                       ...  
has_geotechnical_risk_liquefaction     int64
has_geotechnical_risk_landslide        int64
has_geotechnical_risk_rock_fall        int64
has_geotechnical_risk_flood            int64
has_geotechnical_risk_other            int64
Length: 79, dtype: object

In [14]:
# List the dtype pandas inferred for each column of the structure table.
structure.dtypes

building_id                                int64
district_id                                int64
vdcmun_id                                  int64
ward_id                                    int64
count_floors_pre_eq                        int64
count_floors_post_eq                       int64
age_building                               int64
plinth_area_sq_ft                          int64
height_ft_pre_eq                           int64
height_ft_post_eq                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_c