In [1]:
import os
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, KFold
from lightgbm import LGBMClassifier

import gc

## Import files

In [2]:
buil_owner = pd.read_csv('/home/vivek/Datasets/Building_Damage_Classification/Building_Ownership_Use.csv')
buil_stru = pd.read_csv('/home/vivek/Datasets/Building_Damage_Classification/Building_Structure.csv')
train = pd.read_csv('/home/vivek/Datasets/Building_Damage_Classification/train.csv')
test = pd.read_csv('/home/vivek/Datasets/Building_Damage_Classification/test.csv')

In [3]:
# First we will work on "Building Ownership" file
buil_owner.head()

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,a3380c4f75,7,701,70102,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0
1,a3380c4fd9,7,701,70102,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0
2,a3380c503e,7,701,70102,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0
3,a338a4e5f2,7,701,70103,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0
4,a338a4e653,7,701,70103,Private,1.0,0.0,0,0,0,0,0,0,0,0,0,0


In [4]:
buil_owner.shape

(1052948, 17)

First check which of the features are `object` which we can encode it via `Label Encoding` or `OneHotEncoding`.

In [5]:
buil_owner.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052948 entries, 0 to 1052947
Data columns (total 17 columns):
building_id                      1052948 non-null object
district_id                      1052948 non-null int64
vdcmun_id                        1052948 non-null int64
ward_id                          1052948 non-null int64
legal_ownership_status           1052948 non-null object
count_families                   1052946 non-null float64
has_secondary_use                1052938 non-null float64
has_secondary_use_agriculture    1052948 non-null int64
has_secondary_use_hotel          1052948 non-null int64
has_secondary_use_rental         1052948 non-null int64
has_secondary_use_institution    1052948 non-null int64
has_secondary_use_school         1052948 non-null int64
has_secondary_use_industry       1052948 non-null int64
has_secondary_use_health_post    1052948 non-null int64
has_secondary_use_gov_office     1052948 non-null int64
has_secondary_use_use_police     1052948 

As we can see that we only have one `object` file which is **legal_ownership_status**. We will use pandas's `get_dummies` method to encode that features.

First let's check how many classes we have in `legal_ownership_status` feature.

In [6]:
buil_owner['legal_ownership_status'].value_counts()

Private          1014042
Public             22326
Institutional      10502
Other               6078
Name: legal_ownership_status, dtype: int64

In [7]:
buil_owner = pd.concat([buil_owner, pd.get_dummies(buil_owner.legal_ownership_status, prefix='owner')], axis=1).drop('legal_ownership_status', axis=1)

Reading **building_structural**.

In [8]:
buil_stru.head()

Unnamed: 0,building_id,district_id,vdcmun_id,ward_id,count_floors_pre_eq,count_floors_post_eq,age_building,plinth_area_sq_ft,height_ft_pre_eq,height_ft_post_eq,...,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,condition_post_eq
0,a3380c4f75,7,701,70102,1,1,28,454,9,9,...,0,0,0,0,1,1,0,0,1,Damaged-Repaired and used
1,a3380c4fd9,7,701,70102,1,1,32,324,9,9,...,0,0,0,0,1,1,0,0,1,Damaged-Repaired and used
2,a3380c503e,7,701,70102,2,2,34,456,18,18,...,0,0,0,0,1,1,0,0,1,Damaged-Repaired and used
3,a338a4e5f2,7,701,70103,2,2,20,452,18,18,...,0,0,0,0,1,1,0,0,1,Damaged-Repaired and used
4,a338a4e653,7,701,70103,1,0,25,542,9,0,...,0,0,0,0,1,1,0,0,1,Damaged-Rubble unclear


In [9]:
buil_stru.shape

(1052948, 29)

In [10]:
buil_stru.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052948 entries, 0 to 1052947
Data columns (total 29 columns):
building_id                               1052948 non-null object
district_id                               1052948 non-null int64
vdcmun_id                                 1052948 non-null int64
ward_id                                   1052948 non-null int64
count_floors_pre_eq                       1052948 non-null int64
count_floors_post_eq                      1052948 non-null int64
age_building                              1052948 non-null int64
plinth_area_sq_ft                         1052948 non-null int64
height_ft_pre_eq                          1052948 non-null int64
height_ft_post_eq                         1052948 non-null int64
land_surface_condition                    1052948 non-null object
foundation_type                           1052948 non-null object
roof_type                                 1052948 non-null object
ground_floor_type                     

Use dummies to convert all the `object` features.

In [11]:
# For "land_surface_condition" take a look at how many classes we have
buil_stru.land_surface_condition.value_counts()

Flat              874202
Moderate slope    144748
Steep slope        33998
Name: land_surface_condition, dtype: int64

In [12]:
buil_stru.foundation_type.value_counts()

Mud mortar-Stone/Brick    886331
Bamboo/Timber              60598
Cement-Stone/Brick         57354
RC                         42808
Other                       5857
Name: foundation_type, dtype: int64

In [13]:
buil_stru.roof_type.value_counts()

Bamboo/Timber-Light roof    739184
Bamboo/Timber-Heavy roof    248235
RCC/RB/RBC                   65529
Name: roof_type, dtype: int64

In [14]:
buil_stru.ground_floor_type.value_counts()

Mud            847250
Brick/Stone    100487
RC              99046
Timber           4072
Other            2093
Name: ground_floor_type, dtype: int64

In [15]:
buil_stru.other_floor_type.value_counts()

TImber/Bamboo-Mud    666182
Timber-Planck        177571
Not applicable       160301
RCC/RB/RBC            48894
Name: other_floor_type, dtype: int64

In [16]:
buil_stru.position.value_counts()

Not attached       816364
Attached-1 side    173459
Attached-2 side     53522
Attached-3 side      9602
Name: position, dtype: int64

In [17]:
buil_stru.plan_configuration.value_counts()

Rectangular                        1009987
Square                               23106
L-shape                              15088
Multi-projected                       1412
T-shape                               1302
Others                                 994
U-shape                                591
Building with Central Courtyard        203
E-shape                                167
H-shape                                 97
Name: plan_configuration, dtype: int64

In [18]:
buil_stru.condition_post_eq.value_counts()

Damaged-Not used                           249464
Damaged-Repaired and used                  211119
Damaged-Used in risk                       187480
Damaged-Rubble unclear                     145732
Damaged-Rubble clear                       132508
Not damaged                                 71587
Damaged-Rubble Clear-New building built     54587
Covered by landslide                          471
Name: condition_post_eq, dtype: int64

In [19]:
buil_land_dum = pd.get_dummies(buil_stru.land_surface_condition, prefix='land_')
buil_foun = pd.get_dummies(buil_stru.foundation_type, prefix='foun')
buil_roof = pd.get_dummies(buil_stru.roof_type, prefix='roof')
buil_grnd = pd.get_dummies(buil_stru.ground_floor_type, prefix='grnd')
buil_oth = pd.get_dummies(buil_stru.other_floor_type, prefix='oth')
buil_pos = pd.get_dummies(buil_stru.position, prefix='pos')
buil_plan = pd.get_dummies(buil_stru.plan_configuration, prefix='plan')
buil_con = pd.get_dummies(buil_stru.condition_post_eq, prefix='con')

In [20]:
buil_stru = pd.concat([buil_stru, buil_land_dum, buil_foun, buil_roof, buil_grnd, buil_oth, buil_pos, buil_pos, buil_plan, buil_con], axis=1).drop(['land_surface_condition','foundation_type','roof_type','ground_floor_type','other_floor_type','position','plan_configuration','condition_post_eq'], axis=1)

In [21]:
del buil_land_dum, buil_foun, buil_roof, buil_grnd, buil_oth, buil_pos,buil_plan, buil_con
gc.collect()

18

In [38]:
stru_owner = pd.merge(buil_owner, buil_stru, on=['building_id','district_id','vdcmun_id','ward_id'])

In [41]:
stru_owner.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1052948 entries, 0 to 1052947
Data columns (total 83 columns):
building_id                                    1052948 non-null object
district_id                                    1052948 non-null int64
vdcmun_id                                      1052948 non-null int64
ward_id                                        1052948 non-null int64
count_families                                 1052946 non-null float64
has_secondary_use                              1052938 non-null float64
has_secondary_use_agriculture                  1052948 non-null int64
has_secondary_use_hotel                        1052948 non-null int64
has_secondary_use_rental                       1052948 non-null int64
has_secondary_use_institution                  1052948 non-null int64
has_secondary_use_school                       1052948 non-null int64
has_secondary_use_industry                     1052948 non-null int64
has_secondary_use_health_post             

* Merge train and test dataframe. 
* First we insert 'damage_grade' in test dataframe with some Negative value.

In [42]:
test['damage_grade'] = -111
trn_tst = pd.concat((train, test), axis=0)
trn_tst.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


(1052936, 14)

In [43]:
trn_tst.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1052936 entries, 0 to 421174
Data columns (total 14 columns):
area_assesed                             1052936 non-null object
building_id                              1052936 non-null object
damage_grade                             1052936 non-null object
district_id                              1052936 non-null int64
has_geotechnical_risk                    1052936 non-null float64
has_geotechnical_risk_fault_crack        1052936 non-null int64
has_geotechnical_risk_flood              1052936 non-null int64
has_geotechnical_risk_land_settlement    1052936 non-null int64
has_geotechnical_risk_landslide          1052936 non-null int64
has_geotechnical_risk_liquefaction       1052936 non-null int64
has_geotechnical_risk_other              1052936 non-null int64
has_geotechnical_risk_rock_fall          1052936 non-null int64
has_repair_started                       997597 non-null float64
vdcmun_id                                1052936 n

In [46]:
trn_tst['area_assesed'].value_counts()

Both                   638160
Building removed       218946
Exterior               165421
Not able to inspect     27385
Interior                 3024
Name: area_assesed, dtype: int64

In [47]:
trn_tst = pd.concat([trn_tst, pd.get_dummies(trn_tst.area_assesed)], axis=1).drop('area_assesed', axis=1)

In [50]:
full = pd.merge(stru_owner, trn_tst,on=['building_id','district_id','vdcmun_id'])
full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1052936 entries, 0 to 1052935
Data columns (total 98 columns):
building_id                                    1052936 non-null object
district_id                                    1052936 non-null int64
vdcmun_id                                      1052936 non-null int64
ward_id                                        1052936 non-null int64
count_families                                 1052935 non-null float64
has_secondary_use                              1052936 non-null float64
has_secondary_use_agriculture                  1052936 non-null int64
has_secondary_use_hotel                        1052936 non-null int64
has_secondary_use_rental                       1052936 non-null int64
has_secondary_use_institution                  1052936 non-null int64
has_secondary_use_school                       1052936 non-null int64
has_secondary_use_industry                     1052936 non-null int64
has_secondary_use_health_post             

We have only one feature variable which contains missing values.
Let's check out this values and we will use `Imputer` to fill the missing values.

In [52]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values="NaN", strategy='most_frequent')
imputed_values = full.has_repair_started.values.reshape(-1,1)
full.has_repair_started = imp.fit_transform(imputed_values, full.has_repair_started)

In [None]:
full.info()