In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Data Preprocessing

In [91]:
# Load the raw table from the CSV next to the notebook and preview the first rows.
csv_path = 'Earthquate_Damage.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,3


In [92]:
def data_explanation(data, name):
    """
    Write a plain-text description of every column of `data`.

    The report is written to ./data_description_{name}.txt (an existing file
    is overwritten). For each column it lists the column name, the number of
    rows, the number of distinct values and the distinct values themselves.

    Parameters
    ----------
    data : DataFrame-like object exposing `.columns` and column access via
        `data[col]`, where each column provides `.unique()`.
    name : str
        Suffix used in the output file name.

    Returns
    -------
    None
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    # The encoding is fixed so the report does not depend on the platform default.
    with open(f'./data_description_{name}.txt', 'w', encoding='utf-8') as f:
        for i in data.columns:
            # Compute the distinct values once and reuse them for both lines.
            unique_values = data[i].unique()
            f.write(f'Feature Name: {i} \n')
            f.write(f'# of data: {len(data[i])} \n')
            f.write(f'# of unique data: {len(unique_values)} \n')
            f.write(f'unique datas: {unique_values} \n\n')
    
# Write ./data_description_earthquake.txt for the loaded dataset.
data_explanation(data = data, name = 'earthquake')

In [93]:
# Print column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 40 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   building_id                             260601 non-null  int64 
 1   geo_level_1_id                          260601 non-null  int64 
 2   geo_level_2_id                          260601 non-null  int64 
 3   geo_level_3_id                          260601 non-null  int64 
 4   count_floors_pre_eq                     260601 non-null  int64 
 5   age                                     260601 non-null  int64 
 6   area_percentage                         260601 non-null  int64 
 7   height_percentage                       260601 non-null  int64 
 8   land_surface_condition                  260601 non-null  object
 9   foundation_type                         260601 non-null  object
 10  roof_type                               260601 non-null 

Data Description file 참조 결과,

- building ID는 unique data 개수가 총 데이터 수와 같은 불필요한 식별자(unnecessary identifier)이므로 제거되어야 함.

- info() function 결과를 보면 모든 column에 null 값이 없음을 볼 수 있음.

[1] 입력 변수의 속성이 numeric이 아닌 변수들에 대해 1-of-C coding (1-hot encoding) 방식을 통해 
명목형(요인형) 변수를 범주의 개수만큼의 이진형(binary) 변수들로 구성되는 dummy variable을 
생성하시오.

In [96]:
# Separate the label from the predictors; building_id is left out of the predictors too.
target = data['damage_grade']
features = data.drop(columns = ['building_id', 'damage_grade'])

In [99]:
#Object type features list
OHE_features = list(features.select_dtypes(include = ['object']).columns)
len(OHE_features)

8

In [100]:
# Display the collected column names.
OHE_features

['land_surface_condition',
 'foundation_type',
 'roof_type',
 'ground_floor_type',
 'other_floor_type',
 'position',
 'plan_configuration',
 'legal_ownership_status']

In [101]:
# One-hot encode the columns listed in OHE_features. drop='first' omits the
# first category of each column, so k categories become k-1 indicator columns.
OHE = OneHotEncoder(drop='first')
One_hot_encoded = OHE.fit_transform(features[OHE_features])
# Reuse the index of `features` for the encoded frame: pd.concat(axis=1) aligns
# rows by index label, so a fresh 0..n-1 index would misalign rows (and
# introduce NaNs) whenever `features` does not carry a default RangeIndex.
One_hot_encoded_dt = pd.DataFrame(
    One_hot_encoded.toarray(),
    columns = OHE.get_feature_names_out(OHE_features),
    index = features.index,
)

# Replace the original categorical columns with their indicator columns.
features = features.drop(OHE_features, axis = 1)
features = pd.concat([features, One_hot_encoded_dt], axis = 1)

features.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,6,487,12198,2,30,6,5,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,8,900,2812,2,10,8,7,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,21,363,8973,2,10,5,5,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,22,418,10694,2,10,6,5,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,11,131,1488,3,30,8,9,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [102]:
# Standardize every feature column and keep the original column labels.
# NOTE(review): the scaler is fitted on every row here; fit it on training rows
# only if these values feed a held-out evaluation.
scaler = StandardScaler()
scaled_features = pd.DataFrame(scaler.fit_transform(features), columns = features.columns)

scaled_features.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_f,plan_configuration_m,plan_configuration_n,plan_configuration_o,plan_configuration_q,plan_configuration_s,plan_configuration_u,legal_ownership_status_r,legal_ownership_status_v,legal_ownership_status_w
0,-0.983414,-0.518705,1.629055,-0.178274,0.0471,-0.45946,-0.226419,3.206391,0.558971,-0.188554,...,-0.009188,-0.013287,-0.012076,-0.024708,-0.149431,-0.036462,-0.119168,-0.075395,0.196223,-0.101878
1,-0.734459,0.481998,-0.945017,-0.178274,-0.224765,-0.00411,0.816109,-0.311877,0.558971,-0.188554,...,-0.009188,-0.013287,-0.012076,-0.024708,-0.149431,-0.036462,-0.119168,-0.075395,0.196223,-0.101878
2,0.883744,-0.819158,0.744612,-0.178274,-0.224765,-0.687135,-0.226419,-0.311877,0.558971,-0.188554,...,-0.009188,-0.013287,-0.012076,-0.024708,-0.149431,-0.036462,-0.119168,-0.075395,0.196223,-0.101878
3,1.008221,-0.685893,1.216589,-0.178274,-0.224765,-0.45946,-0.226419,-0.311877,0.558971,-0.188554,...,-0.009188,-0.013287,-0.012076,-0.024708,-0.149431,-0.036462,-0.119168,-0.075395,0.196223,-0.101878
4,-0.361028,-1.381296,-1.308119,1.195989,0.0471,-0.00411,1.858636,3.206391,-1.789003,-0.188554,...,-0.009188,-0.013287,-0.012076,-0.024708,-0.149431,-0.036462,-0.119168,-0.075395,0.196223,-0.101878


[2] 전체 데이터셋을 임의로 150,000 개의 빌딩이 포함된 Training dataset 과 50,000 개의 Validation 
dataset, 그리고 60,601 개의 Test dataset 으로 구분한 뒤 다음 각 물음에 답하시오. 분류 성능을 
평가/비교할 때는 3-class classification 의 Accuracy 와 Balanced Correction Rate (BCR)을 이용하시오.

In [104]:
# train / validation / test split: 150,000 / 50,000 / 60,601 rows.
# Split the *unscaled* features so the scaler below can be fitted on the
# training rows only; standardizing with statistics of the full dataset leaks
# validation/test information into the model inputs. Row assignment depends
# only on the number of rows and random_state, so each subset holds the same
# rows as a split of the already-scaled frame would.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 60601, random_state = 42)
# train valid split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 50000, random_state = 42)

# Fit on the training rows, then apply the same transform to every subset,
# keeping column labels and the original row index of each part.
split_scaler = StandardScaler().fit(X_train)
X_train, X_valid, X_test = (
    pd.DataFrame(split_scaler.transform(part), columns = part.columns, index = part.index)
    for part in (X_train, X_valid, X_test)
)

print(len(X_train), len(X_valid), len(X_test))

150000 50000 60601


In [None]:
# performance evaluation function

def perf_eval_fc(y_pred, y_test):
    