# Logistic Regression Model

Problem description and data source (DrivenData Nepal earthquake damage competition): https://www.drivendata.org/competitions/57/nepal-earthquake/page/136/

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate, KFold, RandomizedSearchCV

In [2]:
# Read the feature table and the label table, then attach each building's
# label to its feature row (left join keyed on building_id).
raw_train = pd.read_csv('Data/train_values.csv')
raw_label = pd.read_csv('Data/train_labels.csv')
df = raw_train.merge(raw_label, how="left", on="building_id")
df.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,802906,6,487,12198,2,30,6,5,t,r,...,0,0,0,0,0,0,0,0,0,3
1,28830,8,900,2812,2,10,8,7,o,r,...,0,0,0,0,0,0,0,0,0,2
2,94947,21,363,8973,2,10,5,5,t,r,...,0,0,0,0,0,0,0,0,0,3
3,590882,22,418,10694,2,10,6,5,t,r,...,0,0,0,0,0,0,0,0,0,2
4,201944,11,131,1488,3,30,8,9,t,r,...,0,0,0,0,0,0,0,0,0,3


In [3]:
# Keep only rows with age < 300 that are also strictly below the 90th
# percentile of height, floor count and area. Every threshold is computed on
# the frame as it is when this cell starts, before any row is dropped.
# NOTE: this cell rebinds `df`, so re-running it alone filters a second time.
quantile_capped = ['height_percentage', 'count_floors_pre_eq', 'area_percentage']
keep = df['age'] < 300
for col in quantile_capped:
    keep &= df[col] < df[col].quantile(0.9)
df = df[keep]

In [4]:
# Per-column count of missing values (sanity check before preprocessing).
df.isna().sum()

building_id                               0
geo_level_1_id                            0
geo_level_2_id                            0
geo_level_3_id                            0
count_floors_pre_eq                       0
age                                       0
area_percentage                           0
height_percentage                         0
land_surface_condition                    0
foundation_type                           0
roof_type                                 0
ground_floor_type                         0
other_floor_type                          0
position                                  0
plan_configuration                        0
has_superstructure_adobe_mud              0
has_superstructure_mud_mortar_stone       0
has_superstructure_stone_flag             0
has_superstructure_cement_mortar_stone    0
has_superstructure_mud_mortar_brick       0
has_superstructure_cement_mortar_brick    0
has_superstructure_timber                 0
has_superstructure_bamboo       

## Create Pipeline

In [6]:
# Column groups consumed by the ColumnTransformer, one list per transformer.

# Numeric columns (scaled).
numerical = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

# Letter-coded categorical columns (one-hot encoded).
categorical = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

binary = [
    # superstructure material flags
    'has_superstructure_adobe_mud',
    'has_superstructure_mud_mortar_stone',
    'has_superstructure_stone_flag',
    'has_superstructure_cement_mortar_stone',
    'has_superstructure_mud_mortar_brick',
    'has_superstructure_cement_mortar_brick',
    'has_superstructure_timber',
    'has_superstructure_bamboo',
    'has_superstructure_rc_non_engineered',
    'has_superstructure_rc_engineered',
    'has_superstructure_other',
    # secondary-use flags
    'has_secondary_use',
    'has_secondary_use_agriculture',
    'has_secondary_use_hotel',
    'has_secondary_use_rental',
    'has_secondary_use_institution',
    'has_secondary_use_school',
    'has_secondary_use_industry',
    'has_secondary_use_health_post',
    'has_secondary_use_gov_office',
    'has_secondary_use_use_police',
    'has_secondary_use_other',
    # NOTE: the geo ids are integer region codes, not 0/1 flags; they live in
    # this list only so that they are routed through the same encoder.
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]

In [19]:
# Numeric columns are standardised.
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

# Letter-coded categorical columns are one-hot encoded; categories unseen at
# fit time are ignored at transform time instead of raising.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Columns listed in `binary` are one-hot encoded as well. The step was
# previously labelled "ordinal" although it wraps a OneHotEncoder; the label
# now matches the encoder that is actually applied (behaviour is unchanged).
binary_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its transformer. Columns not named in any group
# are dropped (ColumnTransformer's default for the remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical),
        ("cat", categorical_transformer, categorical),
        ("binary", binary_transformer, binary)
    ]
)

## Model Fitting and Cross Validation

In [20]:
# Features: every column named in the three groups, in group order.
# Target: the damage grade.
train = df.loc[:, numerical + categorical + binary]
label = df.loc[:, 'damage_grade']

In [21]:
# Hold out 20% of the rows for the final evaluation.
# random_state pins the shuffle so the split (and every score derived from it)
# is reproducible after Restart & Run All; stratify keeps the damage_grade
# class proportions the same in both partitions, since the classes are
# imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    train, label, test_size=0.2, shuffle=True, random_state=42, stratify=label
)

In [22]:
# Fit the preprocessing on the training split only, then apply the same fitted
# transform to both splits so the test rows never influence scaling/encoding.
# NOTE: `pipeline` is the fitted ColumnTransformer itself (fit returns self),
# not a sklearn Pipeline that also contains the classifier.
pipeline = preprocessor.fit(X_train)
train_x = pipeline.transform(X_train)
test_x = pipeline.transform(X_test)

In [23]:
# Manual 5-fold cross-validation on the preprocessed training split, printing
# the accuracy and confusion matrix of every fold.
kf = KFold(n_splits=5)

# Convert the labels once: positional fold indices need a plain array, and the
# conversion does not depend on the fold, so it is hoisted out of the loop.
y_train_np = y_train.to_numpy()

for train_index, test_index in kf.split(train_x):
    X_train_cv, X_test_cv = train_x[train_index], train_x[test_index]
    y_train_cv, y_test_cv = y_train_np[train_index], y_train_np[test_index]

    # A fresh model per fold so no fitted state carries over between folds.
    clf = LogisticRegression(solver="lbfgs", max_iter=100000,
                             multi_class = "multinomial")
    clf.fit(X_train_cv, y_train_cv)

    predict = clf.predict(X_test_cv)
    print(accuracy_score(y_test_cv, predict))
    print(confusion_matrix(y_test_cv, predict))
    print("\n")

0.7400760892972508
[[ 1258  1404    69]
 [  601 13881  1754]
 [   82  3332  5481]]


0.7380661833321369
[[ 1276  1331    72]
 [  646 13814  1788]
 [   75  3386  5474]]


0.7408298040341684
[[ 1287  1384    72]
 [  606 13870  1853]
 [   79  3227  5484]]


0.7410358565737052
[[ 1254  1347    76]
 [  639 14060  1724]
 [   78  3351  5332]]


0.7442661785291267
[[ 1312  1410    54]
 [  577 13943  1723]
 [  105  3256  5481]]




## Randomized CV Search

In [26]:
# hyperparameters sampled by the randomized search
param_grid = {
    'C': [0.01, 0.1, 0.5, 1, 5, 10],
    'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']
}

# model
# random_state matters for the 'sag' and 'saga' solvers, which shuffle the
# data; fixing it makes their fits repeatable.
logistic = LogisticRegression(multi_class = "multinomial", max_iter=100000,
                             penalty="l2", random_state=42)

# cross validation
# shuffle=True without a seed would draw different folds on every run, so the
# fold assignment is pinned as well.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [27]:
# Sample 10 of the (C, solver) combinations and score each by cross-validated
# accuracy. random_state fixes which combinations are drawn, so the reported
# best score and parameters are reproducible across runs.
search = RandomizedSearchCV(
    estimator=logistic,
    param_distributions=param_grid,
    n_iter=10,
    scoring="accuracy",
    n_jobs=-1,
    cv=cv,
    random_state=42,
)
search.fit(train_x, y_train)
print('Best Score: %s' % search.best_score_)
print('Best Hyperparameters: %s' % search.best_params_)

Best Score: 0.7423981463544409
Best Hyperparameters: {'solver': 'sag', 'C': 0.5}


In [30]:
# Score the searched model on the held-out split. With the default refit=True,
# search.predict delegates to the best estimator refit on all of train_x.
logreg_prediction = search.predict(test_x)
# Legacy alias: the old name suggested an SVC, but the model is logistic
# regression. Kept so any later cell that still reads it keeps working.
svc_prediction = logreg_prediction
print(f"Accuracy score: {accuracy_score(y_test, logreg_prediction)}")

Accuracy score: 0.7445372842909237
