In [33]:
from ipywidgets import widgets
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Get Data from Website

In [3]:
# Download the three competition CSVs into ./nepal/.
# -P sets the target directory; -nc skips a file that has already been downloaded.
!wget https://s3.amazonaws.com/drivendata/data/57/public/train_values.csv -nc -P ./nepal/
!wget https://s3.amazonaws.com/drivendata/data/57/public/train_labels.csv -nc -P ./nepal/
!wget https://s3.amazonaws.com/drivendata/data/57/public/test_values.csv -nc -P ./nepal/

--2019-11-14 09:14:08--  https://s3.amazonaws.com/drivendata/data/57/public/train_values.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.107.142
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.107.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23442727 (22M) [text/csv]
Saving to: ‘./nepal/train_values.csv’


2019-11-14 09:14:09 (20.6 MB/s) - ‘./nepal/train_values.csv’ saved [23442727/23442727]

--2019-11-14 09:14:09--  https://s3.amazonaws.com/drivendata/data/57/public/train_labels.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.107.142
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.107.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2330792 (2.2M) [text/csv]
Saving to: ‘./nepal/train_labels.csv’


2019-11-14 09:14:10 (5.03 MB/s) - ‘./nepal/train_labels.csv’ saved [2330792/2330792]

--2019-11-14 09:14:10--  https://s3.amazonaws.com/drivendata/data/57/public/test_values.csv
Resolving 

# Import Data

In [4]:
# Feature table, one row per building. The geo_level_*_id columns are read as
# 'object' so that they are handled as categorical codes rather than as numbers.
X = pd.read_csv('./nepal/train_values.csv',
                index_col='building_id',
                dtype={'geo_level_1_id': 'object',
                       'geo_level_2_id': 'object',
                       'geo_level_3_id': 'object'})

# Target: the damage grade of each building, keyed by the same building_id.
y = pd.read_csv('nepal/train_labels.csv', index_col='building_id')['damage_grade']

# train_test_split pairs X and y by row position, not by index label, so make
# the row order of y follow X explicitly. This is a no-op when the two files
# already share an order and raises KeyError if a building has no label.
y = y.loc[X.index]

# Attempt 1: Model w/ One Feature

In [None]:
def housing_plot(X, y):
    """Build a plotting callback that scatters one feature against the target.

    Parameters
    ----------
    X : DataFrame of features.
    y : Series of damage grades, indexed like ``X``.

    Returns
    -------
    A function ``plotter(column)`` that draws ``y`` against ``X[column]``.
    """
    def plotter(column):
        """Plot damage grade against the selected ``column`` of ``X``."""
        # Skip rows where the selected feature is missing.
        valid_rows = X[column].notna()
        plt.plot(X.loc[valid_rows, column], y[valid_rows], '.', color='k')
        # The x variable changes with the dropdown, so label it explicitly.
        plt.xlabel(column)
        plt.ylabel('Damage Level')
        plt.yticks([1,2,3])

    return plotter

# One dropdown entry per feature column, in alphabetical order.
dropdown_values = sorted(X.columns)
widgets.interact(housing_plot(X, y), column=dropdown_values);

In [None]:
# Select the single feature as a one-column DataFrame (list selector), not a Series.
X_height = X.loc[:, ['height_percentage']]
X_height.head()

## Train-test split

In [None]:
# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_height, y, test_size=0.2, random_state=42
)

In [None]:
# Baseline: logistic regression on the single height feature.
one_feat_model = LogisticRegression(multi_class='auto', solver='lbfgs')
one_feat_model.fit(X=X_train, y=y_train)

In [None]:
# Predictions on the training rows, used for the in-sample score.
y_train_pred = one_feat_model.predict(X_train)

## Compare in- and out-sample metrics (f1 score)

In [None]:
# Micro-averaged F1 on the rows the model was fitted on.
print('In-sample f1 score:')
f1_score(y_true=y_train, y_pred=y_train_pred, average='micro')

In [None]:
# Micro-averaged F1 on the held-out rows.
y_test_pred = one_feat_model.predict(X=X_test)
print('Out-sample f1 score:')
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

## Create submission

In [None]:
# Competition test features, keyed by building_id.
X_comp_test = pd.read_csv('nepal/test_values.csv').set_index('building_id')
X_comp_test.head()

In [None]:
# Predict the competition rows from the same single feature the model was trained on.
y_comp_pred = one_feat_model.predict(X_comp_test[['height_percentage']])

In [None]:
# One 'damage_grade' column, indexed by the test set's building_id.
y_submission = pd.DataFrame({'damage_grade': y_comp_pred}, index=X_comp_test.index)

In [None]:
# Write the predictions to a fixed, dated file name (overwritten on re-run).
y_submission.to_csv('nepal/2019-11-11_submission.csv')

**Score: 0.56**

# Create Functions for Repetitive Tasks

In [5]:
def create_submission(model, X_cols=None,
                      test_path='./nepal/test_values.csv', out_dir='nepal'):
    """Predict on the competition test set and write a timestamped submission CSV.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_cols : optional column selector (list, pandas Index, array, ...).
        When given, only these columns are passed to ``model.predict``;
        when ``None`` every column of the test file is used.
    test_path : path of the test-values CSV; defaults to the notebook's file.
    out_dir : existing directory that receives the submission file.

    Returns
    -------
    None. Writes ``<out_dir>/<UTC %Y-%m-%d_%H%M_>submission.csv``.
    """
    # Same dtypes as the training load, so the geo ids stay categorical codes.
    features = pd.read_csv(test_path,
                           index_col='building_id',
                           dtype={'geo_level_1_id': 'object',
                                  'geo_level_2_id': 'object',
                                  'geo_level_3_id': 'object'})
    # Identity check: `!= None` is evaluated element-wise for an Index/ndarray
    # and then fails inside `if` with an ambiguous-truth-value error.
    if X_cols is not None:
        features = features[X_cols]
    y_pred = model.predict(features)
    submission = pd.DataFrame(y_pred, index=features.index,
                              columns=['damage_grade'])
    # A minute-resolution UTC stamp keeps successive submissions apart.
    date_string = pd.Timestamp.utcnow().strftime(format='%Y-%m-%d_%H%M_')
    submission.to_csv(f'{out_dir}/{date_string}submission.csv')

def get_metrics(model, X_train, X_test, y_train, y_test):
    """Print the micro-averaged F1 of ``model`` on the train and test splits."""
    train_score, test_score = (
        f1_score(target, model.predict(features), average='micro')
        for features, target in ((X_train, y_train), (X_test, y_test))
    )
    print('In-sample f1 score:', train_score)
    print('Out-sample f1 score:', test_score)

# Attempt 2: Model w/ All Numerical Features

In [None]:
# Preview the first rows of the full feature table.
X.head()

In [None]:
# Every numeric column, whatever its exact width or kind. Comparing the dtype
# with the literal 'int64' would silently skip int32 or float columns.
# The geo_level_*_id columns were read as 'object' and are therefore excluded.
numerical_features = X.select_dtypes(include='number').columns.tolist()
# Another way: list(X.describe().columns)

In [None]:
# Re-split on the full feature table; same 80/20 ratio and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Logistic regression on all numeric columns, with the iteration cap set to 1000.
num_feat_model = LogisticRegression(
    multi_class='auto', solver='lbfgs', max_iter=1000
)
num_feat_model.fit(X=X_train[numerical_features], y=y_train)
y_train_pred = num_feat_model.predict(X=X_train[numerical_features])

print('In-sample f1 score:')
f1_score(y_true=y_train, y_pred=y_train_pred, average='micro')

In [None]:
# Held-out score for the numeric-feature model.
y_test_pred = num_feat_model.predict(X=X_test[numerical_features])

print('Out-sample f1 score:')
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

# Attempt 3: Numerical Features w/ Another Predictor

In [None]:
# Gradient-boosted trees on the same numeric columns as Attempt 2.
# GradientBoostingClassifier is imported with the other sklearn names in the
# notebook's import cell. The seed makes the fit repeatable across runs.
xg_num_model = GradientBoostingClassifier(random_state=42)
xg_num_model.fit(X_train[numerical_features], y_train)
y_train_pred = xg_num_model.predict(X_train[numerical_features])

print('In-sample f1 score:')
f1_score(y_train, y_train_pred, average='micro')

In [None]:
# Held-out score for the gradient-boosting model.
y_test_pred = xg_num_model.predict(X=X_test[numerical_features])

print('Out-sample f1 score:')
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

In [None]:
# Write a timestamped submission, predicting from the numeric columns only.
create_submission(xg_num_model, numerical_features)

# Attempt 4: Use all features (numeric and categorical)

In [16]:
# Every non-numeric column is treated as categorical. Testing for the literal
# 'object' dtype would miss string columns stored under another dtype, and
# those would then reach the classifier un-encoded via remainder='passthrough'.
categorical_variables = X.select_dtypes(exclude='number').columns.tolist()

# Same 80/20 split and seed as the earlier attempts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                    random_state=42)

In [None]:
# One-hot encode the categorical columns; every other column passes through as is.
# handle_unknown='ignore' lets categories unseen during fit be encoded as all zeros.
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_variables)],
    remainder='passthrough')

# max_iter matches num_feat_model (Attempt 2), which already raised it for a
# much smaller feature set than this one.
all_feat_model = Pipeline([
    ('transformer', ct),
    ('classifier', LogisticRegression(solver='lbfgs',
                                      max_iter=1000,
                                      multi_class='auto'))
])

# NOTE(review): this fits on the full X / y rather than on X_train / y_train,
# so no rows are held out for this model. Presumably intentional for the
# submission; confirm.
all_feat_model.fit(X, y);

In [None]:
# Scored on the same rows the pipeline was fitted on, so this is an in-sample figure.
y_pred = all_feat_model.predict(X)
# f1_score expects (y_true, y_pred). The micro average happens to be symmetric,
# but any other averaging mode would be wrong with the arguments swapped.
f1_score(y, y_pred, average='micro')

In [None]:
# Write a timestamped submission using every column of the test file.
create_submission(all_feat_model)

# Attempt 5: Ensemble Predictor with GridSearch

In [40]:
# Powers of two from 2**1 to 2**5: the candidate values used in the grid below.
2 ** np.arange(1, 6)

array([ 2,  4,  8, 16, 32])

In [None]:
# One-hot encode the categorical columns; every other column passes through as is.
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_variables)],
    remainder='passthrough')

# 5 x 5 grid over powers of two (2, 4, 8, 16, 32) for both split parameters.
# NOTE(review): a max_depth range (range(3, 13, 3)) was part of this grid at
# some point and is not searched here. The recorded best_params_ output still
# lists max_depth, so it appears to come from that earlier run.
param_grid = {'min_samples_split' : np.power(2,np.arange(1,6)),
              'min_samples_leaf': np.power(2,np.arange(1,6))}

# random_state makes the randomized trees, and hence the search, repeatable.
# scoring='f1_micro' selects on the same metric that get_metrics reports.
gs = GridSearchCV(ExtraTreesClassifier(random_state=42),
                  param_grid=param_grid,
                  scoring='f1_micro',
                  cv=3,
                  n_jobs=6,
                  verbose=1)

# The grid search is the pipeline's final step, so the encoder is fitted once
# on all of X_train and the search then runs on the encoded matrix.
all_feat_model = Pipeline([
    ('transformer', ct),
    ('classifier', gs)
])

all_feat_model.fit(X_train, y_train);

Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


In [43]:
# The pipeline's 'classifier' step is the fitted GridSearchCV; show the winning parameters.
all_feat_model.named_steps['classifier'].best_params_

{'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 2}

In [44]:
# Print micro-F1 on the training and held-out splits for the tuned pipeline.
get_metrics(all_feat_model, X_train, X_test, y_train, y_test)

In-sample f1 score: 0.6022831926323868
Out-sample f1 score: 0.5981466203641527


In [26]:
# Write a timestamped submission from the tuned pipeline, using every column of the test file.
create_submission(model=all_feat_model)