<a href="https://colab.research.google.com/github/wel51x/DS-Unit-2-Sprint-5-Predictive-Modeling-Challenge/blob/master/DS2_Unit2_Sprint5_Day_4_Submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Science Unit 2 Sprint 5 DS2 Predictive Modeling Challenge

## Day 4 - XGBoost

In [0]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import warnings
import time

# Wall-clock start; the elapsed time is reported at the very end of the run.
start_time = time.time()
pd.set_option('display.max_columns', None) # all cols
pd.set_option('display.width', 161)
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# get data
#path = "https://github.com/wel51x/DS-Unit-2-Sprint-5-Predictive-Modeling-Challenge/tree/master/data/"
train_features_file = "train_features.csv"
train_labels_file = "train_labels.csv"
test_features_file = "test_features.csv"
submission_in_file = "sample_submission.csv"

train_features = pd.read_csv(train_features_file, sep=',')
train_labels = pd.read_csv(train_labels_file)
test_features = pd.read_csv(test_features_file)

print("\ntrain_features null counts:")
print(train_features.isnull().sum())
print(train_labels.status_group.value_counts())
print("\ntest_features null counts:")
print(test_features.isnull().sum())

# Drop every column that has a missing value in EITHER split.
# Dropping per split (dropna on each frame independently) would leave the two
# frames with different feature sets whenever a column has nulls in only one
# of them; using the union keeps train and test column-compatible.
train_has_null = train_features.isnull().any()
test_has_null = test_features.isnull().any()
null_cols = sorted(set(train_has_null[train_has_null].index)
                   | set(test_has_null[test_has_null].index))
# errors='ignore' tolerates a column that exists in only one of the frames.
train_features.drop(null_cols, axis=1, errors='ignore', inplace=True)
test_features.drop(null_cols, axis=1, errors='ignore', inplace=True)

# Sanity check: both totals should now be 0.
print(train_features.isnull().sum().sum())
print(test_features.isnull().sum().sum())

# Engineered features, built identically for the train and test frames.
#
# The lookup tables are defined once (grouped by the score they assign) and
# applied to both frames so the two splits can never drift apart.

# create age feature: boolean -> 0/1 flag
newer_flag = {True : 1, False : 0}

# create area feature: hand-assigned integer score per region
area_score_groups = (
    (2, ('Iringa',)),
    (1, ('Arusha', 'Manyara')),
    (0, ('Shinyanga', 'Mbeya', 'Kilimanjaro', 'Morogoro', 'Kagera',
         'Mwanza', 'Kigoma', 'Ruvuma', 'Pwani', 'Tanga', 'Dodoma',
         'Singida', 'Mara', 'Dar es Salaam')),
    (-1, ('Tabora', 'Rukwa')),
    (-2, ('Lindi', 'Mtwara')),
)
region_to_area = {region: score
                  for score, regions in area_score_groups
                  for region in regions}

# create payment feature: hand-assigned integer score per payment type
pmt_score_groups = (
    (2, ('annually',)),
    (0, ('never pay', 'unknown', 'on failure', 'other')),
    (1, ('per bucket', 'monthly')),
)
payment_to_pmt = {payment: score
                  for score, payments in pmt_score_groups
                  for payment in payments}

for frame in (train_features, test_features):
    # 1 when the construction year is later than 1998, otherwise 0.
    frame['newer'] = (frame['construction_year'] > 1998).map(newer_flag)
    # replace() leaves any value that is not a key of the table untouched.
    frame['area'] = frame['region'].replace(region_to_area)
    frame['pmt'] = frame['payment_type'].replace(payment_to_pmt)

print("\ntrain_features unique nonumeric values")
print(train_features.select_dtypes(exclude=np.number).nunique())

# Target vector: one status label per training row.
y_train = train_labels.status_group
# drop id + "high cardinality"
dropped_cols = ['id', "date_recorded", "wpt_name", "lga", "ward"]
X_train = train_features.drop(dropped_cols, axis = 1)
X_test  = test_features.drop(dropped_cols, axis = 1)

# NOTE(review): the [:-1] slices leave the last non-numeric column out of the
# encoded (and therefore the modelled) features. Kept exactly as in the
# original -- confirm that dropping that column is intentional.
train_num_cols = X_train.select_dtypes('number').columns.tolist()
train_cat_cols = X_train.select_dtypes(exclude='number').columns.tolist()[:-1]
test_num_cols = X_test.select_dtypes('number').columns.tolist()
test_cat_cols = X_test.select_dtypes(exclude='number').columns.tolist()[:-1]

cat_encoder = LabelEncoder()


def _encode_categoricals(frame, cat_cols, vocab_frames, encoder):
    """Label-encode ``cat_cols`` of ``frame`` with a vocabulary shared across splits.

    For each column the encoder is (re)fitted on the values of that column
    gathered from every frame in ``vocab_frames`` that contains it, and then
    used to transform ``frame``'s own values. Fitting on the combined values
    guarantees that a given category string receives the same integer code in
    every split; fitting each split on its own does not.

    Parameters:
        frame: DataFrame whose columns are encoded.
        cat_cols: names of the columns of ``frame`` to encode.
        vocab_frames: DataFrames that contribute category values.
        encoder: a LabelEncoder instance, refitted once per column.

    Returns:
        A DataFrame indexed like ``frame`` holding one integer-coded column
        per entry of ``cat_cols``, in the same order.
    """
    codes = {}
    for col in cat_cols:
        vocabulary = pd.concat(
            [source[col] for source in vocab_frames if col in source.columns],
            ignore_index=True)
        codes[col] = encoder.fit(vocabulary).transform(frame[col])
    return pd.DataFrame(codes, index=frame.index)


encX_train = _encode_categoricals(X_train, train_cat_cols, (X_train, X_test), cat_encoder)
encX_test = _encode_categoricals(X_test, test_cat_cols, (X_train, X_test), cat_encoder)
# Append the untouched numeric columns after the encoded categorical ones.
encX_train = encX_train.join(X_train[train_num_cols])
encX_test = encX_test.join(X_test[test_num_cols])

print(encX_train.info())

# Gradient-boosted tree classifier.
#
# `nrounds = 'min.error.idx'` and `maximize = False` were removed: they are
# arguments of xgboost's R interface / the low-level training function, not
# XGBClassifier parameters. The remaining hyperparameters are unchanged.
#
# NOTE(review): num_class is presumably inferred from the labels by the
# scikit-learn wrapper -- confirm that 4 matches the number of distinct
# status_group values, or drop it.
xgb = XGBClassifier(objective = 'multi:softmax', booster = 'gbtree',
                    num_class = 4,
                    eval_metric = 'merror', eta = .2,
                    max_depth = 16, colsample_bytree = .4, nthread = -1)

xgb.fit(encX_train, y_train)
# Scored on the same rows the model was fitted on, so this is training
# accuracy, not an estimate of held-out performance.
print("XGB Score:", xgb.score(encX_train, y_train))

# Start from the sample submission file and overwrite its status_group column.
submission = pd.read_csv(submission_in_file)

predictions = xgb.predict(encX_test)
temp = pd.DataFrame(predictions)

# Quick look at the shape and class balance of the predictions.
print(temp.shape)
print(temp[0].value_counts(normalize=True))

# Assign by position rather than by index label: if the number of predictions
# ever differs from the number of submission rows this raises immediately
# instead of silently writing NaN / dropping rows through index alignment.
submission['status_group'] = temp[0].values
print(submission.status_group.value_counts())

submission_out_file = "submission-day4.csv"
submission.to_csv(submission_out_file, index=False)
print("--- Run time: %s seconds ---" % (time.time() - start_time))
# Best score so far: 0.81529 ==>> NOW 0.81766!


train_features null counts:
id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_qual