In [123]:
import xgboost as xgb
import pandas as pd
pd.options.display.max_columns = 999
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import GridSearchCV, validation_curve, learning_curve
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
import category_encoders as ce

In [179]:
def map_installer(installer):
    
    unknown = ['0', 'unknown']
    
    if installer in unknown:
        return 'unknown'
    
    government = ['government ', 'government', 'dwe', 'hesawa', 'rwe', 'central government', 'lga',
                 'district council', 'gover', 'gove', 'gov', 'district water department',
                 'sengerema water department', 'distri', 'centr', 'distric water department',
                 'tasaf']
    
    if installer in government:
        return 'government'
    
    community = ['community', 'commu', 'villagers', 'twesa']
    
    if installer in community:
        return 'community'
    
    religious = ['church of disciples', 'kkkt', 'world vision', 'rc church', 'rc', 'tcrs',
                'dmdd']
    
    if installer in religious:
        return 'religious'
    
    international = ['norad', 'fini water', 'danida', 'danid', 'ces', 'kuwait',
                    'finw']
    
    if installer in international:
        return 'international'
    
    private = ['private', 'privat', 'kiliwater', 'wedeco']
    
    if installer in private:
        return 'private'
    
    aid = ['roman', 'amref', 'world bank', 'unicef', 'oxfam']
    
    if installer in aid:
        return 'aid'
    
    
    return 'other'

def season(month):
    """
    Returns a string corresponding to the typical seasononal period in Tanzania, given
    month input as an integer
    """
    if month in [4,5]:
        return 'heavy_rain'
    elif month in [12, 1, 2]:
        return 'hot_dry'
    elif month == 3:
        return 'intermittent_rain'
    elif month in [6,7,8,9,10]:
        return 'cool_dry'
    else:
        return 'short_rains'

def wrangle(X, X_train):
    """
    Takes in the raw water pump features and returns an enhanced dataframe
    
    We also need to pass in the training data for some functions, such as getting top value_counts.
    
    When wrangling the training set, we will pass two copies of the training data
    
    """
    X = X.copy()
    X_train = X_train.copy()
    
    ### date recorded ###
    X['year_recorded'] = X['date_recorded'].apply(lambda x: int(x.split('-')[0]))
    X['years_since_construction'] = [record - construction if construction > 1900 
                                        else 100 for record, construction in 
                                        zip(X['year_recorded'], X['construction_year'])]
    
    days = X_train['date_recorded'].value_counts()
    X['day_record_count'] = [days[day] if day in days.index else 0 for day in X['date_recorded']]
    X['month_recorded'] = X['date_recorded'].apply(lambda x: int(x.split('-')[1]))
    X['season'] = X['month_recorded'].apply(season)
    
    ### funder / installer ###
    # convert all strings to lowercase
    X['funder'], X['installer'] = X['funder'].str.lower(), X['installer'].str.lower()
    X['funded_and_installed'] = np.where(X['funder'] == X['installer'], True, False)
    # encode into categories
    X['funder_cat'] = X['funder'].apply(map_installer)
    X['installer_cat'] = X['installer'].apply(map_installer)
    
    # drop full feature sets
    X = X.drop(['funder', 'installer'], axis=1)
    
#     top_funders, top_installers = X_train['funder'].str.lower().value_counts().index[:20], X_train['installer'].str.lower().value_counts().index[:20]
#     X['funder'] = ['other' if funder not in top_funders else funder for funder in X['funder']]
#     X['installer'] = ['other' if installer not in top_installers else installer for installer in X['installer']]
    
    ### total static head ###
    X['tsh_zero'] = X['amount_tsh'] == 0.0
#     X['log_amount_tsh'] = np.log(X['amount_tsh']) <- has problems with 0
    
    ### waterpoint name ###
    X['wpt_name'] = X['wpt_name'].str.lower()
    top_wpt = X_train['wpt_name'].str.lower().value_counts().index[:50]
    X['wpt_name'] = ['other' if wpt_name not in top_wpt else wpt_name for wpt_name in X['wpt_name'].str.lower()]
    
    ### population ###
    X['population_unknown'] = X['population'] == 0
    X['population'].replace(0, np.nan, inplace=True)
    X["population"].fillna(X_train.groupby(['region', 'district_code'])["population"].transform("median"), inplace=True)

    X["population"].fillna(X_train.groupby(['region'])["population"].transform("median"), inplace=True)

    X["population"].fillna(X_train["population"].median(), inplace=True)

    
    ### subvillage ###
    # encode unknown as binary
    X['subvillage_unknown'] = X['subvillage'].isnull()
    # encode top 50, everything else as other
    X['subvillage'] = X['subvillage'].str.lower()
    sub_villages_count = X_train['subvillage'].str.lower().value_counts()
    X['subvillage_waterpoints'] = [sub_villages_count[vill] if vill in sub_villages_count.index else 0 for vill in X['subvillage'].str.lower()]
    
    top_subvillage = X_train['subvillage'].str.lower().value_counts().index[:50]
    X['subvillage'] = ['other' if vill not in top_subvillage else vill for vill in X['subvillage'].str.lower()]
    
    ### region code ###
    # encode region codes as strings, which will be automatically one hot encoded later
    X['region_code'] = X['region_code'].astype(str)
    X = X.drop('region', axis=1)
    
    ### district code ###
    # encode district codes as strings, which will be automatically one hot encoded later
    X['district_code'] = X['district_code'].astype(str)
    
    ### lga ###
    X['lga'] = X['lga'].str.lower()
    # encode urban and rural
    X['lga_rural'] = [lga.find('rural') != -1 for lga in X['lga']]
    X['lga_urban'] = [lga.find('urban') != -1 for lga in X['lga']]
    # encode anything except the top 30 as other
    top_lga = X_train['lga'].str.lower().value_counts().index[:30]
    X['lga'] = ['other' if lga not in top_lga else lga for lga in X['lga'].str.lower()]
    
    ### ward ###
    # create new variable with the number of waterpoints in a given ward
    ward_counts = X_train['ward'].str.lower().value_counts()
    X['ward_wpt_count'] = [ward_counts[ward] if ward in ward_counts.index else 0 for ward in X['ward'].str.lower()]
    
    
    ### public meeting ###
    X['public_meeting_unknown'] = X['public_meeting'].isnull() # encode missing vals to a binary var
    X['public_meeting'] = X['public_meeting'].fillna(False) # fill na with false
    
    ### recorded by ###
    X = X.drop('recorded_by', axis=1) # drop it, all the same value
    
    ### permit ###
    X['permit_unknown'] = X['permit'].isnull() # encode missing vals to a binary var
    X['permit'] = X['permit'].fillna(False) # fill na with false
    
    ### scheme name/management ###
    X['scheme_management'] = X['scheme_management'].fillna('None') # encode nulls as 'None'
    X = X.drop('scheme_name', axis=1)
    
    ### construction year ###
    X['construction_unknown'] = X['construction_year'] < 1900
#     X['early60s'] = X['construction_year'].between(1900, 1964)
#     X['late60s'] = X['construction_year'].between(1965, 1969)
#     X['early70s'] = X['construction_year'].between(1970, 1974)
#     X['late70s'] = X['construction_year'].between(1975, 1979)
#     X['early80s'] = X['construction_year'].between(1980, 1984)
#     X['late80s'] = X['construction_year'].between(1985, 1989)
#     X['early90s'] = X['construction_year'].between(1990, 1994)
#     X['late90s'] = X['construction_year'].between(1995, 1999)
#     X['early00s'] = X['construction_year'].between(2000, 2004)
#     X['late00s'] = X['construction_year'].between(2005, 2009)
#     X['early10s'] = X['construction_year'].between(2010, 2014)
#     X = X.drop('construction_year', axis=1)
    
    ### extraction type ###
    X = X.drop(['extraction_type_group', 'extraction_type_class'], axis=1)
    
    ### management ###
    X = X.drop('management_group', axis=1)
    
    ### payment ###
    X = X.drop('payment_type', axis=1)
    
    ### quality ###
    X = X.drop(['quality_group', 'quantity_group'], axis=1)
    
    ### source ###
    X = X.drop(['source_type', 'source_class'], axis=1)
    
    ### waterpoint type ###
    X = X.drop('waterpoint_type_group', axis=1)
    
    ### longitude/latitude/gps_height ###
    X['gps_height_bad'] = X['gps_height'] <= 0.0 # min height should be sea level
    X['latitude_bad'] = X['latitude'] < 25.0
    X['longitude_bad'] = X['longitude'] > -0.5
    
    # fill in missing values based on location - STOLEN FROM BREADWARD
    
#     training_data["gps_height"].fillna(training_data.groupby(['region', 'district_code'])["gps_height"].transform("mean"), inplace=True)
#     training_data["gps_height"].fillna(training_data.groupby(['region'])["gps_height"].transform("mean"), inplace=True)
#     training_data["gps_height"].fillna(training_data["gps_height"].mean(), inplace=True)
#     training_data["amount_tsh"].fillna(training_data.groupby(['region', 'district_code'])["amount_tsh"].transform("median"), inplace=True)
#     training_data["amount_tsh"].fillna(training_data.groupby(['region'])["amount_tsh"].transform("median"), inplace=True)
#     training_data["amount_tsh"].fillna(training_data["amount_tsh"].median(), inplace=True)
#     training_data["latitude"].fillna(training_data.groupby(['region', 'district_code'])["latitude"].transform("mean"), inplace=True)
#     training_data["longitude"].fillna(training_data.groupby(['region', 'district_code'])["longitude"].transform("mean"), inplace=True)
#     training_data["longitude"].fillna(training_data.groupby(['region'])["longitude"].transform("mean"), inplace=True)
    
    
    
    ### DROPPING AFTER FEATURE ENGINEERING ####
    X = X.drop('date_recorded', axis=1)
  
    return X # returned wrangled dataframe

df_raw = pd.read_csv('train_features.csv', index_col=0)
df_test_raw = pd.read_csv('test_features.csv', index_col=0)
X_train, X_test = wrangle(df_raw, df_raw), wrangle(df_test_raw, df_raw)
y_train = pd.read_csv('train_labels.csv', index_col=0)

In [180]:
# # input KNN predictions based on geography
# include_knn = False
# if include_knn:
#     from sklearn.neighbors import KNeighborsClassifier
#     geo = ['latitude', 'longitude']
#     knn = KNeighborsClassifier(n_neighbors=5).fit(X_train[geo], y_train)
#     geo_train = knn.predict_proba(X_train[geo])
#     geo_test = knn.predict_proba(X_test[geo])

#     X_train['knn_func'], X_train['knn_non_func'], X_train['knn_repair'] = geo_train[:,0], geo_train[:,1], geo_train[:,2]
#     X_test['knn_func'], X_test['knn_non_func'], X_test['knn_repair'] = geo_test[:,0], geo_test[:,1], geo_test[:,2]

# encoding categoricals
cat_encoder = ce.OneHotEncoder(use_cat_names=True)
cat_encoder.fit(X_train, y_train)
X_train_clean = cat_encoder.transform(X_train)
X_test_clean = cat_encoder.transform(X_test)

In [181]:
# encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_clean = le.fit_transform(y_train.values.ravel())

X_train_clean_train, X_train_clean_val, y_train_clean_train, y_train_clean_val = train_test_split(X_train_clean,
                                                                                                 y_train_clean,
                                                                                                 test_size=0.15,
                                                                                                 stratify=y_train_clean,
                                                                                                 shuffle=True)

dtrain = xgb.DMatrix(X_train_clean_train, label=y_train_clean_train)
dval = xgb.DMatrix(X_train_clean_val, label=y_train_clean_val)
dtest = xgb.DMatrix(X_test_clean)

## XGBoost Time

In [182]:
# load and wrangle
df_raw = pd.read_csv('train_features.csv', index_col=0)
df_test_raw = pd.read_csv('test_features.csv', index_col=0)
X_train, X_test = wrangle(df_raw, df_raw), wrangle(df_test_raw, df_raw)
y_train = pd.read_csv('train_labels.csv', index_col=0)

# encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_clean = le.fit_transform(y_train.values.ravel())

# encoding categoricals
cat_encoder = ce.OneHotEncoder(use_cat_names=True)
cat_encoder.fit(X_train, y_train)
X_train_clean = cat_encoder.transform(X_train)
X_test_clean = cat_encoder.transform(X_test)

X_train_clean_train, X_train_clean_val, y_train_clean_train, y_train_clean_val = train_test_split(X_train_clean,
                                                                                                 y_train_clean,
                                                                                                 test_size=0.10,
                                                                                                 stratify=y_train_clean,
                                                                                                 shuffle=True,
                                                                                                 random_state=420)

dtrain = xgb.DMatrix(X_train_clean_train, label=y_train_clean_train)
dval = xgb.DMatrix(X_train_clean_val, label=y_train_clean_val)
dtest = xgb.DMatrix(X_test_clean)

In [189]:
# v17 run with these params, and WITHOUT season

# Stopping. Best iteration:
# [107]	train-merror:0.109428	validation-merror:0.180471

# param = {'num_class' : 3,
#          'scale_pos_weight' : 1,
#          'max_depth': 45,
#          'eta': 0.1,
#          'n_thread' : 4,
#          'colsample_bytree' : 0.3,
#          'subsample' : 0.3,
#          'silent': 1, 
#          'n_estimators' : 100,
#          'reg_alpha' : 0.4,
#          'gamma' : 1,
#          'objective': 'multi:softprob',
#         'eval_metric' : 'merror'} 

# best params, v19
# [148]	train-merror:0.082417	validation-merror:0.18165
# Stopping. Best iteration:
# [98]	train-merror:0.100224	validation-merror:0.178956
# param = {'num_class' : 3,
#          'scale_pos_weight' : 1,
#          'max_depth': 70, # very deep tree
#          'eta': 0.1,
#          'n_thread' : 4,
#          'colsample_bytree' : 0.4,
#          'subsample' : 0.3,
#          'silent': 1, 
#          'n_estimators' : 1000, # lots of estimators
#          'reg_alpha' : 0.3,
#          'gamma' : 1,
#          'objective': 'multi:softprob',
#         'eval_metric' : 'merror'} 

# setting parameters 
param = {'num_class' : 3,
         'scale_pos_weight' : 1,
         'max_depth': 80, # very deep tree
         'eta': 0.1,
         'n_thread' : 4,
         'colsample_bytree' : 0.4,
         'subsample' : 0.3,
         'silent': 1, 
         'n_estimators' : 2000, # lots of estimators
         'reg_alpha' : 0.3,
         'gamma' : 1,
         'objective': 'multi:softprob',
        'eval_metric' : 'merror'} 
num_rounds = 150
eval_list = [(dtrain, 'train'), (dval, 'validation')]

bst = xgb.train(param, dtrain, num_rounds, eval_list, early_stopping_rounds=30)
bst.save_model('xgboost_iteration21_v0.model')


[0]	train-merror:0.245211	validation-merror:0.271549
Multiple eval metrics have been passed: 'validation-merror' will be used for early stopping.

Will train until validation-merror hasn't improved in 30 rounds.
[1]	train-merror:0.216162	validation-merror:0.244949
[2]	train-merror:0.200355	validation-merror:0.23064
[3]	train-merror:0.18743	validation-merror:0.219529
[4]	train-merror:0.182211	validation-merror:0.214478
[5]	train-merror:0.173962	validation-merror:0.202357
[6]	train-merror:0.173363	validation-merror:0.204714
[7]	train-merror:0.169921	validation-merror:0.202189
[8]	train-merror:0.168967	validation-merror:0.200505
[9]	train-merror:0.166648	validation-merror:0.198822
[10]	train-merror:0.166049	validation-merror:0.201347
[11]	train-merror:0.164478	validation-merror:0.19899
[12]	train-merror:0.164141	validation-merror:0.198653
[13]	train-merror:0.163412	validation-merror:0.2
[14]	train-merror:0.162514	validation-merror:0.200168
[15]	train-merror:0.160494	validation-merror:0.20

In [143]:
# num_rounds = 100
# bst_v1 = xgb.train(param, dtrain, num_rounds, eval_list, xgb_model='xgboost_iteration18_v0.model')
# bst_v1.save_model('xgboost_iteration18_v1.model')


In [190]:
# predict test data
preds = bst.predict(dtest)
# extracting most confident predictions
best_preds = le.inverse_transform(np.asarray([np.argmax(line) for line in preds]))
# create df to hold submission
df_boost = pd.DataFrame(data=best_preds, index=X_test.index, columns=['status_group'])
df_boost.to_csv('xgboost_iteration21_v0_submission.csv')

In [191]:
# SUBMIT!
%env KAGGLE_CONFIG_DIR=/Users/zach/Kaggle
!kaggle competitions submit -c ds1-predictive-modeling-challenge -f xgboost_iteration21_v0_submission.csv -m 'xgboost classifier final submit'



env: KAGGLE_CONFIG_DIR=/Users/zach/Kaggle
100%|█████████████████████████████████████████| 264k/264k [00:01<00:00, 214kB/s]
Successfully submitted to DS1 Predictive Modeling Challenge

In [187]:
# lets take a look at the classification report on the validation data
train_preds = le.inverse_transform(np.asarray([np.argmax(line) for line in bst.predict(dtrain)]))
val_preds = le.inverse_transform(np.asarray([np.argmax(line) for line in bst.predict(dval)]))
print ('Training accuracy: ', accuracy_score(le.inverse_transform(y_train_clean_train), train_preds))
print ('Validation accuracy: ', accuracy_score(le.inverse_transform(y_train_clean_val), val_preds))
print (classification_report(le.inverse_transform(y_train_clean_val), val_preds))

Training accuracy:  0.9175832398054621
Validation accuracy:  0.8183501683501684
                         precision    recall  f1-score   support

             functional       0.82      0.90      0.85      3226
functional needs repair       0.65      0.34      0.45       432
         non functional       0.84      0.80      0.82      2282

              micro avg       0.82      0.82      0.82      5940
              macro avg       0.77      0.68      0.71      5940
           weighted avg       0.81      0.82      0.81      5940

