In [None]:
import numpy as np
import pandas as pd

import random

from sklearn import metrics, model_selection, linear_model, ensemble, tree, neighbors

from xgboost import XGBClassifier

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))
        
from IPython.core.interactiveshell import InteractiveShell  
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def train_and_report(model, data_train, labels = None):
    """Fit ``model`` on a train split and print a hold-out classification report.

    The 'id' column is removed from ``data_train`` and the remaining columns are
    split with ``train_test_split`` (fixed ``random_state = 42``). The model is
    fitted on the training part only and evaluated on the held-out part.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``.
    data_train : DataFrame of features; must contain an 'id' column.
    labels : labels aligned row-by-row with ``data_train``. When omitted, the
        notebook-level ``target`` variable is used (the original behaviour).

    Returns
    -------
    The same ``model`` object, fitted on the training part of the split.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook global.
        labels = target

    features = data_train.drop('id', axis = 1)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features, labels, random_state = 42
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Label the report so outputs of several models can be told apart.
    print(type(model).__name__)
    print(metrics.classification_report(y_test, y_pred))

    return model

###    

def predict_and_submit(model, data_test, path = 'submission.csv'):
    """Predict the pump status for ``data_test`` and write a submission CSV.

    Side effect: two columns are added to (or overwritten in) ``data_test``:
    'status_id' with the raw model predictions and 'status_group' with the
    corresponding text labels.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    data_test : DataFrame with an 'id' column plus the feature columns.
    path : destination of the CSV file holding the 'id' and 'status_group'
        columns. Defaults to 'submission.csv'.
    """
    # Columns that are never model inputs: the row identifier and the two
    # prediction columns this function adds, which are already present when
    # the function is called a second time on the same frame.
    non_features = ['id', 'status_id', 'status_group']
    features = data_test.drop(columns = non_features, errors = 'ignore')

    data_test['status_id'] = model.predict(features)

    # Any id other than 1 or 2 maps to the third class, mirroring the encoding
    # applied to the training labels.
    id_to_label = {1: 'functional', 2: 'non functional'}
    data_test['status_group'] = data_test['status_id'].map(
        lambda x: id_to_label.get(x, 'functional needs repair')
    )

    data_test[['id','status_group']].to_csv(path, index = False)

In [None]:
# Load the training features, the test features and the training labels.
train, test, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/pump-it-up-data-mining-the-water-table/train.csv',
        '/kaggle/input/pump-it-up-data-mining-the-water-table/test.csv',
        '/kaggle/input/pump-it-up-data-mining-the-water-table/train-labels.csv',
    )
)

In [None]:
# Attach the labels to the training rows by 'id', encode them as integers in a
# new 'status_id' column (anything that is not one of the two listed labels
# becomes 3), then drop the textual 'status_group' column.
status_codes = {'functional': 1, 'non functional': 2}

train = (
    pd.merge(train, train_labels, on = 'id')
      .assign(status_id = lambda df: df['status_group'].map(lambda x: status_codes.get(x, 3)))
      .drop(columns = 'status_group')
)

In [None]:
# Quick look at both frames: shape first, then the first rows.
print('train data', train.shape, sep = '\n')
train.head()
print('######################', 'test data', test.shape, sep = '\n')
test.head()

In [None]:
# Keep the labels aside, then stack the training features on top of the test
# features (training rows first) under a fresh 0..n-1 index.
target = train['status_id']

data = pd.concat([train.drop(columns = 'status_id'), test], ignore_index = True)

data.shape

In [None]:
# Columns removed from the combined frame before encoding.
cols_to_drop = [
    'funder',
    'date_recorded',
    'installer',
    'wpt_name',
    'subvillage',
    'region',
    'scheme_name',
]

In [None]:
# Remove the columns listed in cols_to_drop.
data = data.drop(columns = cols_to_drop)

In [None]:
# Results are assigned back to the column instead of calling
# fillna(..., inplace = True) on a column selection: that form is chained
# assignment, which pandas deprecates and which leaves `data` unchanged when
# Copy-on-Write is enabled.

# Missing values count as False; the flag is stored as 0/1.
# (NaN == True is False, so no separate fillna step is needed.)
data['public_meeting'] = data['public_meeting'].eq(True).astype(int)

# Missing values count as False; the column is kept as a boolean flag.
data['permit'] = data['permit'].eq(True)

# Missing managers get their own explicit 'None' category.
data['scheme_management'] = data['scheme_management'].fillna('None')

In [None]:
# Column names grouped by dtype: float64/int64 columns and object columns.
numeric = [col for col, dtype in data.dtypes.items() if dtype in ('float64', 'int64')]
text = [col for col, dtype in data.dtypes.items() if dtype == 'object']

In [None]:
# Names of the object-dtype columns.
data.loc[:, text].columns

In [None]:
# One-hot encode exactly the columns listed in `text`. Naming them through
# `columns=` keeps the encoded columns and their prefixes (the column names)
# in step, instead of pairing a prefix list built from `text` with whatever
# columns pandas selects implicitly.
data = pd.get_dummies(data, columns = text)

In [None]:
# The first len(target) rows of `data` are the training rows, the rest are the
# test rows. Explicit copies make both frames independent of `data`, so columns
# added to them later (e.g. predictions) cannot trigger SettingWithCopyWarning.
df_train = data.iloc[:len(target), :].copy()

df_test = data.iloc[len(target):, :].copy()

df_train.shape, target.shape, df_test.shape

In [None]:
# Baseline comparison of several classifiers with default hyper-parameters.
# The stochastic estimators get a fixed seed so the reported scores are
# reproducible from one run to the next.
classifiers = [neighbors.KNeighborsClassifier(),
               tree.DecisionTreeClassifier(random_state = 42),
               ensemble.RandomForestClassifier(random_state = 42),
               ensemble.GradientBoostingClassifier(random_state = 42),
               XGBClassifier()]

for model in classifiers:

    train_and_report(model, df_train)

In [None]:
# The fitted model gets its own name: assigning it to `tree` would replace the
# imported sklearn `tree` module, so any later `tree.DecisionTreeClassifier()`
# call (for example when re-running the model comparison cell) would fail.
# A fixed seed keeps the result reproducible.
decision_tree = train_and_report(tree.DecisionTreeClassifier(random_state = 42), df_train)

In [None]:
# `verbosity` is XGBoost's logging-level parameter; `verbose` is not an
# XGBClassifier constructor argument.
# NOTE(review): recent XGBoost releases expect class labels 0..n_classes-1,
# while status_id uses 1..3 - confirm against the installed version.
xgb = train_and_report(XGBClassifier(n_jobs = -1, verbosity = 2), df_train)

In [None]:
# Fixed seed so the forest, and therefore its report, is reproducible.
rf = train_and_report(ensemble.RandomForestClassifier(random_state = 42), df_train)

In [None]:
# Generate the XGBoost predictions first: predict_and_submit adds the
# 'status_id' and 'status_group' columns to df_test that the export below
# reads. Without this call the cell raises KeyError on a fresh kernel.
predict_and_submit(xgb, df_test)

df_test[['id','status_group']].to_csv("submission_xgb.csv", index = False)