In [1]:
# Import packages for data handling, plotting, preprocessing, model fitting and evaluation
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_roc_curve, plot_confusion_matrix
from sklearn.metrics import log_loss, classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from collections import Counter
from xgboost import XGBClassifier
import warnings
# NOTE(review): with no category or module argument this silences every warning
# for the rest of the session; consider narrowing the filter to the specific
# warnings that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset from CSV, using the file's first column as the row index.
pipeline_df = pd.read_csv(
    "data/pipeline_df.csv",
    index_col=0,
)

In [5]:
# Build the modelling inputs and the train/test split.

# Target: the numeric status label.
y = pipeline_df['status_group_num']

# Features: every column except the ones listed here. Judging by their names,
# these are the text-valued originals of the *_num encodings, the target
# columns and the row id -- TODO confirm against the cleaning step.
# NOTE(review): 'qualtiy_group_num' is reproduced exactly as written;
# presumably it matches a misspelled column in the CSV -- verify before renaming.
X = pipeline_df.drop(
    columns=[
        'funder',
        'installer',
        'extraction_type_class',
        'basin',
        'subvillage',
        'region',
        'lga',
        'ward',
        'management',
        'payment_type',
        'quality_group',
        'quantity',
        'source_type',
        'waterpoint_type',
        'status_group',
        'status_group_num',
        'qualtiy_group_num',
        'permit',
        'id',
    ]
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [7]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,population,construction_year,payment_num,management_num,basin_num,quantity_num,source_type_num,waterpoint_type_num,region_num,extraction_num,well_age
0,6000.0,1390,34.938093,-9.856322,109,1999,5,0,6,0,0,0,0,0,23
1,0.0,1399,34.698766,-2.147466,280,2010,0,1,0,1,4,0,14,0,12
2,25.0,686,37.460664,-3.821329,250,2009,1,0,1,0,5,3,18,0,13
3,0.0,263,38.486161,-11.155298,58,1986,0,0,7,2,2,3,17,3,36
4,0.0,0,31.130847,-1.825359,0,2000,0,7,0,3,4,0,6,0,22


In [8]:
# Summarise the feature matrix: column dtypes and non-null counts.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54532 entries, 0 to 59399
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   amount_tsh           54532 non-null  float64
 1   gps_height           54532 non-null  int64  
 2   longitude            54532 non-null  float64
 3   latitude             54532 non-null  float64
 4   population           54532 non-null  int64  
 5   construction_year    54532 non-null  int64  
 6   payment_num          54532 non-null  int64  
 7   management_num       54532 non-null  int64  
 8   basin_num            54532 non-null  int64  
 9   quantity_num         54532 non-null  int64  
 10  source_type_num      54532 non-null  int64  
 11  waterpoint_type_num  54532 non-null  int64  
 12  region_num           54532 non-null  int64  
 13  extraction_num       54532 non-null  int64  
 14  well_age             54532 non-null  int64  
dtypes: float64(3), int64(12)
memory usag