In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.options.display.max_columns = None

In [2]:
train = pd.read_csv('pump.csv')
test = pd.read_csv('test.csv')
labels = pd.read_csv('pump_labels.csv')

#train['status'] = labels.status_group
train['date_recorded'] = pd.to_datetime(train['date_recorded']) # converting date to datetime object
test['date_recorded'] = pd.to_datetime(test['date_recorded'])

train['year_recorded'] = pd.DatetimeIndex(train['date_recorded']).year
train['month_recorded'] = pd.DatetimeIndex(train['date_recorded']).month

test['year_recorded'] = pd.DatetimeIndex(test['date_recorded']).year
test['month_recorded'] = pd.DatetimeIndex(test['date_recorded']).month

dropped_cols = ['amount_tsh', 'date_recorded', 'wpt_name', 'scheme_name', 'region_code', 'recorded_by', 
                    'management_group', 'payment_type', 'quality_group','quantity_group', 'source_type',
                   'waterpoint_type_group', 'subvillage', 'num_private', 'region']

train = train.drop(dropped_cols, axis = 1)
test = test.drop(dropped_cols, axis = 1)

In [3]:
train.nunique()

id                       59400
funder                    1897
gps_height                2428
installer                 2145
longitude                57516
latitude                 57517
basin                        9
district_code               20
lga                        125
ward                      2092
population                1049
public_meeting               2
scheme_management           12
permit                       2
construction_year           55
extraction_type             18
extraction_type_group       13
extraction_type_class        7
management                  12
payment                      7
water_quality                8
quantity                     5
source                      10
source_class                 3
waterpoint_type              7
year_recorded                5
month_recorded              12
dtype: int64

In [4]:
train.isna().sum()

id                          0
funder                   3635
gps_height                  0
installer                3655
longitude                   0
latitude                    0
basin                       0
district_code               0
lga                         0
ward                        0
population                  0
public_meeting           3334
scheme_management        3877
permit                   3056
construction_year           0
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
payment                     0
water_quality               0
quantity                    0
source                      0
source_class                0
waterpoint_type             0
year_recorded               0
month_recorded              0
dtype: int64

Now we can see the number of missing values for the relevant numerical data. Unsurprisingly it looks like longitude and latitude have the same number of missing points. gps_height,

In [5]:
train.groupby('district_code')['gps_height'].mean()

district_code
0       0.000000
1     678.767188
2     853.418688
3     642.588918
4     860.858762
5     660.115243
6     307.589347
7     727.915345
8      30.663471
13     71.455243
23    135.327645
30    482.310553
33    374.255149
43    166.708911
53    140.238926
60    -18.444444
62     23.926606
63    249.276923
67    -25.333333
80      0.000000
Name: gps_height, dtype: float64

In [6]:
train.groupby('basin')['gps_height'].mean()

basin
Internal                    885.677328
Lake Nyasa                  796.058407
Lake Rukwa                  583.585575
Lake Tanganyika             715.002954
Lake Victoria               328.424961
Pangani                    1034.890045
Rufiji                      880.665873
Ruvuma / Southern Coast     410.640329
Wami / Ruvu                 206.458827
Name: gps_height, dtype: float64

It looks like some of the districts have all of the gps_height data missing. We will impute based on the one of the location columns. district_code looks like it is the smallest we can go without having too many categories which have very few datapoints.

For population, we'll probably do a similar method.

In [7]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [8]:
train.loc[train.longitude <= 5, 'longitude'] = np.nan # using 5 for safety
train.loc[train.latitude >= -0.5, 'latitude'] = np.nan # use -0.5 since some of the values aren't 0 but very close to 0
train.loc[train.gps_height <= 0, 'gps_height'] = np.nan
train.loc[train.population <= 0, 'population'] = np.nan
train.loc[train.construction_year <= 1900, 'construction_year'] = np.nan

test.loc[test.longitude <= 5, 'longitude'] = np.nan # using 5 for safety
test.loc[test.latitude >= -0.5, 'latitude'] = np.nan # use -0.5 since some of the values aren't 0 but very close to 0
test.loc[test.gps_height <= 0, 'gps_height'] = np.nan
test.loc[test.population <= 0, 'population'] = np.nan
test.loc[test.construction_year <= 1900, 'construction_year'] = np.nan

Since gps_height, population, latitude, longitude are dependent on the location, we will replace these values with the mean with respect to the ward and then district_code if values are still missing.

In [9]:
train.isna().sum().sort_values(ascending = False)

gps_height               21934
population               21381
construction_year        20709
scheme_management         3877
installer                 3655
funder                    3635
public_meeting            3334
permit                    3056
longitude                 1812
latitude                  1812
basin                        0
district_code                0
lga                          0
ward                         0
month_recorded               0
year_recorded                0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
payment                      0
water_quality                0
quantity                     0
source                       0
source_class                 0
waterpoint_type              0
id                           0
dtype: int64

In [10]:
# start with lga to fill in missing values. we will fill remaining with district and then basin

train['gps_height'] = train.groupby('lga')['gps_height'].apply(lambda x: x.fillna(x.mean()))
train['population'] = train.groupby('lga')['population'].apply(lambda x: x.fillna(x.mean()))
train['latitude'] = train.groupby('lga')['latitude'].apply(lambda x: x.fillna(x.mean()))
train['longitude'] = train.groupby('lga')['longitude'].apply(lambda x: x.fillna(x.mean()))

test['gps_height'] = test.groupby('lga')['gps_height'].apply(lambda x: x.fillna(x.mean()))
test['population'] = test.groupby('lga')['population'].apply(lambda x: x.fillna(x.mean()))
test['latitude'] = test.groupby('lga')['latitude'].apply(lambda x: x.fillna(x.mean()))
test['longitude'] = test.groupby('lga')['longitude'].apply(lambda x: x.fillna(x.mean()))

train.isna().sum().sort_values(ascending = False)

construction_year        20709
population               19689
gps_height               18988
scheme_management         3877
installer                 3655
funder                    3635
public_meeting            3334
permit                    3056
longitude                  488
latitude                   488
basin                        0
district_code                0
lga                          0
ward                         0
month_recorded               0
year_recorded                0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
payment                      0
water_quality                0
quantity                     0
source                       0
source_class                 0
waterpoint_type              0
id                           0
dtype: int64

In [11]:
train['gps_height'] = train.groupby('district_code')['gps_height'].apply(lambda x: x.fillna(x.mean()))
train['population'] = train.groupby('district_code')['population'].apply(lambda x: x.fillna(x.mean()))
train['latitude'] = train.groupby('district_code')['latitude'].apply(lambda x: x.fillna(x.mean()))
train['longitude'] = train.groupby('district_code')['longitude'].apply(lambda x: x.fillna(x.mean()))

test['gps_height'] = test.groupby('district_code')['gps_height'].apply(lambda x: x.fillna(x.mean()))
test['population'] = test.groupby('district_code')['population'].apply(lambda x: x.fillna(x.mean()))
test['latitude'] = test.groupby('district_code')['latitude'].apply(lambda x: x.fillna(x.mean()))
test['longitude'] = test.groupby('district_code')['longitude'].apply(lambda x: x.fillna(x.mean()))

train.isna().sum().sort_values(ascending = False)

construction_year        20709
scheme_management         3877
installer                 3655
funder                    3635
public_meeting            3334
permit                    3056
gps_height                  35
population                  35
longitude                    0
latitude                     0
basin                        0
district_code                0
lga                          0
ward                         0
month_recorded               0
year_recorded                0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
payment                      0
water_quality                0
quantity                     0
source                       0
source_class                 0
waterpoint_type              0
id                           0
dtype: int64

It looks like we still have some missing values for gps_height and population. We will replace those with mean in each of the basins which should fill in the rest of the data.

In [12]:
train['gps_height'] = train.groupby('basin')['gps_height'].apply(lambda x: x.fillna(x.mean()))
train['population'] = train.groupby('basin')['population'].apply(lambda x: x.fillna(x.mean()))

test['gps_height'] = test.groupby('basin')['gps_height'].apply(lambda x: x.fillna(x.mean()))
test['population'] = test.groupby('basin')['population'].apply(lambda x: x.fillna(x.mean()))
train.isna().sum().sort_values(ascending = False)

construction_year        20709
scheme_management         3877
installer                 3655
funder                    3635
public_meeting            3334
permit                    3056
gps_height                   0
longitude                    0
latitude                     0
basin                        0
district_code                0
lga                          0
ward                         0
population                   0
month_recorded               0
year_recorded                0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
payment                      0
water_quality                0
quantity                     0
source                       0
source_class                 0
waterpoint_type              0
id                           0
dtype: int64

So we have no more missing values for lat, long, gps_height, and population. For the categorical features, we will leave missing features with a missing label. We also need to impute construction_year. Since it doesn't look like there is a useful feature that will help us predict construction year, we'll just replace missing values with its overall mean. After we impute the values for construction_year, we can create our age feature which takes the difference of the year constructed and the year the report was recorded.

In [13]:
train['construction_year'] = train[['construction_year']].fillna(train['construction_year'].mean())
test['construction_year'] = test[['construction_year']].fillna(train['construction_year'].mean())
train = train.fillna('missing')
test = test.fillna('missing')

train['age'] = train['year_recorded'] - train['construction_year']
test['age'] = test['year_recorded'] - test['construction_year']

Now we need to deal with the categorical features with a large amount of unique values. So we need to deal with funder, installer, ward, and maybe lga. We will probably do this by putting the values with very few number of values into an other category. We can experiment with the cutoff in the future.

Note: 59400 rows in training set. 14850 rows in test set. So the same relative threshold on the test set would be $\frac{100}{59400}\cdot 14850 = 25$. So we should also check for categories in the test set with a count of > 25.

In [14]:
for col in ['funder', 'installer', 'ward']:
    value_counts_train = train[col].value_counts()
    value_counts_test = test[col].value_counts()
    train_categories = list(value_counts_train.keys())
    test_categories = list(value_counts_test.keys())
    other_categories =[]
    for j in range(len(train_categories)): # loops through categories in test set
        if value_counts_train[j] < 100:
            other_categories = train_categories[j:]
            break
    for j in range(len(test_categories)): # accounts for values that might be in the test set but not in the training set
        if value_counts_test[j] < 25:
            missing_categories = test_categories[j:]
            break
    
    train[col] = train[col].replace(other_categories, 'other')
    test[col] = test[col].replace(other_categories, 'other')
    test[col] = test[col].replace(missing_categories, 'other')

Now we have thrown some of the categories with a small number of datapoints into an 'other' category. Now we will combine the dataset for encoding then we will split it back up again.

In [15]:
train.columns

Index(['id', 'funder', 'gps_height', 'installer', 'longitude', 'latitude',
       'basin', 'district_code', 'lga', 'ward', 'population', 'public_meeting',
       'scheme_management', 'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'payment', 'water_quality', 'quantity', 'source', 'source_class',
       'waterpoint_type', 'year_recorded', 'month_recorded', 'age'],
      dtype='object')

In [16]:
train['group'] = 'train'
test['group'] = 'test'

num_cols = ['longitude', 'latitude', 'population', 'construction_year', 'year_recorded', 'month_recorded', 'age']
standardscaler = StandardScaler()
train[num_cols] = standardscaler.fit_transform(train[num_cols])
test[num_cols] = standardscaler.transform(test[num_cols])

In [17]:
df = pd.concat([train, test])
df['group'].value_counts()

train    59400
test     14850
Name: group, dtype: int64

In [18]:
df = df.drop(['year_recorded', 'construction_year'], axis = 1) # now that we're done using lga for imputation, we will drop it
df = pd.get_dummies(df, drop_first = True)
df = pd.get_dummies(df, prefix = ['district_code'], columns = ['district_code'])

In [19]:
df.head()

Unnamed: 0,id,gps_height,longitude,latitude,population,month_recorded,age,funder_Adb,funder_Adra,funder_African,funder_Amref,funder_Anglican Church,funder_Bsf,funder_Ces (gmbh),funder_Ces(gmbh),funder_Co,funder_Community,funder_Concern,funder_Concern World Wide,funder_Danida,funder_Ded,funder_Dfid,funder_Dh,funder_Dhv,funder_District Council,funder_Dmdd,funder_Dwe,funder_Dwsp,funder_Fini Water,funder_Finw,funder_Fw,funder_Germany,funder_Germany Republi,funder_Go,funder_Government Of Tanzania,funder_Halmashauri Ya Wilaya Sikonge,funder_He,funder_Hesawa,funder_Hifab,funder_Hsw,funder_Ir,funder_Is,funder_Isf,funder_Jaica,funder_Jica,funder_Ki,funder_Kiliwater,funder_Kkkt,funder_Kkkt_makwale,funder_Lamp,funder_Lawatefuka Water Supply,funder_Lga,funder_Lvia,funder_Magadini-makiwaru Water,funder_Ministry Of Water,funder_Mission,funder_Mkinga Distric Coun,funder_Muwsa,funder_Nethalan,funder_Netherlands,funder_No,funder_Norad,funder_Oikos E.Afrika,funder_Oxfam,funder_Oxfarm,funder_Plan Int,funder_Private,funder_Private Individual,funder_Rc,funder_Rc Church,funder_Roman,funder_Roman Catholic,funder_Ru,funder_Rudep,funder_Rural Water Supply And Sanitat,funder_Rwssp,funder_Shipo,funder_Snv,funder_Swedish,funder_Tardo,funder_Tasaf,funder_Tassaf,funder_Tcrs,funder_Unhcr,funder_Unice,funder_Unicef,funder_Village Council,funder_Villagers,funder_W.B,funder_Wananchi,funder_Water,funder_Wateraid,funder_World Bank,funder_World Vision,funder_Wsdp,funder_Wua,funder_Wvt,funder_missing,funder_other,installer_ACRA,installer_AMREF,installer_Amref,installer_Artisan,installer_CES,installer_Centr,installer_Central Government,installer_Central government,installer_Central govt,installer_Commu,installer_Community,installer_Consulting Engineer,installer_DANID,installer_DANIDA,installer_DDCA,installer_DED,installer_DH,installer_DW,installer_DWE,installer_DWSP,installer_Da,installer_Distri,installer_District Council,installer_District council,installer_Dmdd,installer_FINI WATER,installer_FW,installer_FinW,installer_Fini Water,installer_Fini water,installer_GOVER,installer_Gove,installer_Gover,installer_Government,installer_HE,installer_HESAWA,installer_HSW,installer_Halmashauri ya wilaya sikonge,installer_Handeni Trunk Main(,installer_Hesawa,installer_ISF,installer_Idara ya maji,installer_Ir,installer_Is,installer_JAICA,installer_JICA,installer_KKKT,installer_KKKT _ Konde and DWE,installer_Kiliwater,installer_Kuwait,installer_LGA,installer_Lawatefuka water sup,installer_MUWSA,installer_MWE,installer_Magadini-Makiwaru wa,installer_Mission,installer_NORAD,installer_Norad,installer_OXFAM,installer_Private,installer_RC,installer_RC CHURCH,installer_RWE,installer_RWSSP,installer_SEMA,installer_SHIPO,installer_Sengerema Water Department,installer_TASAF,installer_TCRS,installer_TWE,installer_TWESA,installer_Tardo,installer_UNICEF,installer_Villagers,installer_WATER AID,installer_WEDECO,installer_WU,installer_WVT,installer_Wizara ya maji,installer_World,installer_World Vision,installer_World vision,installer_missing,installer_other,installer_wananchi,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,lga_Arusha Urban,lga_Babati,lga_Bagamoyo,lga_Bahi,lga_Bariadi,lga_Biharamulo,lga_Bukoba Rural,lga_Bukoba Urban,lga_Bukombe,lga_Bunda,lga_Chamwino,lga_Chato,lga_Chunya,lga_Dodoma Urban,lga_Geita,lga_Hai,lga_Hanang,lga_Handeni,lga_Igunga,lga_Ilala,lga_Ileje,lga_Ilemela,lga_Iramba,lga_Iringa Rural,lga_Kahama,lga_Karagwe,lga_Karatu,lga_Kasulu,lga_Kibaha,lga_Kibondo,lga_Kigoma Rural,lga_Kigoma Urban,lga_Kilindi,lga_Kilolo,lga_Kilombero,lga_Kilosa,lga_Kilwa,lga_Kinondoni,lga_Kisarawe,lga_Kishapu,lga_Kiteto,lga_Kondoa,lga_Kongwa,lga_Korogwe,lga_Kwimba,lga_Kyela,lga_Lindi Rural,lga_Lindi Urban,lga_Liwale,lga_Longido,lga_Ludewa,lga_Lushoto,lga_Mafia,lga_Magu,lga_Makete,lga_Manyoni,lga_Masasi,lga_Maswa,lga_Mbarali,lga_Mbeya Rural,lga_Mbinga,lga_Mbozi,lga_Mbulu,lga_Meatu,lga_Meru,lga_Misenyi,lga_Missungwi,lga_Mkinga,lga_Mkuranga,lga_Monduli,lga_Morogoro Rural,lga_Morogoro Urban,lga_Moshi Rural,lga_Moshi Urban,lga_Mpanda,lga_Mpwapwa,lga_Mtwara Rural,lga_Mtwara Urban,lga_Mufindi,lga_Muheza,lga_Muleba,lga_Musoma Rural,lga_Mvomero,lga_Mwanga,lga_Nachingwea,lga_Namtumbo,lga_Nanyumbu,lga_Newala,lga_Ngara,lga_Ngorongoro,lga_Njombe,lga_Nkasi,lga_Nyamagana,lga_Nzega,lga_Pangani,lga_Rombo,lga_Rorya,lga_Ruangwa,lga_Rufiji,lga_Rungwe,lga_Same,lga_Sengerema,lga_Serengeti,lga_Shinyanga Rural,lga_Shinyanga Urban,lga_Siha,lga_Sikonge,lga_Simanjiro,lga_Singida Rural,lga_Singida Urban,lga_Songea Rural,lga_Songea Urban,lga_Sumbawanga Rural,lga_Sumbawanga Urban,lga_Tabora Urban,lga_Tandahimba,lga_Tanga,lga_Tarime,lga_Temeke,lga_Tunduru,lga_Ukerewe,lga_Ulanga,lga_Urambo,lga_Uyui,ward_Chalinze,ward_Chanika,ward_Chinamili,ward_Diongoya,ward_Hedaru,ward_Ifakara,ward_Igongolo,ward_Igosi,ward_Ihanda,ward_Imalinyi,ward_Isongole,ward_Itete,ward_Kagongo,ward_Kanga,ward_Kidatu,ward_Kikatiti,ward_Kimochi,ward_Kiranyi,ward_Kitunda,ward_Lupalilo,ward_Mabwerebwere,ward_Magomeni,ward_Mahembe,ward_Mahongole,ward_Maji ya Chai,ward_Makuyuni,ward_Makwale,ward_Malindi,ward_Mamire,ward_Maposeni,ward_Maramba,ward_Masama Magharibi,ward_Masama Mashariki,ward_Matola,ward_Mdandu,ward_Mishamo,ward_Mkongo,ward_Mlangali,ward_Msindo,ward_Mtwango,ward_Mvomero,ward_Nduruma,ward_Ngarenanyuki,ward_Nkoma,ward_Nkungulu,ward_Olkokola,ward_Rujewa,ward_Siha Kati,ward_Siha Mashariki,ward_Simbo,ward_Soga,ward_Tinde,ward_Usuka,ward_Vikindu,ward_Wanging'ombe,ward_Yombo,ward_Zinga/Ikerege,ward_other,public_meeting_True,public_meeting_missing,scheme_management_None,scheme_management_Other,scheme_management_Parastatal,scheme_management_Private operator,scheme_management_SWC,scheme_management_Trust,scheme_management_VWC,scheme_management_WUA,scheme_management_WUG,scheme_management_Water Board,scheme_management_Water authority,scheme_management_missing,permit_True,permit_missing,extraction_type_cemo,extraction_type_climax,extraction_type_gravity,extraction_type_india mark ii,extraction_type_india mark iii,extraction_type_ksb,extraction_type_mono,extraction_type_nira/tanira,extraction_type_other,extraction_type_other - mkulima/shinyanga,extraction_type_other - play pump,extraction_type_other - rope pump,extraction_type_other - swn 81,extraction_type_submersible,extraction_type_swn 80,extraction_type_walimi,extraction_type_windmill,extraction_type_group_gravity,extraction_type_group_india mark ii,extraction_type_group_india mark iii,extraction_type_group_mono,extraction_type_group_nira/tanira,extraction_type_group_other,extraction_type_group_other handpump,extraction_type_group_other motorpump,extraction_type_group_rope pump,extraction_type_group_submersible,extraction_type_group_swn 80,extraction_type_group_wind-powered,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered,management_other,management_other - school,management_parastatal,management_private operator,management_trust,management_unknown,management_vwc,management_water authority,management_water board,management_wua,management_wug,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,water_quality_fluoride,water_quality_fluoride abandoned,water_quality_milky,water_quality_salty,water_quality_salty abandoned,water_quality_soft,water_quality_unknown,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,source_hand dtw,source_lake,source_machine dbh,source_other,source_rainwater harvesting,source_river,source_shallow well,source_spring,source_unknown,source_class_surface,source_class_unknown,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,group_train,district_code_0,district_code_1,district_code_2,district_code_3,district_code_4,district_code_5,district_code_6,district_code_7,district_code_8,district_code_13,district_code_23,district_code_30,district_code_33,district_code_43,district_code_53,district_code_60,district_code_62,district_code_63,district_code_67,district_code_80
0,69572,1390.0,-0.071638,-1.441303,-0.402478,-0.454123,-0.307721,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,8776,1399.0,-0.164618,1.309482,-0.029852,-0.454123,-1.199099,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,34310,686.0,0.908406,0.71219,-0.095225,-0.784241,-1.100057,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,67743,263.0,1.306821,-1.904822,-0.513612,-1.114359,1.177908,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,19728,1033.532978,-1.55079,1.424421,-0.036666,0.866348,-0.091284,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
train_processed = df[df['group_train'] == 1]
test_processed = df[df['group_train'] == 0]
train_processed.drop(['group_train'], axis = 1)
test_processed.drop(['group_train'], axis = 1)

Unnamed: 0,id,gps_height,longitude,latitude,population,month_recorded,age,funder_Adb,funder_Adra,funder_African,funder_Amref,funder_Anglican Church,funder_Bsf,funder_Ces (gmbh),funder_Ces(gmbh),funder_Co,funder_Community,funder_Concern,funder_Concern World Wide,funder_Danida,funder_Ded,funder_Dfid,funder_Dh,funder_Dhv,funder_District Council,funder_Dmdd,funder_Dwe,funder_Dwsp,funder_Fini Water,funder_Finw,funder_Fw,funder_Germany,funder_Germany Republi,funder_Go,funder_Government Of Tanzania,funder_Halmashauri Ya Wilaya Sikonge,funder_He,funder_Hesawa,funder_Hifab,funder_Hsw,funder_Ir,funder_Is,funder_Isf,funder_Jaica,funder_Jica,funder_Ki,funder_Kiliwater,funder_Kkkt,funder_Kkkt_makwale,funder_Lamp,funder_Lawatefuka Water Supply,funder_Lga,funder_Lvia,funder_Magadini-makiwaru Water,funder_Ministry Of Water,funder_Mission,funder_Mkinga Distric Coun,funder_Muwsa,funder_Nethalan,funder_Netherlands,funder_No,funder_Norad,funder_Oikos E.Afrika,funder_Oxfam,funder_Oxfarm,funder_Plan Int,funder_Private,funder_Private Individual,funder_Rc,funder_Rc Church,funder_Roman,funder_Roman Catholic,funder_Ru,funder_Rudep,funder_Rural Water Supply And Sanitat,funder_Rwssp,funder_Shipo,funder_Snv,funder_Swedish,funder_Tardo,funder_Tasaf,funder_Tassaf,funder_Tcrs,funder_Unhcr,funder_Unice,funder_Unicef,funder_Village Council,funder_Villagers,funder_W.B,funder_Wananchi,funder_Water,funder_Wateraid,funder_World Bank,funder_World Vision,funder_Wsdp,funder_Wua,funder_Wvt,funder_missing,funder_other,installer_ACRA,installer_AMREF,installer_Amref,installer_Artisan,installer_CES,installer_Centr,installer_Central Government,installer_Central government,installer_Central govt,installer_Commu,installer_Community,installer_Consulting Engineer,installer_DANID,installer_DANIDA,installer_DDCA,installer_DED,installer_DH,installer_DW,installer_DWE,installer_DWSP,installer_Da,installer_Distri,installer_District Council,installer_District council,installer_Dmdd,installer_FINI WATER,installer_FW,installer_FinW,installer_Fini Water,installer_Fini water,installer_GOVER,installer_Gove,installer_Gover,installer_Government,installer_HE,installer_HESAWA,installer_HSW,installer_Halmashauri ya wilaya sikonge,installer_Handeni Trunk Main(,installer_Hesawa,installer_ISF,installer_Idara ya maji,installer_Ir,installer_Is,installer_JAICA,installer_JICA,installer_KKKT,installer_KKKT _ Konde and DWE,installer_Kiliwater,installer_Kuwait,installer_LGA,installer_Lawatefuka water sup,installer_MUWSA,installer_MWE,installer_Magadini-Makiwaru wa,installer_Mission,installer_NORAD,installer_Norad,installer_OXFAM,installer_Private,installer_RC,installer_RC CHURCH,installer_RWE,installer_RWSSP,installer_SEMA,installer_SHIPO,installer_Sengerema Water Department,installer_TASAF,installer_TCRS,installer_TWE,installer_TWESA,installer_Tardo,installer_UNICEF,installer_Villagers,installer_WATER AID,installer_WEDECO,installer_WU,installer_WVT,installer_Wizara ya maji,installer_World,installer_World Vision,installer_World vision,installer_missing,installer_other,installer_wananchi,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,lga_Arusha Urban,lga_Babati,lga_Bagamoyo,lga_Bahi,lga_Bariadi,lga_Biharamulo,lga_Bukoba Rural,lga_Bukoba Urban,lga_Bukombe,lga_Bunda,lga_Chamwino,lga_Chato,lga_Chunya,lga_Dodoma Urban,lga_Geita,lga_Hai,lga_Hanang,lga_Handeni,lga_Igunga,lga_Ilala,lga_Ileje,lga_Ilemela,lga_Iramba,lga_Iringa Rural,lga_Kahama,lga_Karagwe,lga_Karatu,lga_Kasulu,lga_Kibaha,lga_Kibondo,lga_Kigoma Rural,lga_Kigoma Urban,lga_Kilindi,lga_Kilolo,lga_Kilombero,lga_Kilosa,lga_Kilwa,lga_Kinondoni,lga_Kisarawe,lga_Kishapu,lga_Kiteto,lga_Kondoa,lga_Kongwa,lga_Korogwe,lga_Kwimba,lga_Kyela,lga_Lindi Rural,lga_Lindi Urban,lga_Liwale,lga_Longido,lga_Ludewa,lga_Lushoto,lga_Mafia,lga_Magu,lga_Makete,lga_Manyoni,lga_Masasi,lga_Maswa,lga_Mbarali,lga_Mbeya Rural,lga_Mbinga,lga_Mbozi,lga_Mbulu,lga_Meatu,lga_Meru,lga_Misenyi,lga_Missungwi,lga_Mkinga,lga_Mkuranga,lga_Monduli,lga_Morogoro Rural,lga_Morogoro Urban,lga_Moshi Rural,lga_Moshi Urban,lga_Mpanda,lga_Mpwapwa,lga_Mtwara Rural,lga_Mtwara Urban,lga_Mufindi,lga_Muheza,lga_Muleba,lga_Musoma Rural,lga_Mvomero,lga_Mwanga,lga_Nachingwea,lga_Namtumbo,lga_Nanyumbu,lga_Newala,lga_Ngara,lga_Ngorongoro,lga_Njombe,lga_Nkasi,lga_Nyamagana,lga_Nzega,lga_Pangani,lga_Rombo,lga_Rorya,lga_Ruangwa,lga_Rufiji,lga_Rungwe,lga_Same,lga_Sengerema,lga_Serengeti,lga_Shinyanga Rural,lga_Shinyanga Urban,lga_Siha,lga_Sikonge,lga_Simanjiro,lga_Singida Rural,lga_Singida Urban,lga_Songea Rural,lga_Songea Urban,lga_Sumbawanga Rural,lga_Sumbawanga Urban,lga_Tabora Urban,lga_Tandahimba,lga_Tanga,lga_Tarime,lga_Temeke,lga_Tunduru,lga_Ukerewe,lga_Ulanga,lga_Urambo,lga_Uyui,ward_Chalinze,ward_Chanika,ward_Chinamili,ward_Diongoya,ward_Hedaru,ward_Ifakara,ward_Igongolo,ward_Igosi,ward_Ihanda,ward_Imalinyi,ward_Isongole,ward_Itete,ward_Kagongo,ward_Kanga,ward_Kidatu,ward_Kikatiti,ward_Kimochi,ward_Kiranyi,ward_Kitunda,ward_Lupalilo,ward_Mabwerebwere,ward_Magomeni,ward_Mahembe,ward_Mahongole,ward_Maji ya Chai,ward_Makuyuni,ward_Makwale,ward_Malindi,ward_Mamire,ward_Maposeni,ward_Maramba,ward_Masama Magharibi,ward_Masama Mashariki,ward_Matola,ward_Mdandu,ward_Mishamo,ward_Mkongo,ward_Mlangali,ward_Msindo,ward_Mtwango,ward_Mvomero,ward_Nduruma,ward_Ngarenanyuki,ward_Nkoma,ward_Nkungulu,ward_Olkokola,ward_Rujewa,ward_Siha Kati,ward_Siha Mashariki,ward_Simbo,ward_Soga,ward_Tinde,ward_Usuka,ward_Vikindu,ward_Wanging'ombe,ward_Yombo,ward_Zinga/Ikerege,ward_other,public_meeting_True,public_meeting_missing,scheme_management_None,scheme_management_Other,scheme_management_Parastatal,scheme_management_Private operator,scheme_management_SWC,scheme_management_Trust,scheme_management_VWC,scheme_management_WUA,scheme_management_WUG,scheme_management_Water Board,scheme_management_Water authority,scheme_management_missing,permit_True,permit_missing,extraction_type_cemo,extraction_type_climax,extraction_type_gravity,extraction_type_india mark ii,extraction_type_india mark iii,extraction_type_ksb,extraction_type_mono,extraction_type_nira/tanira,extraction_type_other,extraction_type_other - mkulima/shinyanga,extraction_type_other - play pump,extraction_type_other - rope pump,extraction_type_other - swn 81,extraction_type_submersible,extraction_type_swn 80,extraction_type_walimi,extraction_type_windmill,extraction_type_group_gravity,extraction_type_group_india mark ii,extraction_type_group_india mark iii,extraction_type_group_mono,extraction_type_group_nira/tanira,extraction_type_group_other,extraction_type_group_other handpump,extraction_type_group_other motorpump,extraction_type_group_rope pump,extraction_type_group_submersible,extraction_type_group_swn 80,extraction_type_group_wind-powered,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered,management_other,management_other - school,management_parastatal,management_private operator,management_trust,management_unknown,management_vwc,management_water authority,management_water board,management_wua,management_wug,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,water_quality_fluoride,water_quality_fluoride abandoned,water_quality_milky,water_quality_salty,water_quality_salty abandoned,water_quality_soft,water_quality_unknown,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,source_hand dtw,source_lake,source_machine dbh,source_other,source_rainwater harvesting,source_river,source_shallow well,source_spring,source_unknown,source_class_surface,source_class_unknown,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,district_code_0,district_code_1,district_code_2,district_code_3,district_code_4,district_code_5,district_code_6,district_code_7,district_code_8,district_code_13,district_code_23,district_code_30,district_code_33,district_code_43,district_code_53,district_code_60,district_code_62,district_code_63,district_code_67,district_code_80
0,50785,1996.000000,0.065392,0.627132,0.059491,-0.784241,-1.397183,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,51630,1569.000000,0.596062,0.894930,0.013730,-0.784241,-0.208680,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,17168,1567.000000,-0.137773,0.290049,0.449550,-0.784241,-1.199099,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,45559,267.000000,1.140495,-1.285134,-0.095225,-1.114359,1.078866,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,49871,1260.000000,-0.045207,-1.831712,-0.509254,-0.454123,-0.208680,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14845,39307,34.000000,1.449213,-0.273212,-0.596418,-0.784241,0.781740,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14846,18990,1364.142529,0.904897,0.166555,5.810139,-0.454123,0.187488,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
14847,28749,1476.000000,-0.148674,0.439477,-0.204180,-0.454123,-1.199099,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14848,33492,998.000000,0.120534,-1.701020,-0.313135,-0.784241,-1.100057,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
train_processed.shape

(59400, 491)

In [22]:
test_processed.shape

(14850, 491)

It looks like we've successfully encoded our data.

In [23]:
train_processed['status'] = labels.status_group
train_processed = train_processed.drop('id', axis = 1)
test_processed = test_processed.drop('id', axis = 1)
train_processed.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_processed['status'] = labels.status_group


Unnamed: 0,gps_height,longitude,latitude,population,month_recorded,age,funder_Adb,funder_Adra,funder_African,funder_Amref,funder_Anglican Church,funder_Bsf,funder_Ces (gmbh),funder_Ces(gmbh),funder_Co,funder_Community,funder_Concern,funder_Concern World Wide,funder_Danida,funder_Ded,funder_Dfid,funder_Dh,funder_Dhv,funder_District Council,funder_Dmdd,funder_Dwe,funder_Dwsp,funder_Fini Water,funder_Finw,funder_Fw,funder_Germany,funder_Germany Republi,funder_Go,funder_Government Of Tanzania,funder_Halmashauri Ya Wilaya Sikonge,funder_He,funder_Hesawa,funder_Hifab,funder_Hsw,funder_Ir,funder_Is,funder_Isf,funder_Jaica,funder_Jica,funder_Ki,funder_Kiliwater,funder_Kkkt,funder_Kkkt_makwale,funder_Lamp,funder_Lawatefuka Water Supply,funder_Lga,funder_Lvia,funder_Magadini-makiwaru Water,funder_Ministry Of Water,funder_Mission,funder_Mkinga Distric Coun,funder_Muwsa,funder_Nethalan,funder_Netherlands,funder_No,funder_Norad,funder_Oikos E.Afrika,funder_Oxfam,funder_Oxfarm,funder_Plan Int,funder_Private,funder_Private Individual,funder_Rc,funder_Rc Church,funder_Roman,funder_Roman Catholic,funder_Ru,funder_Rudep,funder_Rural Water Supply And Sanitat,funder_Rwssp,funder_Shipo,funder_Snv,funder_Swedish,funder_Tardo,funder_Tasaf,funder_Tassaf,funder_Tcrs,funder_Unhcr,funder_Unice,funder_Unicef,funder_Village Council,funder_Villagers,funder_W.B,funder_Wananchi,funder_Water,funder_Wateraid,funder_World Bank,funder_World Vision,funder_Wsdp,funder_Wua,funder_Wvt,funder_missing,funder_other,installer_ACRA,installer_AMREF,installer_Amref,installer_Artisan,installer_CES,installer_Centr,installer_Central Government,installer_Central government,installer_Central govt,installer_Commu,installer_Community,installer_Consulting Engineer,installer_DANID,installer_DANIDA,installer_DDCA,installer_DED,installer_DH,installer_DW,installer_DWE,installer_DWSP,installer_Da,installer_Distri,installer_District Council,installer_District council,installer_Dmdd,installer_FINI WATER,installer_FW,installer_FinW,installer_Fini Water,installer_Fini water,installer_GOVER,installer_Gove,installer_Gover,installer_Government,installer_HE,installer_HESAWA,installer_HSW,installer_Halmashauri ya wilaya sikonge,installer_Handeni Trunk Main(,installer_Hesawa,installer_ISF,installer_Idara ya maji,installer_Ir,installer_Is,installer_JAICA,installer_JICA,installer_KKKT,installer_KKKT _ Konde and DWE,installer_Kiliwater,installer_Kuwait,installer_LGA,installer_Lawatefuka water sup,installer_MUWSA,installer_MWE,installer_Magadini-Makiwaru wa,installer_Mission,installer_NORAD,installer_Norad,installer_OXFAM,installer_Private,installer_RC,installer_RC CHURCH,installer_RWE,installer_RWSSP,installer_SEMA,installer_SHIPO,installer_Sengerema Water Department,installer_TASAF,installer_TCRS,installer_TWE,installer_TWESA,installer_Tardo,installer_UNICEF,installer_Villagers,installer_WATER AID,installer_WEDECO,installer_WU,installer_WVT,installer_Wizara ya maji,installer_World,installer_World Vision,installer_World vision,installer_missing,installer_other,installer_wananchi,basin_Lake Nyasa,basin_Lake Rukwa,basin_Lake Tanganyika,basin_Lake Victoria,basin_Pangani,basin_Rufiji,basin_Ruvuma / Southern Coast,basin_Wami / Ruvu,lga_Arusha Urban,lga_Babati,lga_Bagamoyo,lga_Bahi,lga_Bariadi,lga_Biharamulo,lga_Bukoba Rural,lga_Bukoba Urban,lga_Bukombe,lga_Bunda,lga_Chamwino,lga_Chato,lga_Chunya,lga_Dodoma Urban,lga_Geita,lga_Hai,lga_Hanang,lga_Handeni,lga_Igunga,lga_Ilala,lga_Ileje,lga_Ilemela,lga_Iramba,lga_Iringa Rural,lga_Kahama,lga_Karagwe,lga_Karatu,lga_Kasulu,lga_Kibaha,lga_Kibondo,lga_Kigoma Rural,lga_Kigoma Urban,lga_Kilindi,lga_Kilolo,lga_Kilombero,lga_Kilosa,lga_Kilwa,lga_Kinondoni,lga_Kisarawe,lga_Kishapu,lga_Kiteto,lga_Kondoa,lga_Kongwa,lga_Korogwe,lga_Kwimba,lga_Kyela,lga_Lindi Rural,lga_Lindi Urban,lga_Liwale,lga_Longido,lga_Ludewa,lga_Lushoto,lga_Mafia,lga_Magu,lga_Makete,lga_Manyoni,lga_Masasi,lga_Maswa,lga_Mbarali,lga_Mbeya Rural,lga_Mbinga,lga_Mbozi,lga_Mbulu,lga_Meatu,lga_Meru,lga_Misenyi,lga_Missungwi,lga_Mkinga,lga_Mkuranga,lga_Monduli,lga_Morogoro Rural,lga_Morogoro Urban,lga_Moshi Rural,lga_Moshi Urban,lga_Mpanda,lga_Mpwapwa,lga_Mtwara Rural,lga_Mtwara Urban,lga_Mufindi,lga_Muheza,lga_Muleba,lga_Musoma Rural,lga_Mvomero,lga_Mwanga,lga_Nachingwea,lga_Namtumbo,lga_Nanyumbu,lga_Newala,lga_Ngara,lga_Ngorongoro,lga_Njombe,lga_Nkasi,lga_Nyamagana,lga_Nzega,lga_Pangani,lga_Rombo,lga_Rorya,lga_Ruangwa,lga_Rufiji,lga_Rungwe,lga_Same,lga_Sengerema,lga_Serengeti,lga_Shinyanga Rural,lga_Shinyanga Urban,lga_Siha,lga_Sikonge,lga_Simanjiro,lga_Singida Rural,lga_Singida Urban,lga_Songea Rural,lga_Songea Urban,lga_Sumbawanga Rural,lga_Sumbawanga Urban,lga_Tabora Urban,lga_Tandahimba,lga_Tanga,lga_Tarime,lga_Temeke,lga_Tunduru,lga_Ukerewe,lga_Ulanga,lga_Urambo,lga_Uyui,ward_Chalinze,ward_Chanika,ward_Chinamili,ward_Diongoya,ward_Hedaru,ward_Ifakara,ward_Igongolo,ward_Igosi,ward_Ihanda,ward_Imalinyi,ward_Isongole,ward_Itete,ward_Kagongo,ward_Kanga,ward_Kidatu,ward_Kikatiti,ward_Kimochi,ward_Kiranyi,ward_Kitunda,ward_Lupalilo,ward_Mabwerebwere,ward_Magomeni,ward_Mahembe,ward_Mahongole,ward_Maji ya Chai,ward_Makuyuni,ward_Makwale,ward_Malindi,ward_Mamire,ward_Maposeni,ward_Maramba,ward_Masama Magharibi,ward_Masama Mashariki,ward_Matola,ward_Mdandu,ward_Mishamo,ward_Mkongo,ward_Mlangali,ward_Msindo,ward_Mtwango,ward_Mvomero,ward_Nduruma,ward_Ngarenanyuki,ward_Nkoma,ward_Nkungulu,ward_Olkokola,ward_Rujewa,ward_Siha Kati,ward_Siha Mashariki,ward_Simbo,ward_Soga,ward_Tinde,ward_Usuka,ward_Vikindu,ward_Wanging'ombe,ward_Yombo,ward_Zinga/Ikerege,ward_other,public_meeting_True,public_meeting_missing,scheme_management_None,scheme_management_Other,scheme_management_Parastatal,scheme_management_Private operator,scheme_management_SWC,scheme_management_Trust,scheme_management_VWC,scheme_management_WUA,scheme_management_WUG,scheme_management_Water Board,scheme_management_Water authority,scheme_management_missing,permit_True,permit_missing,extraction_type_cemo,extraction_type_climax,extraction_type_gravity,extraction_type_india mark ii,extraction_type_india mark iii,extraction_type_ksb,extraction_type_mono,extraction_type_nira/tanira,extraction_type_other,extraction_type_other - mkulima/shinyanga,extraction_type_other - play pump,extraction_type_other - rope pump,extraction_type_other - swn 81,extraction_type_submersible,extraction_type_swn 80,extraction_type_walimi,extraction_type_windmill,extraction_type_group_gravity,extraction_type_group_india mark ii,extraction_type_group_india mark iii,extraction_type_group_mono,extraction_type_group_nira/tanira,extraction_type_group_other,extraction_type_group_other handpump,extraction_type_group_other motorpump,extraction_type_group_rope pump,extraction_type_group_submersible,extraction_type_group_swn 80,extraction_type_group_wind-powered,extraction_type_class_handpump,extraction_type_class_motorpump,extraction_type_class_other,extraction_type_class_rope pump,extraction_type_class_submersible,extraction_type_class_wind-powered,management_other,management_other - school,management_parastatal,management_private operator,management_trust,management_unknown,management_vwc,management_water authority,management_water board,management_wua,management_wug,payment_other,payment_pay annually,payment_pay monthly,payment_pay per bucket,payment_pay when scheme fails,payment_unknown,water_quality_fluoride,water_quality_fluoride abandoned,water_quality_milky,water_quality_salty,water_quality_salty abandoned,water_quality_soft,water_quality_unknown,quantity_enough,quantity_insufficient,quantity_seasonal,quantity_unknown,source_hand dtw,source_lake,source_machine dbh,source_other,source_rainwater harvesting,source_river,source_shallow well,source_spring,source_unknown,source_class_surface,source_class_unknown,waterpoint_type_communal standpipe,waterpoint_type_communal standpipe multiple,waterpoint_type_dam,waterpoint_type_hand pump,waterpoint_type_improved spring,waterpoint_type_other,group_train,district_code_0,district_code_1,district_code_2,district_code_3,district_code_4,district_code_5,district_code_6,district_code_7,district_code_8,district_code_13,district_code_23,district_code_30,district_code_33,district_code_43,district_code_53,district_code_60,district_code_62,district_code_63,district_code_67,district_code_80,status
0,1390.0,-0.071638,-1.441303,-0.402478,-0.454123,-0.307721,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,functional
1,1399.0,-0.164618,1.309482,-0.029852,-0.454123,-1.199099,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,functional
2,686.0,0.908406,0.71219,-0.095225,-0.784241,-1.100057,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,functional
3,263.0,1.306821,-1.904822,-0.513612,-1.114359,1.177908,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,non functional
4,1033.532978,-1.55079,1.424421,-0.036666,0.866348,-0.091284,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,functional


In [34]:
train_processed.to_csv(path_or_buf="train_processed.csv", sep=',', index = False)
test_processed.to_csv(path_or_buf="test_processed.csv", sep=',', index = False)