**Import Libraries**

In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 889 kB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.1.post0


In [None]:
import pandas as pd
import pickle
from tensorflow.keras.models import load_model
import numpy as np
import time
import category_encoders as ce

**Data Cleaning Pipeline**

In [None]:
def data_cleaning(X):
  """Normalise the text of the categorical columns of ``X`` in place.

  Every listed column is run through an ordered sequence of literal
  substring replacements; some columns are lower-cased afterwards.

  Parameters
  ----------
  X : pandas.DataFrame
    Must contain the string columns 'funder', 'installer', 'basin',
    'region', 'lga', 'scheme_name', 'extraction_type_class', 'management',
    'payment', 'source' and 'waterpoint_type'. Missing values in those
    columns stay missing.

  Returns
  -------
  pandas.DataFrame
    The same object that was passed in; the listed columns are overwritten.

  Raises
  ------
  KeyError
    If one of the listed columns is absent.
  """
  # Replacement order is significant and is kept exactly as it has always been.
  # NOTE(review): in the funder/installer sequence the ' The ' and second ' '
  # steps can never match, because the first step has already replaced every
  # space; likewise the ' ' -> '_' step for 'basin' runs after all spaces were
  # removed. They are retained on purpose: presumably the encoder applied
  # downstream was fit on text produced by this exact sequence, so reordering
  # would change the category strings - confirm before "fixing".
  name_steps = ((' ', '_'), ('-', '_'), (' The ', ''), (' ', ''), ('&', '_'), (',', '_'))
  space_to_underscore = ((' ', '_'),)

  # (column, ordered literal replacements, lower-case afterwards?)
  cleaning_plan = (
    ('funder', name_steps, True),
    ('installer', name_steps, True),
    ('basin', ((' ', ''), (' ', '_'), (',', '_'), ('_/_', '_')), True),
    ('region', space_to_underscore, True),
    ('lga', space_to_underscore, True),
    ('scheme_name', space_to_underscore, True),
    ('extraction_type_class', ((' ', '_'), ('-', '_')), False),
    ('management', ((' ', '_'), ('-', ''), ('__', '')), False),
    ('payment', space_to_underscore, False),
    ('source', space_to_underscore, False),
    ('waterpoint_type', space_to_underscore, False),
  )

  for column, steps, lowercase in cleaning_plan:
    cleaned = X[column]
    for old, new in steps:
      # regex=False: all patterns are plain text, and the default value of
      # `regex` differs between pandas releases.
      cleaned = cleaned.str.replace(old, new, regex=False)
    if lowercase:
      cleaned = cleaned.str.lower()
    X[column] = cleaned

  return X

**Function to return predictions**

In [None]:
def final_fun_1(X):
  """Run the full inference pipeline and return one status label per row.

  Steps: derive ``operational_year``, treat zeros in three numeric columns as
  missing, drop unused columns, impute, clean the categorical text, encode
  the categoricals, append the features produced by the saved Keras encoder
  and predict with the saved model.

  Parameters
  ----------
  X : pandas.DataFrame
    Raw feature rows. Must contain every column referenced below (the
    dropped columns, the numeric columns, 'region' and the categorical
    columns handled by ``data_cleaning``). The frame is not modified.

  Returns
  -------
  list of str
    One human-readable status description per input row, in row order.

  Raises
  ------
  KeyError
    If an expected column is missing.
  FileNotFoundError
    If 'mice_imputer.pkl', 'enc.pkl' or 'best_model.pkl' is not in the
    working directory.

  Notes
  -----
  Latitude/longitude gaps are filled with per-region medians computed from
  the rows of ``X`` itself, so a missing coordinate cannot be filled when no
  other row of the same region is present (e.g. a single-row input).
  """
  def _load_pickle(path):
    # SECURITY: pickle.load can execute arbitrary code; only load artefacts
    # that come from a trusted source.
    with open(path, 'rb') as handle:
      return pickle.load(handle)

  def _describe(code):
    # Any code other than 0 or 1 is reported as non functional.
    if code == 0.0:
      return "functional - the waterpoint is operational and there are no repairs needed"
    if code == 1.0:
      return "functional needs repair - the waterpoint is operational, but needs repairs"
    return "non functional - the waterpoint is not operational"

  # Work on a copy so the caller's frame is left untouched.
  X = X.copy()

  #adding operating years of pump
  X['date_recorded'] = pd.to_datetime(X['date_recorded'])
  X['operational_year'] = X.date_recorded.dt.year - X.construction_year

  #replace zero values with np.nan
  for column in ('amount_tsh', 'population', 'gps_height'):
    X[column] = X[column].replace(0, np.nan)

  #Eliminate features
  features_tobe_eliminated=['construction_year','date_recorded','num_private','water_quality','payment_type','quantity_group','waterpoint_type_group','extraction_type_group','source_type','management_group','district_code','scheme_management','id','subvillage','wpt_name','recorded_by','permit','public_meeting','ward','extraction_type']
  X1 = X.drop(columns=features_tobe_eliminated)

  #Region based median imputation of latitude and longtitude features
  for column in ('latitude', 'longitude'):
    region_medians = X1.groupby(['region'])[column].transform('median')
    X1[column] = X1[column].fillna(region_medians)

  #Filling missing values with other
  for column in ('installer', 'funder', 'scheme_name'):
    X1[column] = X1[column].fillna('other')

  #function created for cleaning data
  X1 = data_cleaning(X1)

  #Mice Imputation
  # transform (not fit_transform): the loaded imputer must be applied as it was
  # fitted, not re-fitted on the rows being predicted.
  # NOTE(review): assumes mice_imputer.pkl holds an already-fitted imputer - confirm.
  mice_columns = ['amount_tsh', 'gps_height', 'population']
  mice_imputer = _load_pickle('mice_imputer.pkl')
  X1[mice_columns] = np.asarray(mice_imputer.transform(X1[mice_columns]))

  #Target Encoding
  numerical_columns = ['amount_tsh','population','gps_height','longitude','latitude','operational_year']
  enc = _load_pickle('enc.pkl')
  X_target_cat = enc.transform(X1.drop(columns=numerical_columns)).astype(np.float32)
  X_numerical = X1[numerical_columns].astype(np.float32)
  test_data = pd.concat([X_target_cat, X_numerical], axis=1)

  #Adding auto_encoder features
  encoder = load_model('mice_encoder.h5')
  X_encoded = encoder.predict(test_data)
  X_final = np.hstack((np.array(test_data), X_encoded))

  #predict with best model
  best_model = _load_pickle('best_model.pkl')
  y_pred = best_model.predict(X_final)

  return [_describe(code) for code in y_pred]

In [None]:
# Load the saved test features from the working directory.
X_test = pd.read_csv('X_test.csv')
# Drop 'Unnamed: 0' and 'id.1' before prediction
# (presumably index artefacts of how the CSV was written - confirm).
X_te = X_test.drop(['Unnamed: 0', 'id.1'], axis=1)

**Predictions on test data points**

In [None]:
# Time only the prediction call; printing the full label list is kept
# outside the timed region so it does not inflate the reported runtime.
start = time.perf_counter()
y = final_fun_1(X_te)
end = time.perf_counter()
print(y)
print(f"Runtime of the program is {end - start}")



['functional needs repair - the waterpoint is operational, but needs repairs', 'functional - the waterpoint is operational and there are no repairs needed', 'functional - the waterpoint is operational and there are no repairs needed', 'non functional - the waterpoint is not operational', 'non functional - the waterpoint is not operational', 'non functional - the waterpoint is not operational', 'functional needs repair - the waterpoint is operational, but needs repairs', 'non functional - the waterpoint is not operational', 'functional needs repair - the waterpoint is operational, but needs repairs', 'non functional - the waterpoint is not operational', 'non functional - the waterpoint is not operational', 'functional - the waterpoint is operational and there are no repairs needed', 'functional - the waterpoint is operational and there are no repairs needed', 'functional - the waterpoint is operational and there are no repairs needed', 'non functional - the waterpoint is not operational

**Taking a single data point as input**

In [None]:
# A list indexer returns a one-row DataFrame that keeps each column's dtype;
# pd.DataFrame(row).T would turn every column into object dtype.
functional_datapoint = X_te.iloc[[1]].copy() #Functional data point
functional_needs_repair_datapoint = X_te.iloc[[0]].copy() #Functional needs repair data point
non_functional = X_te.iloc[[3]].copy() #Non functional data point

**Prediction on Functional Pump data point**

In [None]:
start = time.time()
# This section is about the functional example, so pass that data point.
y = final_fun_1(functional_datapoint)
end = time.time()
# final_fun_1 returns the descriptive label text itself; comparing it with
# 0.0 / 1.0 would never match and would always report "non functional".
print(y[0])
print(f"Runtime of the program is {end - start}")



non functional - the waterpoint is not operational
Runtime of the program is 0.30409836769104004


**Prediction on functional needs repair Pump data point**

In [None]:
start = time.time()
y = final_fun_1(functional_needs_repair_datapoint)
end = time.time()
# final_fun_1 returns the descriptive label text itself; comparing it with
# 0.0 / 1.0 would never match and would always report "non functional".
print(y[0])
print(f"Runtime of the program is {end - start}")



functional needs repair - the waterpoint is operational, but needs repairs
Runtime of the program is 0.4373955726623535


**Prediction on Non functional Pump data point**

In [None]:
start = time.time()
y = final_fun_1(non_functional)
end = time.time()
# final_fun_1 returns the descriptive label text itself, so print it directly
# instead of comparing it with numeric class codes.
print(y[0])
print(f"Runtime of the program is {end - start}")



non functional - the waterpoint is not operational
Runtime of the program is 0.3177323341369629
