In [9]:
# Downloading data using gdown
# Each line fetches one file from Google Drive by its file id. According to
# the recorded output the files are saved in /content as "train.csv.csv",
# "train_labels.csv.csv" and "test.csv.csv" (note the doubled extension,
# which the later pd.read_csv calls rely on).
# NOTE(review): this cell carries execution count 9 but sits above the
# dependency cell (execution count 1) that pins gdown; on a fresh
# top-to-bottom run it executes before that pinned install.
!gdown 1HwXjx-eubi_aQJoZXBTEw527sgrOO1Gx # train_csv
!gdown 18SeMBB0xEcnx81JdA5fMlk82U40Z8TyP # train_labels
!gdown 1QX8G7PnK6F5teCSmUN-9Vm8TFhkVCOyW # test_csv

Downloading...
From: https://drive.google.com/uc?id=1HwXjx-eubi_aQJoZXBTEw527sgrOO1Gx
To: /content/train.csv.csv
100% 20.1M/20.1M [00:00<00:00, 61.6MB/s]
Downloading...
From: https://drive.google.com/uc?id=18SeMBB0xEcnx81JdA5fMlk82U40Z8TyP
To: /content/train_labels.csv.csv
100% 1.15M/1.15M [00:00<00:00, 117MB/s]
Downloading...
From: https://drive.google.com/uc?id=1QX8G7PnK6F5teCSmUN-9Vm8TFhkVCOyW
To: /content/test.csv.csv
100% 5.02M/5.02M [00:00<00:00, 40.0MB/s]


# <font color='orange'>8. Data Pipeline for Model Deployment</font>

A machine learning pipeline automates a machine learning workflow. It chains a sequence of data transformations together with a model, so that raw data can be transformed, scored and evaluated in one repeatable step.

We will create two pipelines: one for prediction and one for model evaluation.

## <font color='brown'>8.1 Installing/Importing Dependencies </font>

We first install and import the libraries the pipelines depend on, so that the pipelines run without errors.

In [1]:
# Installing/Importing Dependancies
!pip install gdown==4.5.4
!pip install catboost
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import random
import joblib
import gdown
from scipy.sparse import hstack
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown==4.5.4
  Downloading gdown-4.5.4-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.5.4
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 15 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [2]:
# Downloading data transformation and ml model files
# Each line fetches one archive from Google Drive by its file id. According to
# the recorded output they are saved in /content as "data_transformations.rar"
# and "rf_model.rar"; the next cell extracts them.
!gdown 1ijMRu5HGAp0jOrrigFxzPM585LgKFso_ # transformations file
!gdown 1vZkZEBG4MiuQye1CI5SU7iys7DC6MwNY # ml model

Downloading...
From: https://drive.google.com/uc?id=1ijMRu5HGAp0jOrrigFxzPM585LgKFso_
To: /content/data_transformations.rar
100% 13.3k/13.3k [00:00<00:00, 14.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1vZkZEBG4MiuQye1CI5SU7iys7DC6MwNY
To: /content/rf_model.rar
100% 32.8M/32.8M [00:00<00:00, 144MB/s]


In [6]:
# Exracting compressed files
!unrar x "data_transformations.rar"
!unrar x "rf_model.rar"


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from data_transformations.rar


Would you like to replace the existing file region_encoder
   919 bytes, modified on 2022-12-06 06:39
with a new one
   919 bytes, modified on 2022-12-06 06:39

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit y

Extracting  region_encoder                                                 5%  OK 

Would you like to replace the existing file scheme_management_encoder
   832 bytes, modified on 2022-12-06 06:39
with a new one
   832 bytes, modified on 2022-12-06 06:39

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit a

Extracting  scheme_management_encoder                                     10%  OK 
Extracting  source_class_encoder                                          14%  OK 
Extracting  source_type_encoder                                           19%  OK 
Extracting  water_quality_encoder                                         23%  

## <font color='brown'>8.2 Prediction Pipeline Function </font>

We define the prediction pipeline function so that it takes the raw input data as input and returns the predictions as output.

In [7]:
# prediction pipeline function
def predict(input_samples:any, return_id=True):
    """
    This function takes the raw input samples and returns the predicted
    functionality status.
    """
    # Columns names in data
    columns =  ['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
                'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
                'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
                'ward', 'population', 'public_meeting', 'recorded_by',
                'scheme_management', 'scheme_name', 'permit', 'construction_year',
                'extraction_type', 'extraction_type_group', 'extraction_type_class',
                'management', 'management_group', 'payment', 'payment_type',
                'water_quality', 'quality_group', 'quantity', 'quantity_group',
                'source', 'source_type', 'source_class', 'waterpoint_type',
                'waterpoint_type_group']
    
    # Create dataframe from input samples
    input_df = pd.DataFrame(input_samples, columns=columns)
    
    # Imputing missing values
    ## gps_height
    GPS_HEIGHT_MEDIAN = 369.0
    input_df['gps_height'] = np.where(input_df['gps_height'] <= 0, GPS_HEIGHT_MEDIAN, input_df['gps_height'])
    ## latitude
    LATITUDE_MEDIAN = -5.0216
    input_df['latitude'] = np.where(input_df['latitude'] < -11.8, LATITUDE_MEDIAN, input_df['latitude'])
    input_df['latitude'] = np.where(input_df['latitude'] > -1, LATITUDE_MEDIAN, input_df['latitude'])
    ## longitude
    LONGITUDE_MEDIAN = 34.9087
    input_df['longitude'] = np.where(input_df['longitude'] < 29, LONGITUDE_MEDIAN, input_df['longitude'])
    input_df['longitude'] = np.where(input_df['longitude'] > 40.8, LONGITUDE_MEDIAN, input_df['longitude'])
    ## public_meeting
    PUBLIC_MEETING_MODE = True
    input_df['public_meeting'] = input_df['public_meeting'].fillna(PUBLIC_MEETING_MODE)
    ## scheme_management
    SCHEME_MANAGEMENT_MODE = "VWC"
    input_df['scheme_management'] = input_df['scheme_management'].fillna(SCHEME_MANAGEMENT_MODE)
    ## permit
    PERMIT_MODE = True
    input_df['permit'] = input_df['permit'].fillna(PERMIT_MODE)
    ## construction_year
    def year_imputer(x):
        """
        Function to impute construction_year in data
        """
        if x == 0:
            year_range = list(range(2000, 2013))
            impute_year = random.choice(year_range)
            return impute_year

    input_df['construction_year'] = np.where(input_df['construction_year'] == 0,
                                           input_df['construction_year'].apply(year_imputer),
                                           input_df['construction_year'])
    
    # Feature Engineering
    ## Creating separate year and month columns
    input_df['date_recorded'] = pd.to_datetime(input_df['date_recorded'])
    input_df['record_year'] = input_df['date_recorded'].dt.year
    input_df['record_month'] = input_df['date_recorded'].dt.month
    input_df['waterpoint_age'] = input_df['record_year'] - input_df['construction_year']

    # treating record_year and record_month as categorical features
    input_df[['record_year', 'record_month']] = input_df[['record_year', 'record_month']].astype('object')

    # Feature Selection
    ids = input_df['id'].values
    dropped_features = ['id', 'amount_tsh', 'funder', 'installer', 'wpt_name',
                        'num_private', 'subvillage', 'region_code', 'district_code',
                        'ward', 'recorded_by', 'scheme_name', 'construction_year', 
                        'extraction_type', 'extraction_type_group', 'management_group', 
                        'payment', 'quality_group', 'quantity_group', 'source', 
                        'waterpoint_type_group', 'date_recorded']
    input_df = input_df.drop(dropped_features, axis=1)

    # Data Preparation
    
    ## gps_height
    gps_height_normalizer = joblib.load("gps_height_normalizer")
    test_gps_height_normalized = gps_height_normalizer.transform(input_df['gps_height'].values.reshape(-1, 1))
    ## longitude
    longitude_normalizer = joblib.load("longitude_normalizer")
    test_longitude_normalized = longitude_normalizer.transform(input_df['longitude'].values.reshape(-1, 1))
    ## latitude
    latitude_normalizer = joblib.load("latitude_normalizer")
    test_latitude_normalized = latitude_normalizer.transform(input_df['latitude'].values.reshape(-1, 1))
    ## population
    population_normalizer = joblib.load("population_normalizer")
    test_population_normalized = population_normalizer.transform(input_df['population'].values.reshape(-1, 1))
    ## waterpoint_age
    waterpoint_age_normalizer = joblib.load("waterpoint_age_normalizer")
    test_waterpoint_age_normalized = waterpoint_age_normalizer.transform(input_df['waterpoint_age'].values.reshape(-1, 1))
    ## basin
    basin_encoder = joblib.load("basin_encoder")
    test_basin_encoded = basin_encoder.transform(input_df['basin'].values.reshape(-1, 1))
    ## region
    region_encoder = joblib.load("region_encoder")
    test_region_encoded = region_encoder.transform(input_df['region'].values.reshape(-1, 1))
    ## lga
    lga_encoder = joblib.load("lga_encoder")  
    test_lga_encoded = lga_encoder.transform(input_df['lga'].values.reshape(-1, 1))
    ## public_meeting
    public_meeting_encoder = joblib.load("public_meeting_encoder")
    test_public_meeting_encoded = public_meeting_encoder.transform(input_df['public_meeting'].values.reshape(-1, 1))
    ## scheme_management
    scheme_management_encoder = joblib.load("scheme_management_encoder")
    test_scheme_management_encoded = scheme_management_encoder.transform(input_df['scheme_management'].values.reshape(-1, 1))
    ## permit
    permit_encoder = joblib.load("permit_encoder")
    test_permit_encoded = permit_encoder.transform(input_df['permit'].values.reshape(-1, 1))
    ## extraction_type_class 
    extraction_type_class_encoder = joblib.load("extraction_type_class_encoder")
    test_extraction_type_class_encoded = extraction_type_class_encoder.transform(input_df['extraction_type_class'].values.reshape(-1, 1))
    ## management
    management_encoder = joblib.load("management_encoder")
    test_management_encoded = management_encoder.transform(input_df['management'].values.reshape(-1, 1))
    ## payment_type
    payment_type_encoder = joblib.load("payment_type_encoder")
    test_payment_type_encoded = payment_type_encoder.transform(input_df['payment_type'].values.reshape(-1, 1))
    ## water_quality
    water_quality_encoder = joblib.load("water_quality_encoder")
    test_water_quality_encoded = water_quality_encoder.transform(input_df['water_quality'].values.reshape(-1, 1))
    ## quantity
    quantity_encoder = joblib.load("quantity_encoder")
    test_quantity_encoded = quantity_encoder.transform(input_df['quantity'].values.reshape(-1, 1))
    ## source_type
    source_type_encoder = joblib.load("source_type_encoder")
    test_source_type_encoded = source_type_encoder.transform(input_df['source_type'].values.reshape(-1, 1))
    ## source_class
    source_class_encoder = joblib.load("source_class_encoder")
    test_source_class_encoded = source_class_encoder.transform(input_df['source_class'].values.reshape(-1, 1))
    ## waterpoint_type
    waterpoint_type_encoder = joblib.load("waterpoint_type_encoder")
    test_waterpoint_type_encoded = waterpoint_type_encoder.transform(input_df['waterpoint_type'].values.reshape(-1, 1))
    ## record_year
    record_year_encoder = joblib.load("record_year_encoder")
    test_record_year_encoded = record_year_encoder.transform(input_df['record_year'].values.reshape(-1, 1))
    ## record_month
    record_month_encoder = joblib.load("record_month_encoder")
    test_record_month_encoded = record_month_encoder.transform(input_df['record_month'].values.reshape(-1, 1))

    # Creating data matrix
    input_encoded = hstack([test_gps_height_normalized, test_longitude_normalized, test_latitude_normalized,
                        test_basin_encoded, test_region_encoded, test_lga_encoded, test_population_normalized,
                        test_public_meeting_encoded, test_scheme_management_encoded, test_permit_encoded,
                        test_extraction_type_class_encoded, test_management_encoded, test_payment_type_encoded,
                        test_water_quality_encoded, test_quantity_encoded, test_source_type_encoded,
                        test_source_class_encoded, test_waterpoint_type_encoded, test_record_year_encoded,
                        test_record_month_encoded, test_waterpoint_age_normalized]).tocsr()

    # Load model
    model = joblib.load("rf_model")

    #predictions
    predictions = model.predict(input_encoded)
    label_encoder = joblib.load("label_encoder")
    predictions = label_encoder.inverse_transform(predictions)
    if return_id:
        pred_dict = {"id":ids, "status_group":predictions}
    else:
        pred_dict = {"status_group":predictions}
    pred_df = pd.DataFrame(pred_dict)
    return pred_df

__Getting Predictions using pipeline__:

In [10]:
# Run the prediction pipeline on the downloaded test file and report the
# shapes of its input and output.
test_df = pd.read_csv("test.csv.csv")
predictions = predict(test_df, return_id=False)
for report_line in (
    f"input data for prediction shape: {test_df.shape}",
    f"predictions shape               : {predictions.shape}",
    "\n",
    "Predictions sample: ",
):
    print(report_line)
# last expression of the cell: rendered as the cell's rich output
predictions.head()

input data for prediction shape: (14850, 40)
predictions shape               : (14850, 1)


Predictions sample: 


Unnamed: 0,status_group
0,functional
1,functional
2,functional
3,non functional
4,functional


In [None]:
# Save Predictions
# `predictions` was produced with return_id=False, so the written CSV has a
# single "status_group" column; index=False keeps the row index out of the file.
predictions.to_csv("predictions.csv", index=False)

## <font color='brown'>8.3 Evaluation Pipeline Function </font>

We define the evaluation pipeline function so that it takes the input data and the corresponding labels as input and returns the metric value as output.

In [None]:
# evaluation pipeline function
def evaluate(input_samples:any, class_labels:any):
    """
    This function takes the raw input samples and returns the predicted
    functionality status.
    """
    # Columns names in data
    columns =  ['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height',
                'installer', 'longitude', 'latitude', 'wpt_name', 'num_private',
                'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga',
                'ward', 'population', 'public_meeting', 'recorded_by',
                'scheme_management', 'scheme_name', 'permit', 'construction_year',
                'extraction_type', 'extraction_type_group', 'extraction_type_class',
                'management', 'management_group', 'payment', 'payment_type',
                'water_quality', 'quality_group', 'quantity', 'quantity_group',
                'source', 'source_type', 'source_class', 'waterpoint_type',
                'waterpoint_type_group']
    
    # Create dataframe from input samples
    input_df = pd.DataFrame(input_samples, columns=columns)
    
    # Imputing missing values
    ## gps_height
    GPS_HEIGHT_MEDIAN = 369.0
    input_df['gps_height'] = np.where(input_df['gps_height'] <= 0, GPS_HEIGHT_MEDIAN, input_df['gps_height'])
    ## latitude
    LATITUDE_MEDIAN = -5.0216
    input_df['latitude'] = np.where(input_df['latitude'] < -11.8, LATITUDE_MEDIAN, input_df['latitude'])
    input_df['latitude'] = np.where(input_df['latitude'] > -1, LATITUDE_MEDIAN, input_df['latitude'])
    ## longitude
    LONGITUDE_MEDIAN = 34.9087
    input_df['longitude'] = np.where(input_df['longitude'] < 29, LONGITUDE_MEDIAN, input_df['longitude'])
    input_df['longitude'] = np.where(input_df['longitude'] > 40.8, LONGITUDE_MEDIAN, input_df['longitude'])
    ## public_meeting
    PUBLIC_MEETING_MODE = True
    input_df['public_meeting'] = input_df['public_meeting'].fillna(PUBLIC_MEETING_MODE)
    ## scheme_management
    SCHEME_MANAGEMENT_MODE = "VWC"
    input_df['scheme_management'] = input_df['scheme_management'].fillna(SCHEME_MANAGEMENT_MODE)
    ## permit
    PERMIT_MODE = True
    input_df['permit'] = input_df['permit'].fillna(PERMIT_MODE)
    ## construction_year
    def year_imputer(x):
        """
        Function to impute construction_year in data
        """
        if x == 0:
            year_range = list(range(2000, 2013))
            impute_year = random.choice(year_range)
            return impute_year

    input_df['construction_year'] = np.where(input_df['construction_year'] == 0,
                                           input_df['construction_year'].apply(year_imputer),
                                           input_df['construction_year'])
    
    # Feature Engineering
    ## Creating separate year and month columns
    input_df['date_recorded'] = pd.to_datetime(input_df['date_recorded'])
    input_df['record_year'] = input_df['date_recorded'].dt.year
    input_df['record_month'] = input_df['date_recorded'].dt.month
    input_df['waterpoint_age'] = input_df['record_year'] - input_df['construction_year']

    # treating record_year and record_month as categorical features
    input_df[['record_year', 'record_month']] = input_df[['record_year', 'record_month']].astype('object')
    
    # Feature Selection
    ids = input_df['id'].values
    dropped_features = ['id', 'amount_tsh', 'funder', 'installer', 'wpt_name',
                        'num_private', 'subvillage', 'region_code', 'district_code',
                        'ward', 'recorded_by', 'scheme_name', 'construction_year', 
                        'extraction_type', 'extraction_type_group', 'management_group', 
                        'payment', 'quality_group', 'quantity_group', 'source', 
                        'waterpoint_type_group', 'date_recorded']
    input_df = input_df.drop(dropped_features, axis=1)

    # Data Preparation
    
    ## gps_height
    gps_height_normalizer = joblib.load("gps_height_normalizer")
    test_gps_height_normalized = gps_height_normalizer.transform(input_df['gps_height'].values.reshape(-1, 1))
    ## longitude
    longitude_normalizer = joblib.load("longitude_normalizer")
    test_longitude_normalized = longitude_normalizer.transform(input_df['longitude'].values.reshape(-1, 1))
    ## latitude
    latitude_normalizer = joblib.load("latitude_normalizer")
    test_latitude_normalized = latitude_normalizer.transform(input_df['latitude'].values.reshape(-1, 1))
    ## population
    population_normalizer = joblib.load("population_normalizer")
    test_population_normalized = population_normalizer.transform(input_df['population'].values.reshape(-1, 1))
    ## waterpoint_age
    waterpoint_age_normalizer = joblib.load("waterpoint_age_normalizer")
    test_waterpoint_age_normalized = waterpoint_age_normalizer.transform(input_df['waterpoint_age'].values.reshape(-1, 1))
    ## basin
    basin_encoder = joblib.load("basin_encoder")
    test_basin_encoded = basin_encoder.transform(input_df['basin'].values.reshape(-1, 1))
    ## region
    region_encoder = joblib.load("region_encoder")
    test_region_encoded = region_encoder.transform(input_df['region'].values.reshape(-1, 1))
    ## lga
    lga_encoder = joblib.load("lga_encoder")  
    test_lga_encoded = lga_encoder.transform(input_df['lga'].values.reshape(-1, 1))
    ## public_meeting
    public_meeting_encoder = joblib.load("public_meeting_encoder")
    test_public_meeting_encoded = public_meeting_encoder.transform(input_df['public_meeting'].values.reshape(-1, 1))
    ## scheme_management
    scheme_management_encoder = joblib.load("scheme_management_encoder")
    test_scheme_management_encoded = scheme_management_encoder.transform(input_df['scheme_management'].values.reshape(-1, 1))
    ## permit
    permit_encoder = joblib.load("permit_encoder")
    test_permit_encoded = permit_encoder.transform(input_df['permit'].values.reshape(-1, 1))
    ## extraction_type_class 
    extraction_type_class_encoder = joblib.load("extraction_type_class_encoder")
    test_extraction_type_class_encoded = extraction_type_class_encoder.transform(input_df['extraction_type_class'].values.reshape(-1, 1))
    ## management
    management_encoder = joblib.load("management_encoder")
    test_management_encoded = management_encoder.transform(input_df['management'].values.reshape(-1, 1))
    ## payment_type
    payment_type_encoder = joblib.load("payment_type_encoder")
    test_payment_type_encoded = payment_type_encoder.transform(input_df['payment_type'].values.reshape(-1, 1))
    ## water_quality
    water_quality_encoder = joblib.load("water_quality_encoder")
    test_water_quality_encoded = water_quality_encoder.transform(input_df['water_quality'].values.reshape(-1, 1))
    ## quantity
    quantity_encoder = joblib.load("quantity_encoder")
    test_quantity_encoded = quantity_encoder.transform(input_df['quantity'].values.reshape(-1, 1))
    ## source_type
    source_type_encoder = joblib.load("source_type_encoder")
    test_source_type_encoded = source_type_encoder.transform(input_df['source_type'].values.reshape(-1, 1))
    ## source_class
    source_class_encoder = joblib.load("source_class_encoder")
    test_source_class_encoded = source_class_encoder.transform(input_df['source_class'].values.reshape(-1, 1))
    ## waterpoint_type
    waterpoint_type_encoder = joblib.load("waterpoint_type_encoder")
    test_waterpoint_type_encoded = waterpoint_type_encoder.transform(input_df['waterpoint_type'].values.reshape(-1, 1))
    ## record_year
    record_year_encoder = joblib.load("record_year_encoder")
    test_record_year_encoded = record_year_encoder.transform(input_df['record_year'].values.reshape(-1, 1))
    ## record_month
    record_month_encoder = joblib.load("record_month_encoder")
    test_record_month_encoded = record_month_encoder.transform(input_df['record_month'].values.reshape(-1, 1))

    # Creating data matrix
    input_encoded = hstack([test_gps_height_normalized, test_longitude_normalized, test_latitude_normalized,
                        test_basin_encoded, test_region_encoded, test_lga_encoded, test_population_normalized,
                        test_public_meeting_encoded, test_scheme_management_encoded, test_permit_encoded,
                        test_extraction_type_class_encoded, test_management_encoded, test_payment_type_encoded,
                        test_water_quality_encoded, test_quantity_encoded, test_source_type_encoded,
                        test_source_class_encoded, test_waterpoint_type_encoded, test_record_year_encoded,
                        test_record_month_encoded, test_waterpoint_age_normalized]).tocsr()

    # Load model
    model = joblib.load("mvc_model")

    #predictions
    predictions = model.predict(input_encoded)
    label_encoder = joblib.load("label_encoder")
    predictions = label_encoder.inverse_transform(predictions)
    score = f1_score(class_labels, predictions, average='micro')
    score = np.round(score, 4)
    return score

In [None]:
# Checking evaluation pipeline
# Scores the pipeline on the downloaded train file against its label file.
# NOTE(review): features and labels are paired by row position only; this
# assumes both files list the samples in the same order -- confirm.
input_data = pd.read_csv("train.csv.csv")
labels = pd.read_csv("train_labels.csv.csv")['status_group'].values
score = evaluate(input_data, labels)
print(f"micro_f1_score for model: {score}")

micro_f1_score for model: 0.8848
