Importing packages
---

__We need to run the scripts install requirements.sh__

In [1]:
import json
import os
import numpy as np
import pandas as pd
import pickle
import uuid
import time
import tempfile

from googleapiclient import discovery
from googleapiclient import errors

from google.cloud import bigquery
from jinja2 import Template
from kfp.components import func_to_container_op
from typing import NamedTuple

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Show the active gcloud project; the same lookup populates PROJECT_ID in later cells.
!(gcloud config get-value core/project)

zeta-rush-341516


Preparing the dataset
--

In [3]:
# Capture the active gcloud project id. The IPython `!` capture yields the command's
# output as a list-like of lines, so the first line is kept as the project id.
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
# BigQuery destination and CSV source for the Covertype data.
DATASET_ID = "covertype_dataset"
DATASET_LOCATION = "US"
TABLE_ID = "covertype"
DATA_SOURCE = "gs://workshop-datasets/covertype/small/dataset.csv"

# Inline `name:TYPE` schema handed to `bq load`, written one column per entry so the
# column order and types are easy to read; joined into a single comma-separated string.
SCHEMA = ",".join([
    "Elevation:INTEGER",
    "Aspect:INTEGER",
    "Slope:INTEGER",
    "Horizontal_Distance_To_Hydrology:INTEGER",
    "Vertical_Distance_To_Hydrology:INTEGER",
    "Horizontal_Distance_To_Roadways:INTEGER",
    "Hillshade_9am:INTEGER",
    "Hillshade_Noon:INTEGER",
    "Hillshade_3pm:INTEGER",
    "Horizontal_Distance_To_Fire_Points:INTEGER",
    "Wilderness_Area:STRING",
    "Soil_Type:STRING",
    "Cover_Type:INTEGER",
])

__We create the BigQuery dataset and upload the Covertype csv data into a table__

__The pipeline ingests data from BigQuery. The cell below uploads the Covertype dataset to BigQuery__

In [4]:
# Create the BigQuery dataset. Re-running this cell once the dataset exists makes
# `bq mk` report an "already exists" error, as seen in the captured output.
!bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID

BigQuery error in mk operation: Dataset 'zeta-rush-341516:covertype_dataset'
already exists.


In [5]:
# Load the CSV at DATA_SOURCE into table TABLE_ID of the dataset, using the inline SCHEMA.
# --skip_leading_rows=1 skips the file's first row (presumably the header — confirm);
# --replace overwrites existing table contents, which keeps the cell re-runnable.
!bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$TABLE_ID \
$DATA_SOURCE \
$SCHEMA

Waiting on bqjob_r1a261ada7bedac6e_0000017f95549ef8_1 ... (2s) Current status: DONE   


Configuring environment settings
---

In [6]:
# List the available GCS buckets; one of them is used as ARTIFACT_STORE in the next cell.
!gsutil ls

gs://artifacts.zeta-rush-341516.appspot.com/
gs://cloud-ai-platform-4aa74d0a-5386-461c-8135-3f0feac88a35/
gs://cloud-ai-platform-fffcccf5-f8f6-480b-9bb1-7a0a20be6be1/
gs://mlops-youness/
gs://storage_bucket_speech/
gs://zeta-rush-341516_cloudbuild/


In [7]:
# Region passed to the AI Platform commands, and the bucket that stores the exported
# data splits and the per-job output folders.
REGION = 'us-central1'
ARTIFACT_STORE = 'gs://mlops-youness'

# Re-read the active gcloud project (same lookup as in the dataset-preparation cell);
# the shell capture is list-like, so its first line is kept as the project id.
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
# GCS layout derived from the artifact bucket: <bucket>/data holds the exported
# training/validation CSVs and <bucket>/jobs holds one folder per submitted job.
DATA_ROOT = ARTIFACT_STORE + '/data'
JOB_DIR_ROOT = ARTIFACT_STORE + '/jobs'
TRAINING_FILE_PATH = '/'.join([DATA_ROOT, 'training', 'dataset.csv'])
VALIDATION_FILE_PATH = '/'.join([DATA_ROOT, 'validation', 'dataset.csv'])

Exploring the Covertype dataset
--

In [8]:
%%bigquery
-- Preview the loaded table. No LIMIT is applied, so every row is downloaded to the notebook.
SELECT *
FROM `covertype_dataset.covertype`

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 1215.56query/s]                        
Downloading: 100%|██████████| 100000/100000 [00:01<00:00, 92715.65rows/s]


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2085,256,18,150,27,738,176,248,208,914,Cache,C2702,5
1,2125,256,20,30,12,871,169,248,215,300,Cache,C2702,2
2,2146,256,34,150,62,1253,122,237,239,511,Cache,C2702,2
3,2186,256,38,210,102,1294,109,232,244,552,Cache,C2702,2
4,2831,256,25,277,183,1706,153,246,225,1485,Commanche,C2705,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3136,254,12,319,60,5734,193,248,193,2467,Rawah,C7746,1
99996,3242,254,12,636,148,3551,193,248,193,2010,Commanche,C7757,0
99997,2071,255,12,234,63,342,192,247,193,247,Cache,C2706,2
99998,3248,255,12,730,113,725,192,247,193,2724,Commanche,C7756,1


Creating a training split
--

__We run the query below to get a repeatable sample of the data in BigQuery.__

In [9]:
# Materialize the training split into covertype_dataset.training.
# Each row is hashed (FARM_FINGERPRINT of its JSON form) and kept when the hash modulo 10
# is 1, 2, 3 or 4 — roughly 40% of the rows. Hashing the row content makes the sample
# repeatable across runs. --replace overwrites the destination table on re-runs.
# `-n 0` presumably suppresses echoing result rows to the notebook — confirm with `bq query --help`.
!bq query \
-n 0 \
--destination_table covertype_dataset.training \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `covertype_dataset.covertype` AS cover \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), 10) IN (1, 2, 3, 4)' 

Waiting on bqjob_r62de6020f6501c97_0000017f9555177a_1 ... (1s) Current status: DONE   


__We export the BigQuery training table to GCS at $TRAINING_FILE_PATH__

In [10]:
# Export the training table as CSV to the GCS path held in TRAINING_FILE_PATH.
!bq extract \
--destination_format CSV \
covertype_dataset.training \
$TRAINING_FILE_PATH

Waiting on bqjob_r1a578ffe791da479_0000017f95552b1d_1 ... (0s) Current status: DONE   


Creating a validation split
---

 __We create a validation split that takes 10% of the data using the `bq` command and export this split into the BigQuery table `covertype_dataset.validation`__

In [11]:
# Materialize the validation split into covertype_dataset.validation.
# Same row-hash scheme as the training split, but keeping only hash bucket 8 (about 10%
# of the rows); bucket 8 is not among the training buckets (1-4), so the splits are disjoint.
# `-n 0` presumably suppresses echoing result rows to the notebook — confirm with `bq query --help`.
!bq query \
-n 0 \
--destination_table covertype_dataset.validation \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `covertype_dataset.covertype` AS cover \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), 10) IN (8)' 

Waiting on bqjob_r3bc65f64f09d8a59_0000017f95555861_1 ... (1s) Current status: DONE   


In [12]:
# Export the validation table as CSV to the GCS path held in VALIDATION_FILE_PATH.
!bq extract \
--destination_format CSV \
covertype_dataset.validation \
$VALIDATION_FILE_PATH

Waiting on bqjob_r23b86d45a22ec838_0000017f955596a9_1 ... (0s) Current status: DONE   


In [13]:
# Display both GCS paths to confirm where the two splits were exported.
TRAINING_FILE_PATH, VALIDATION_FILE_PATH

('gs://mlops-youness/data/training/dataset.csv',
 'gs://mlops-youness/data/validation/dataset.csv')

In [14]:
# Read both exported splits from GCS into DataFrames, then print each frame's
# (rows, columns) shape on its own line as a quick check of the split sizes.
df_train = pd.read_csv(filepath_or_buffer=TRAINING_FILE_PATH)
df_validation = pd.read_csv(filepath_or_buffer=VALIDATION_FILE_PATH)
print(df_train.shape, df_validation.shape, sep='\n')

(40009, 13)
(9836, 13)


The training application
--

The training pipeline preprocesses data by standardizing all numeric features using `sklearn.preprocessing.StandardScaler` and encoding all categorical features using `sklearn.preprocessing.OneHotEncoder`. It uses a stochastic gradient descent linear classifier (`SGDClassifier`) for modeling.

In [15]:
# Positional column ranges in the feature matrix: columns 0-9 are treated as numeric
# and columns 10-11 as categorical.
numeric_feature_indexes = slice(0, 10)
categorical_feature_indexes = slice(10, 12)

# One transformer per column group: standardize the numeric block, one-hot encode
# the categorical block.
numeric_scaler = StandardScaler()
categorical_encoder = OneHotEncoder()
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_scaler, numeric_feature_indexes),
    ('cat', categorical_encoder, categorical_feature_indexes),
])

# Full model: preprocessing followed by a linear classifier trained with SGD.
classifier = SGDClassifier(loss='log', tol=1e-3)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [16]:
# Map every numeric feature column (selected by position) to float64, then apply the
# same cast to both splits so they share identical dtypes.
numeric_columns = df_train.columns[numeric_feature_indexes]
num_features_type_map = dict.fromkeys(numeric_columns, 'float64')

df_train = df_train.astype(dtype=num_features_type_map)
df_validation = df_validation.astype(dtype=num_features_type_map)

__Run the pipeline locally.__

In [17]:
# Separate the `Cover_Type` label from the feature columns in each split.
y_train = df_train['Cover_Type']
X_train = df_train.drop(columns='Cover_Type')
y_validation = df_validation['Cover_Type']
X_validation = df_validation.drop(columns='Cover_Type')

# Set the classifier's hyperparameters for this local run, then fit on the training
# split. `fit` stays the last expression so the fitted pipeline is displayed.
local_run_params = {'classifier__alpha': 0.001, 'classifier__max_iter': 200}
pipeline.set_params(**local_run_params)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  slice(0, 10, None)),
                                                 ('cat', OneHotEncoder(),
                                                  slice(10, 12, None))])),
                ('classifier',
                 SGDClassifier(alpha=0.001, loss='log', max_iter=200))])

In [18]:
# Validation accuracy of the locally fitted pipeline: predict on the held-out split
# and compare with the true labels.
validation_predictions = pipeline.predict(X_validation)
accuracy = accuracy_score(y_validation, validation_predictions)
print(accuracy)

0.6982513216754779


__Prepare the hyperparameter tuning application.__

In [19]:
# Local folder for the training application that is later submitted to Cloud Build.
# exist_ok=True makes the cell safe to re-run when the folder is already there.
TRAINING_APP_FOLDER = "training_app"
os.makedirs(name=TRAINING_APP_FOLDER, exist_ok=True)

In [20]:
# Name and tag of the trainer container image, and its full URI in the project's
# gcr.io registry.
IMAGE_NAME, IMAGE_TAG = 'trainer_image', 'latest'
image_coordinates = (PROJECT_ID, IMAGE_NAME, IMAGE_TAG)
IMAGE_URI = 'gcr.io/{}/{}:{}'.format(*image_coordinates)

In [21]:
# Display the image URI that the build cell tags and the training jobs use as master image.
IMAGE_URI

'gcr.io/zeta-rush-341516/trainer_image:latest'

__Build the docker image__

In [22]:
# Build the trainer image with Cloud Build from the contents of TRAINING_APP_FOLDER and
# tag it as IMAGE_URI.
# NOTE(review): this notebook only creates the folder; the files it must contain (e.g.
# the Dockerfile and the hptuning_config.yaml referenced by the tuning job) are not
# written here — confirm they are provided separately before running this cell.
!gcloud builds submit --tag $IMAGE_URI $TRAINING_APP_FOLDER

Creating temporary tarball archive of 6 file(s) totalling 6.0 KiB before compression.
Uploading tarball of [training_app] to [gs://zeta-rush-341516_cloudbuild/source/1647477979.084049-6af7f304c194456c88a30a32f6ef1f0f.tgz]
Created [https://cloudbuild.googleapis.com/v1/projects/zeta-rush-341516/locations/global/builds/3d083302-0757-42da-a50a-341705ff8e6b].
Logs are available at [https://console.cloud.google.com/cloud-build/builds/3d083302-0757-42da-a50a-341705ff8e6b?project=156920671469].
----------------------------- REMOTE BUILD OUTPUT ------------------------------
starting build "3d083302-0757-42da-a50a-341705ff8e6b"

FETCHSOURCE
Fetching storage object: gs://zeta-rush-341516_cloudbuild/source/1647477979.084049-6af7f304c194456c88a30a32f6ef1f0f.tgz#1647477979401432
Copying gs://zeta-rush-341516_cloudbuild/source/1647477979.084049-6af7f304c194456c88a30a32f6ef1f0f.tgz#1647477979401432...
/ [1 files][  1.7 KiB/  1.7 KiB]                                                
Operation completed

__Submit an AI Platform hyperparameter tuning job__

In [23]:
# A timestamped job name keeps each submission distinct and gives every job its own
# output folder under JOB_DIR_ROOT.
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(JOB_DIR_ROOT, JOB_NAME)
SCALE_TIER = "BASIC"

# Submit the custom-container training job in hyperparameter-tuning mode. The tuning
# configuration is read from hptuning_config.yaml inside the training app folder;
# everything after the bare `--` is forwarded to the training application as its own
# arguments (dataset paths and the --hptune switch).
!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
--config $TRAINING_APP_FOLDER/hptuning_config.yaml \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH \
--hptune

Job [JOB_20220317_004937] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20220317_004937

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20220317_004937
jobId: JOB_20220317_004937
state: QUEUED


In [24]:
# Display the name of the job that was just submitted.
JOB_NAME

'JOB_20220317_004937'

In [25]:
# Show the job's current state and submitted configuration. Re-run until the state is
# no longer QUEUED/RUNNING before retrieving the tuning results.
!gcloud ai-platform jobs describe $JOB_NAME

createTime: '2022-03-17T00:49:39Z'
etag: v-ELqiuttJU=
jobId: JOB_20220317_004937
jobPosition: '0'
startTime: '2022-03-17T00:49:43Z'
state: RUNNING
trainingInput:
  args:
  - --training_dataset_path=gs://mlops-youness/data/training/dataset.csv
  - --validation_dataset_path=gs://mlops-youness/data/validation/dataset.csv
  - --hptune
  hyperparameters:
    enableTrialEarlyStopping: true
    goal: MAXIMIZE
    hyperparameterMetricTag: accuracy
    maxParallelTrials: 2
    maxTrials: 4
    params:
    - discreteValues:
      - 200.0
      - 500.0
      parameterName: max_iter
      type: DISCRETE
    - maxValue: 0.001
      minValue: 1e-05
      parameterName: alpha
      scaleType: UNIT_LINEAR_SCALE
      type: DOUBLE
  jobDir: gs://mlops-youness/jobs/JOB_20220317_004937
  masterConfig:
    imageUri: gcr.io/zeta-rush-341516/trainer_image:latest
  region: us-central1
trainingOutput:
  hyperparameterMetricTag: accuracy
  isHyperparameterTuningJob: true

View job in the Cloud Console at:
http

__Retrieve HP-tuning results__

In [27]:
# Fetch the tuning job's description from the AI Platform Training REST API ('ml', v1).
# For a finished tuning job the response's `trainingOutput` carries the per-trial results
# that the following cells read.
ml = discovery.build('ml', 'v1')

job_id = 'projects/{}/jobs/{}'.format(PROJECT_ID, JOB_NAME)
request = ml.projects().jobs().get(name=job_id)

try:
    response = request.execute()
except errors.HttpError as err:
    # Show the API error, then re-raise: carrying on would leave `response` undefined
    # (or stale from an earlier run) for the cells below. Other exception types are no
    # longer swallowed by a bare `except`, so their tracebacks stay visible.
    print(err)
    raise

response

{'jobId': 'JOB_20220317_004937',
 'trainingInput': {'args': ['--training_dataset_path=gs://mlops-youness/data/training/dataset.csv',
   '--validation_dataset_path=gs://mlops-youness/data/validation/dataset.csv',
   '--hptune'],
  'hyperparameters': {'goal': 'MAXIMIZE',
   'params': [{'parameterName': 'max_iter',
     'type': 'DISCRETE',
     'discreteValues': [200, 500]},
    {'parameterName': 'alpha',
     'minValue': 1e-05,
     'maxValue': 0.001,
     'type': 'DOUBLE',
     'scaleType': 'UNIT_LINEAR_SCALE'}],
   'maxTrials': 4,
   'maxParallelTrials': 2,
   'hyperparameterMetricTag': 'accuracy',
   'enableTrialEarlyStopping': True},
  'region': 'us-central1',
  'jobDir': 'gs://mlops-youness/jobs/JOB_20220317_004937',
  'masterConfig': {'imageUri': 'gcr.io/zeta-rush-341516/trainer_image:latest'}},
 'createTime': '2022-03-17T00:49:39Z',
 'startTime': '2022-03-17T00:49:43Z',
 'endTime': '2022-03-17T01:06:51Z',
 'state': 'SUCCEEDED',
 'trainingOutput': {'completedTrialCount': '4',
  'tr

In [28]:
# Inspect the first trial in the returned list; the next cell reads its hyperparameters.
response['trainingOutput']['trials'][0]

{'trialId': '2',
 'hyperparameters': {'alpha': '0.00028772843036299335', 'max_iter': '500'},
 'finalMetric': {'trainingStep': '1', 'objectiveValue': 0.7068930459536397},
 'startTime': '2022-03-17T00:49:47.331239063Z',
 'endTime': '2022-03-17T00:57:14Z',
 'state': 'SUCCEEDED'}

__Retrain the model with the best hyperparameters__

In [29]:
# Reuse the hyperparameters of the first returned trial for the final training run.
# NOTE(review): this assumes trials[0] is the best trial — confirm the ordering the
# service guarantees for `trainingOutput.trials`.
first_trial_hyperparameters = response['trainingOutput']['trials'][0]['hyperparameters']
alpha = first_trial_hyperparameters['alpha']
max_iter = first_trial_hyperparameters['max_iter']

In [30]:
# New timestamped job name and output folder for the final training run. This
# overwrites JOB_NAME/JOB_DIR from the tuning job, so later cells refer to this job.
JOB_NAME = "JOB_{}".format(time.strftime("%Y%m%d_%H%M%S"))
JOB_DIR = "{}/{}".format(JOB_DIR_ROOT, JOB_NAME)
SCALE_TIER = "BASIC"

# Submit the same trainer image without a tuning config: the selected alpha and
# max_iter are passed explicitly to the training application, together with --nohptune.
!gcloud ai-platform jobs submit training $JOB_NAME \
--region=$REGION \
--job-dir=$JOB_DIR \
--master-image-uri=$IMAGE_URI \
--scale-tier=$SCALE_TIER \
-- \
--training_dataset_path=$TRAINING_FILE_PATH \
--validation_dataset_path=$VALIDATION_FILE_PATH \
--alpha=$alpha \
--max_iter=$max_iter \
--nohptune


Job [JOB_20220317_012710] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe JOB_20220317_012710

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs JOB_20220317_012710
jobId: JOB_20220317_012710
state: QUEUED


In [32]:
# Show the retraining job's state and arguments; wait for it to finish before listing
# its output folder.
!gcloud ai-platform jobs describe $JOB_NAME

createTime: '2022-03-17T01:27:11Z'
etag: AkwfjmR3dHQ=
jobId: JOB_20220317_012710
jobPosition: '0'
startTime: '2022-03-17T01:29:06Z'
state: RUNNING
trainingInput:
  args:
  - --training_dataset_path=gs://mlops-youness/data/training/dataset.csv
  - --validation_dataset_path=gs://mlops-youness/data/validation/dataset.csv
  - --alpha=0.00028772843036299335
  - --max_iter=500
  - --nohptune
  jobDir: gs://mlops-youness/jobs/JOB_20220317_012710
  masterConfig:
    imageUri: gcr.io/zeta-rush-341516/trainer_image:latest
  region: us-central1
trainingOutput: {}

View job in the Cloud Console at:
https://console.cloud.google.com/mlengine/jobs/JOB_20220317_012710?project=zeta-rush-341516

View logs at:
https://console.cloud.google.com/logs?resource=ml_job%2Fjob_id%2FJOB_20220317_012710&project=zeta-rush-341516


In [34]:
# List the retraining job's output folder; the captured output shows the saved model.pkl.
!gsutil ls $JOB_DIR

gs://mlops-youness/jobs/JOB_20220317_012710/model.pkl


Deploy the model to AI Platform Prediction
--

In [35]:
# Name and labels of the AI Platform Prediction model resource that will hold the versions.
model_name = 'forest_cover_classifier'
labels = "task=classifier,domain=forestry"

# Create the model resource.
# NOTE(review): re-running this cell after the model exists presumably fails — confirm.
!gcloud ai-platform models create  $model_name \
 --regions=$REGION \
 --labels=$labels

Using endpoint [https://ml.googleapis.com/]
Created ai platform model [projects/zeta-rush-341516/models/forest_cover_classifier].


__Create a model version__

In [36]:
# Version label for this deployment of the model.
model_version = 'v01'

# Create the version from the artifacts in JOB_DIR (the retraining job's folder, which
# holds model.pkl), served with the scikit-learn framework on the global endpoint.
# NOTE(review): the scikit-learn version of runtime 1.15 should match the one used by
# the trainer image that pickled the model — confirm they are compatible.
!gcloud ai-platform versions create {model_version} \
 --model={model_name} \
 --origin=$JOB_DIR \
 --runtime-version=1.15 \
 --framework=scikit-learn \
 --python-version=3.7\
 --region global

Using endpoint [https://ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


__Serve predictions__: Prepare the input file with JSON-formatted instances.

In [37]:
# Write the first five validation rows as newline-delimited JSON: one instance per
# line, each a JSON array of that row's feature values in column order.
input_file = 'serving_instances.json'

with open(input_file, 'w') as instances_out:
    instances_out.writelines(
        json.dumps(list(instance.values)) + '\n'
        for _, instance in X_validation.head().iterrows()
    )

In [38]:
# Print the generated file to check the instance format before sending it for prediction.
!cat $input_file

[2841.0, 45.0, 0.0, 644.0, 282.0, 1376.0, 218.0, 237.0, 156.0, 1003.0, "Commanche", "C4758"]
[2494.0, 180.0, 0.0, 0.0, 0.0, 819.0, 219.0, 238.0, 157.0, 5531.0, "Rawah", "C6101"]
[3153.0, 90.0, 0.0, 335.0, 11.0, 5842.0, 219.0, 237.0, 155.0, 930.0, "Rawah", "C7101"]
[3021.0, 90.0, 0.0, 42.0, 1.0, 4389.0, 219.0, 237.0, 155.0, 902.0, "Rawah", "C7745"]
[2916.0, 0.0, 0.0, 0.0, 0.0, 4562.0, 218.0, 238.0, 156.0, 5442.0, "Rawah", "C7745"]


__Invoke the model__

In [39]:
# Send the instances in input_file to the deployed model version on the global endpoint;
# the captured output is one predicted class label per instance.
!gcloud ai-platform predict \
--model $model_name \
--version $model_version \
--json-instances $input_file\
--region global

Using endpoint [https://ml.googleapis.com/]
[1, 1, 0, 1, 1]
