Importing packages
---

__Before running the cells below, we need to run the install script requirements.sh.__

In [1]:
import json
import os
import numpy as np
import pandas as pd
import pickle
import uuid
import time
import tempfile

from googleapiclient import discovery
from googleapiclient import errors

from google.cloud import bigquery
from jinja2 import Template
from kfp.components import func_to_container_op
from typing import NamedTuple

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
!(gcloud config get-value core/project)

zeta-rush-341516


In [11]:
PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
DATASET_ID='covertype_dataset'
DATASET_LOCATION='US'
TABLE_ID='covertype'
DATA_SOURCE='gs://workshop-datasets/covertype/small/dataset.csv'

SCHEMA='Elevation:INTEGER,Aspect:INTEGER,Slope:INTEGER,Horizontal_Distance_To_Hydrology:\
INTEGER,Vertical_Distance_To_Hydrology:INTEGER,Horizontal_Distance_To_Roadways:INTEGER,Hillshade_9am:\
INTEGER,Hillshade_Noon:INTEGER,Hillshade_3pm:INTEGER,Horizontal_Distance_To_Fire_Points:INTEGER,\
Wilderness_Area:STRING,Soil_Type:STRING,Cover_Type:INTEGER'

__We create the BigQuery dataset and upload the Covertype CSV data into a table.__

The pipeline ingests data from BigQuery. The cell below uploads the Covertype dataset to BigQuery.

In [12]:
!bq --location=$DATASET_LOCATION --project_id=$PROJECT_ID mk --dataset $DATASET_ID

Dataset 'zeta-rush-341516:covertype_dataset' successfully created.


In [13]:
!bq --project_id=$PROJECT_ID --dataset_id=$DATASET_ID load \
--source_format=CSV \
--skip_leading_rows=1 \
--replace \
$TABLE_ID \
$DATA_SOURCE \
$SCHEMA

Waiting on bqjob_r1a8f8fe8edfb9d5e_0000017f8aa4139e_1 ... (2s) Current status: DONE   


Configuring environment settings
---

In [15]:
# List the available Cloud Storage buckets.
!gsutil ls

gs://artifacts.zeta-rush-341516.appspot.com/
gs://cloud-ai-platform-4aa74d0a-5386-461c-8135-3f0feac88a35/
gs://cloud-ai-platform-fffcccf5-f8f6-480b-9bb1-7a0a20be6be1/
gs://mlops-youness/
gs://storage_bucket_speech/


In [33]:
REGION = 'us-central1'
ARTIFACT_STORE = 'gs://mlops-youness'

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]
DATA_ROOT='{}/data'.format(ARTIFACT_STORE)
JOB_DIR_ROOT='{}/jobs'.format(ARTIFACT_STORE)
TRAINING_FILE_PATH='{}/{}/{}'.format(DATA_ROOT, 'training', 'dataset.csv')
VALIDATION_FILE_PATH='{}/{}/{}'.format(DATA_ROOT, 'validation', 'dataset.csv')

Exploring the Covertype dataset
--

In [29]:
%%bigquery
-- Fetch every column of the covertype table for a first look at the data.
SELECT
  *
FROM
  `covertype_dataset.covertype`

Query complete after 0.00s: 100%|██████████| 1/1 [00:00<00:00, 985.97query/s] 
Downloading: 100%|██████████| 100000/100000 [00:01<00:00, 98886.35rows/s]


Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type
0,2085,256,18,150,27,738,176,248,208,914,Cache,C2702,5
1,2125,256,20,30,12,871,169,248,215,300,Cache,C2702,2
2,2146,256,34,150,62,1253,122,237,239,511,Cache,C2702,2
3,2186,256,38,210,102,1294,109,232,244,552,Cache,C2702,2
4,2831,256,25,277,183,1706,153,246,225,1485,Commanche,C2705,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3136,254,12,319,60,5734,193,248,193,2467,Rawah,C7746,1
99996,3242,254,12,636,148,3551,193,248,193,2010,Commanche,C7757,0
99997,2071,255,12,234,63,342,192,247,193,247,Cache,C2706,2
99998,3248,255,12,730,113,725,192,247,193,2724,Commanche,C7756,1


Creating a training split
--

In [18]:
!bq query \
-n 0 \
--destination_table covertype_dataset.training \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `covertype_dataset.covertype` AS cover \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), 10) IN (1, 2, 3, 4)' 

Waiting on bqjob_r169ba5514bcb90a1_0000017f8ac0a8f4_1 ... (1s) Current status: DONE   


In [19]:
!bq extract \
--destination_format CSV \
covertype_dataset.training \
$TRAINING_FILE_PATH

Waiting on bqjob_r5789390986c4e704_0000017f8ac0f4ad_1 ... (0s) Current status: DONE   


Creating a validation split
---

In [20]:
!bq query \
-n 0 \
--destination_table covertype_dataset.validation \
--replace \
--use_legacy_sql=false \
'SELECT * \
FROM `covertype_dataset.covertype` AS cover \
WHERE \
MOD(ABS(FARM_FINGERPRINT(TO_JSON_STRING(cover))), 10) IN (8)' 

Waiting on bqjob_r1dc102be3e54f4f9_0000017f8ad637e8_1 ... (1s) Current status: DONE   


In [21]:
!bq extract \
--destination_format CSV \
covertype_dataset.validation \
$VALIDATION_FILE_PATH

Waiting on bqjob_r494f9481d6d3fff7_0000017f8ad657ef_1 ... (0s) Current status: DONE   


In [37]:
# Display the Cloud Storage paths the training and validation splits were exported to.
TRAINING_FILE_PATH, VALIDATION_FILE_PATH

('gs://mlops-youness/data/training/dataset.csv',
 'gs://mlops-youness/data/validation/dataset.csv')

In [38]:
# Read both exported splits into DataFrames, then print each one's
# (rows, columns) shape: training first, validation second.
df_train, df_validation = (
    pd.read_csv(csv_path)
    for csv_path in (TRAINING_FILE_PATH, VALIDATION_FILE_PATH)
)
for split_frame in (df_train, df_validation):
    print(split_frame.shape)

(40009, 13)
(9836, 13)


In [39]:
# Positional column selectors for the feature matrix: columns 0-9 are scaled
# as numeric features and columns 10-11 are one-hot encoded as categoricals.
# NOTE(review): assumes the CSV column order matches the BigQuery schema
# (ten INTEGER features, then Wilderness_Area and Soil_Type) - confirm.
numeric_feature_indexes = slice(0, 10)
categorical_feature_indexes = slice(10, 12)

# Standardise the numeric columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_feature_indexes),
        ('cat', OneHotEncoder(), categorical_feature_indexes),
    ])

# Preprocessing followed by a linear classifier trained with SGD on the
# logistic loss. random_state pins the data shuffling done during training,
# so re-running the notebook produces the same model and metrics.
# NOTE(review): scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3
# removed the old spelling - confirm against the pinned scikit-learn version.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SGDClassifier(loss='log', tol=1e-3, random_state=42)),
])