https://cloud.google.com/ai-platform/training/docs/getting-started-scikit-xgboost

https://cloud.google.com/ai-platform/prediction/docs/using-pipelines-for-preprocessing#gcloud_2

In [1]:
import sys

# Extend the module search path before importing gfw_forestlearn.
# NOTE(review): both entries are machine-specific absolute paths (presumably
# where gfw_forestlearn is installed on the author's machine) -- confirm and
# replace with a proper install before sharing this notebook.
sys.path.extend([
    '/usr/local/lib/python3.8/site-packages',
    '/Users/kristine/Library/Python/3.8/lib/python/site-packages',
])

from gfw_forestlearn import fl_binary_classification
#from gfw_forestlearn import fl_regression
#import pandas as pd
import numpy as np
from datetime import datetime


In [72]:
PROJECT_ID = "wri-congo-deforestation" #@param {type:"string"}
! gcloud config set project $PROJECT_ID
%env GOOGLE_APPLICATION_CREDENTIALS 'wri-congo-deforestation-92666c07ca00.json'
BUCKET_NAME = "wri-congo-deforestation" #@param {type:"string"}
REGION = "us-east4" #@param {type:"string"}
%env REGION = us-east4

Updated property [core/project].
env: GOOGLE_APPLICATION_CREDENTIALS='wri-congo-deforestation-92666c07ca00.json'
env: REGION=us-east4


In [3]:
# Copy testing_points.csv from the project bucket into the working directory;
# ai_platform_model.py (written in the next cell) reads it from there.
!gsutil cp gs://wri-congo-deforestation/testing_points.csv .

Copying gs://wri-congo-deforestation/testing_points.csv...
/ [1 files][  8.3 KiB/  8.3 KiB]                                                
Operation completed over 1 objects/8.3 KiB.                                      


In [4]:
%%writefile ai_platform_model.py
from gfw_forestlearn import fl_binary_classification
import pandas as pd
import subprocess

training_csv = 'testing_points.csv'
training_set = pd.read_csv(training_csv)
mask_column = ['TrainingSplit','ValidationSplit','system:index','.geo']
y_column = 'loss'


params = {
    'learn__n_estimators': [100],
    'learn__max_features': ['auto'],
    'learn__min_samples_leaf': [0.1],
    'learn__max_depth': [6]
}
predictors = ['earlyLossDistance','elevation','huntingAreasDistance','protectedAreasDistance',
              'roadsDistance','ruralComplexDistance','slope']
learning = fl_binary_classification.ForestLearn(predictors=predictors, y_column=y_column)
modelfilename= 'randfor.pkl'
out_modelfilename= 'randfor2.pkl'
cv_results_filename = 'cv_results.csv'
feature_importance_filename = 'feature_importance.csv'
out_train_file = 'out_train.csv'
scores_out_file = 'scores_out_file.csv'

learning.setup_rf_model_classify_scale()
learning.tune_param_set(training_set, params, modelfilename, cv_results_filename,k=3,
                        scoring='roc_auc', n_jobs=1,verbose=0)
learning.save_feature_importances(feature_importance_filename)
learning.fit_model_with_params(training_set, out_modelfilename,in_modelfilename=modelfilename)
learning.load_model_from_file(modelfilename)
learning.predict_supervised_data(training_csv, out_train_file, name='Train')
learning.save_scores(scores_out_file)



Overwriting ai_platform_model.py


In [63]:
%%writefile iris_sklearn_trainer/training.py
# This file is for training on AI Platform with scikit-learn.


# [START setup]
import datetime
import os
import subprocess
import sys
import pandas as pd
from sklearn import svm
import glob
import pandas as pd


training_csv = 'training_points.csv'
data_dir = 'gs://wri-congo-deforestation'
fl_module = 'gfw_forestlearn'

# gsutil outputs everything to stderr so we need to divert it to stdout.
subprocess.check_call(['gsutil', 'cp', os.path.join(data_dir, training_csv),'.'], stderr=sys.stdout)
subprocess.check_call(['gsutil', 'cp','-r', os.path.join(data_dir, fl_module),'.'], stderr=sys.stdout)


from gfw_forestlearn import fl_binary_classification

BUCKET_NAME = 'wri-congo-deforestation'
training_csv = 'training_points.csv'
training_set = pd.read_csv(training_csv)
mask_column = ['TrainingSplit','ValidationSplit','system:index','.geo']
y_column = 'loss'


params = {
    'learn__n_estimators': [100],
    'learn__max_features': ['auto'],
    'learn__min_samples_leaf': [0.1],
    'learn__max_depth': [6]
}
predictors = ['earlyLossDistance','elevation','huntingAreasDistance','protectedAreasDistance',
              'roadsDistance','ruralComplexDistance','slope']
learning = fl_binary_classification.ForestLearn(predictors=predictors, y_column=y_column)
modelfilename= 'randfor.pkl'
cv_results_filename = 'cv_results.csv'
scores_out_file = 'scores_out_file.csv'

learning.setup_rf_model_classify_scale()
trained_model = learning.tune_param_set(training_set, params, modelfilename, cv_results_filename,k=3,
                        scoring='roc_auc', n_jobs=1,verbose=0)



# [START upload-model]
# Upload the saved model file to Cloud Storage
gcs_model_path = os.path.join('gs://', BUCKET_NAME, modelfilename)
subprocess.check_call(['gsutil', 'cp', modelfilename, gcs_model_path],
    stderr=sys.stdout)
# [END upload-model]

Overwriting iris_sklearn_trainer/training.py


In [64]:
# Trainer package directory and the dotted path of its entry-point module,
# both derived from the one package name.
_TRAINER_PACKAGE = "iris_sklearn_trainer"
TRAINING_PACKAGE_PATH = "./{}/".format(_TRAINER_PACKAGE)
MAIN_TRAINER_MODULE = "{}.training".format(_TRAINER_PACKAGE)

In [65]:
!gcloud ai-platform local train \
  --package-path $TRAINING_PACKAGE_PATH \
  --module-name $MAIN_TRAINER_MODULE

Copying gs://wri-congo-deforestation/training_points.csv...
- [1 files][756.5 KiB/756.5 KiB]                                                
Operation completed over 1 objects/756.5 KiB.                                    
Copying gs://wri-congo-deforestation/gfw_forestlearn/__init__.py...
Copying gs://wri-congo-deforestation/gfw_forestlearn/__pycache__/__init__.cpython-38.pyc...
Copying gs://wri-congo-deforestation/gfw_forestlearn/__pycache__/fl_binary_classification.cpython-38.pyc...
Copying gs://wri-congo-deforestation/gfw_forestlearn/__pycache__/fl_regression.cpython-38.pyc...
/ [4 files][ 18.9 KiB/ 18.9 KiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://wri-congo-deforestation/gfw_forestlearn/fl_binary_classification.p

In [93]:
# datetime object containing current date and time
now = datetime.now()
dt_string = now.strftime("%Y%m%d_%H%M%S")
JOB_NAME = 'iris_sklearn_{}'.format(dt_string)
runtime_version = 2.3
%env BUCKET_NAME=wri-congo-deforestation
%env JOB_NAME=$JOB_NAME
%env JOB_DIR=gs://$BUCKET_NAME/scikit_learn_job_dir
%env TRAINING_PACKAGE_PATH=iris_sklearn_trainer/
%env MAIN_TRAINER_MODULE=iris_sklearn_trainer.training
%env REGION=us-east4
%env RUNTIME_VERSION=$runtime_version
%env PYTHON_VERSION=3.7
%env SCALE_TIER=BASIC
%env MODEL_DIR=gs://wri-congo-deforestation
%env VERSION_NAME=v0_1_2
%env MODEL_NAME=iris_sklearn
%env FRAMEWORK=SCIKIT_LEARN

env: BUCKET_NAME=wri-congo-deforestation
env: JOB_NAME=iris_sklearn_20210308_161451
env: JOB_DIR=gs://wri-congo-deforestation/scikit_learn_job_dir
env: TRAINING_PACKAGE_PATH=iris_sklearn_trainer/
env: MAIN_TRAINER_MODULE=iris_sklearn_trainer.training
env: REGION=us-east4
env: RUNTIME_VERSION=2.3
env: PYTHON_VERSION=3.7
env: SCALE_TIER=BASIC


In [67]:
!gcloud ai-platform jobs submit training $JOB_NAME \
  --job-dir $JOB_DIR \
  --package-path $TRAINING_PACKAGE_PATH \
  --module-name $MAIN_TRAINER_MODULE \
  --region $REGION \
  --runtime-version=$RUNTIME_VERSION \
  --python-version=$PYTHON_VERSION \
  --scale-tier $SCALE_TIER

Job [iris_sklearn_20210308_131146] submitted successfully.
Your job is still active. You may view the status of your job with the command

  $ gcloud ai-platform jobs describe iris_sklearn_20210308_131146

or continue streaming the logs with the command

  $ gcloud ai-platform jobs stream-logs iris_sklearn_20210308_131146
jobId: iris_sklearn_20210308_131146
state: QUEUED


In [102]:
%env MODEL_DIR=gs://wri-congo-deforestation
%env VERSION_NAME=v0_1_2
%env MODEL_NAME=iris_sklearn
%env FRAMEWORK=SCIKIT_LEARN


env: MODEL_DIR=gs://wri-congo-deforestation
env: VERSION_NAME=v0_1_2
env: MODEL_NAME=iris_sklearn
env: FRAMEWORK=SCIKIT_LEARN


In [97]:
!gcloud ai-platform models create $MODEL_NAME \
  --region=$REGION

Using endpoint [https://us-east4-ml.googleapis.com/]
[1;31mERROR:[0m (gcloud.ai-platform.models.create) Resource in projects [wri-congo-deforestation] is the subject of a conflict: Field: model.name Error: A model with the same name already exists.
- '@type': type.googleapis.com/google.rpc.BadRequest
  fieldViolations:
  - description: A model with the same name already exists.
    field: model.name


In [103]:
!gcloud ai-platform versions create $VERSION_NAME \
  --model=$MODEL_NAME \
  --origin=$MODEL_DIR \
  --runtime-version=$RUNTIME_VERSION \
  --framework=$FRAMEWORK \
  --python-version=3.7 \
  --region=$REGION \
  --machine-type=$MACHINE_TYPE

Using endpoint [https://us-east4-ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    
