In [1]:
import kfp
import matplotlib.pyplot as plt
import pandas as pd
import requests

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, ClassificationMetrics, Metrics, component)

from google.cloud import aiplatform
from google.cloud.aiplatform import pipeline_jobs
from typing import NamedTuple

# We'll use this beta library for metadata querying
from google.cloud import aiplatform_v1beta1
from datetime import datetime

In [2]:
# Read the current PATH through the IPython %env magic and re-export it with the
# user-level bin directory appended (presumably so user-installed CLI tools resolve).
PATH=%env PATH
%env PATH={PATH}:/home/jupyter/.local/bin
# Cloud region name.
REGION="us-central1"

# GCS prefix passed to @dsl.pipeline as the default pipeline root.
PIPELINE_ROOT = 'gs://propensity_assets/pipeline_root'
PIPELINE_ROOT

env: PATH=/opt/conda/bin:/opt/conda/condabin:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/games:/home/jupyter/.local/bin


'gs://propensity_assets/pipeline_root'

In [3]:
# Pipeline-wide constants; the pipeline definition passes these to every component.
VIEW_NAME = 'ga_data' # BigQuery view created by create_input_view and used as model input
DATA_SET_ID = 'propensity' # BigQuery dataset ID where the view (and the BQML models) live
PROJECT_ID = 'tai-demo-experimental-gke' # GCP project that owns the dataset
BUCKET_NAME = 'tai-test' # GCS bucket where the base_sql.txt file lives
BLOB_PATH = 'tai_test_pipeline/base_sql.txt' # Object path of the SQL template inside BUCKET_NAME

In [4]:
@component(
    # this component builds a BQ view, which will be the underlying source for model
    packages_to_install=["google-cloud-bigquery", "google-cloud-storage"],
    base_image="python:3.9",
    output_component_file="output_component/create_input_view.yaml",
)
def create_input_view(view_name: str,
                      data_set_id: str,
                      project_id: str,
                      bucket_name: str,
                      blob_path: str
):
    """Create the BigQuery view that feeds model training, unless it already exists.

    The view's SQL is a template stored in GCS. It is downloaded, its
    ``{start_date}``, ``{end_date}``, ``{ga_data_ref}`` and ``{conversion}``
    placeholders are filled in, and the result becomes the view query.

    Args:
        view_name: Name of the view to create.
        data_set_id: BigQuery dataset that will contain the view.
        project_id: Project used for the BigQuery client.
        bucket_name: GCS bucket holding the SQL template.
        blob_path: Object path of the SQL template inside ``bucket_name``.

    Raises:
        FileNotFoundError: If the SQL template object does not exist.
    """
    from google.cloud import bigquery
    from google.cloud import storage
    from google.cloud.exceptions import NotFound

    client = bigquery.Client(project=project_id)
    view_ref = client.dataset(data_set_id).table(view_name)

    ga_data_ref = 'bigquery-public-data.google_analytics_sample.ga_sessions_*'
    conversion = "hits.page.pageTitle like '%Shopping Cart%'" # this is sql like syntax used to define the conversion in the GA360 raw export
    start_date = '20170101'
    end_date = '20170131'

    def get_sql(bucket_name, blob_path):
        """Download the SQL template from GCS and return it as bytes."""
        storage_client = storage.Client()
        bucket = storage_client.get_bucket(bucket_name)
        blob = bucket.get_blob(blob_path)
        if blob is None:
            # get_blob returns None for a missing object; fail with a clear message
            # instead of an AttributeError on the next line.
            raise FileNotFoundError(f"gs://{bucket_name}/{blob_path} does not exist")
        return blob.download_as_string()

    def if_tbl_exists(client, table_ref):
        """Return True when the table/view can be fetched, False on NotFound."""
        try:
            client.get_table(table_ref)
            return True
        except NotFound:
            return False

    if if_tbl_exists(client, view_ref):
        print("view already exists")
        return

    # load sql from base_sql.txt.  This can be modified if you want to modify your query
    # (the original called get_sql() with no arguments, which raised TypeError)
    content = get_sql(bucket_name, blob_path).decode('utf-8')
    create_base_feature_set_query = content.format(start_date = start_date,
                                                   end_date = end_date,
                                                   ga_data_ref = ga_data_ref,
                                                   conversion = conversion)

    base_feature_set_view = bigquery.Table(view_ref)
    # NOTE(review): second positional format pass kept from the original; it only has
    # an effect if the rendered SQL still contains a `{}` / `{0}` placeholder - confirm
    # against base_sql.txt.
    base_feature_set_view.view_query = create_base_feature_set_query.format(project_id)
    client.create_table(base_feature_set_view)  # API request

In [5]:
@component(
    # this component builds a logistic regression with BQML
    packages_to_install=["google-cloud-bigquery"],
    base_image="python:3.9",
    output_component_file="output_component/create_bqml_model_logistic.yaml"
)
def build_bqml_logistic(project_id: str,
                        data_set_id: str,
                        model_name: str,
                        training_view: str
):
    """Train a BigQuery ML logistic-regression model on the training view.

    Args:
        project_id: Project used for the BigQuery client and to qualify names.
        data_set_id: Dataset containing the training view; the model is created here too.
        model_name: Unqualified name of the model to create or replace.
        training_view: Unqualified name of the view to train on.
    """
    from google.cloud import bigquery
    client = bigquery.Client(project=project_id)

    model_name = f"{project_id}.{data_set_id}.{model_name}"
    training_set = f"{project_id}.{data_set_id}.{training_view}"
    build_model_query_bqml_logistic = '''
    CREATE OR REPLACE MODEL `{model_name}`
    OPTIONS(model_type='logistic_reg'
    , INPUT_LABEL_COLS = ['label']
    , L1_REG = 1
    , DATA_SPLIT_METHOD = 'RANDOM'
    , DATA_SPLIT_EVAL_FRACTION = 0.20
    ) AS
        SELECT * EXCEPT (fullVisitorId, label), 
        CASE WHEN label is null then 0 ELSE label end as label
    FROM `{training_set}`
    '''.format(model_name = model_name, training_set = training_set)

    # client.query() only submits the job. Block on result() so this step finishes
    # (and surfaces SQL errors) before the downstream evaluation step starts.
    client.query(build_model_query_bqml_logistic).result()

In [6]:
@component(
    # this component builds an xgboost classifier with BQML
    packages_to_install=["google-cloud-bigquery"],
    base_image="python:3.9",
    output_component_file="output_component/create_bqml_model_xgboost.yaml"
)
def build_bqml_xgboost(project_id: str,
                       data_set_id: str,
                       model_name: str,
                       training_view: str
):
    """Train a BigQuery ML boosted-tree classifier on the training view.

    Args:
        project_id: Project used for the BigQuery client and to qualify names.
        data_set_id: Dataset containing the training view; the model is created here too.
        model_name: Unqualified name of the model to create or replace.
        training_view: Unqualified name of the view to train on.
    """
    from google.cloud import bigquery
    client = bigquery.Client(project=project_id)

    model_name = f"{project_id}.{data_set_id}.{model_name}"
    training_set = f"{project_id}.{data_set_id}.{training_view}"
    build_model_query_bqml_xgboost = '''
    CREATE OR REPLACE MODEL `{model_name}`
    OPTIONS(model_type='BOOSTED_TREE_CLASSIFIER'
    , INPUT_LABEL_COLS = ['label']
    , L1_REG = 1
    , DATA_SPLIT_METHOD = 'RANDOM'
    , DATA_SPLIT_EVAL_FRACTION = 0.20
    ) AS
        SELECT * EXCEPT (fullVisitorId, label), 
        CASE WHEN label is null then 0 ELSE label end as label
    FROM `{training_set}`
    '''.format(model_name = model_name, training_set = training_set)

    # client.query() only submits the job. Block on result() so this step finishes
    # (and surfaces SQL errors) before the downstream evaluation step starts.
    client.query(build_model_query_bqml_xgboost).result()

In [7]:
@component(
    # this component builds an AutoML classifier with BQML
    packages_to_install=["google-cloud-bigquery"],
    base_image="python:3.9",
    output_component_file="output_component/create_bqml_model_automl.yaml"
)
def build_bqml_automl(project_id: str,
                      data_set_id: str,
                      model_name: str,
                      training_view: str
):
    """Train a BigQuery ML AutoML classifier on the training view.

    Args:
        project_id: Project used for the BigQuery client and to qualify names.
        data_set_id: Dataset containing the training view; the model is created here too.
        model_name: Unqualified name of the model to create or replace.
        training_view: Unqualified name of the view to train on.
    """
    from google.cloud import bigquery
    client = bigquery.Client(project=project_id)

    model_name = f"{project_id}.{data_set_id}.{model_name}"
    training_set = f"{project_id}.{data_set_id}.{training_view}"
    # The original requested 'BOOSTED_TREE_CLASSIFIER' here (copied from the xgboost
    # component), so the "AutoML" model was a second boosted-tree model.
    build_model_query_bqml_automl = '''
    CREATE OR REPLACE MODEL `{model_name}`
    OPTIONS(model_type='AUTOML_CLASSIFIER'
    , INPUT_LABEL_COLS = ['label']
    ) AS
        SELECT * EXCEPT (fullVisitorId, label), 
        CASE WHEN label is null then 0 ELSE label end as label
    FROM `{training_set}`
    '''.format(model_name = model_name, training_set = training_set)

    # client.query() only submits the job. Block on result() so this step finishes
    # (and surfaces SQL errors) before the downstream evaluation step starts.
    client.query(build_model_query_bqml_automl).result()

In [8]:
@component(
    # this component builds an xgboost classifier with xgboost
    # ("scikit-learn" replaces the deprecated "sklearn" PyPI alias)
    packages_to_install=["google-cloud-bigquery", "xgboost", "pandas", "scikit-learn", "joblib", "pyarrow"],
    base_image="python:3.9",
    output_component_file="output_component/create_xgb_model_xgboost.yaml"
)
def build_xgb_xgboost(project_id: str,
                      data_set_id: str,
                      training_view: str,
                      metrics: Output[Metrics],
                      model: Output[Model]
):
    """Train an XGBoost classifier on the training view and log hold-out metrics.

    Reads the whole view into pandas, one-hot encodes the non-numeric columns,
    runs a small randomized hyper-parameter search and writes the best estimator
    with joblib next to the model artifact path.

    Args:
        project_id: Project used for the BigQuery client and to qualify the view.
        data_set_id: Dataset containing the training view.
        training_view: Unqualified name of the view to read.
        metrics: Output artifact receiving accuracy, AUC, framework and dataset size.
        model: Output artifact; the estimator is dumped to ``model.path + ".joblib"``.
    """
    from google.cloud import bigquery
    from xgboost import XGBClassifier
    from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
    from sklearn.metrics import accuracy_score, roc_auc_score
    from joblib import dump
    import pandas as pd

    seed = 1001  # same seed the CV splitter and the search already used

    client = bigquery.Client(project=project_id)

    data_set = f"{project_id}.{data_set_id}.{training_view}"
    build_df_for_xgboost = '''
    SELECT * FROM `{data_set}`
    '''.format(data_set = data_set)

    df = client.query(build_df_for_xgboost).to_dataframe()  # Make an API request.
    # get_dummies encodes the non-numeric columns; the prefix list must have one
    # entry per encoded column (three are expected here).
    df = pd.get_dummies(df.drop(['fullVisitorId'], axis=1), prefix=['visited_dma', 'visited_daypart', 'visited_dow'])

    X = df.drop(['label'], axis=1).values
    y = df['label'].values

    # Seed the split so reruns log comparable metrics.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

    params = {
        'reg_lambda':[0,1],
        'gamma': [1, 1.5, 2, 2.5, 3],
        'max_depth':[2,3,4,5,10,20],
        'learning_rate': [.1,.01]
    }

    # `silent` is no longer an XGBoost parameter (it only triggers a warning);
    # verbosity=0 is the current way to keep training quiet.
    xgb_model = XGBClassifier(n_estimators=50, objective='binary:hinge',
                              verbosity=0, nthread=1,
                              eval_metric="auc")

    folds = 3
    param_comb = 5

    skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = seed)

    random_search = RandomizedSearchCV(xgb_model, param_distributions=params,
                                       n_iter=param_comb, scoring='precision',
                                       n_jobs=4, cv=skf.split(X_train,y_train), verbose=3,
                                       random_state=seed )

    random_search.fit(X_train, y_train)
    xgb_model_best = random_search.best_estimator_
    predictions = xgb_model_best.predict(X_test)
    score = accuracy_score(y_test, predictions)
    # NOTE(review): predict() returns hard class labels, so this AUC is computed from
    # 0/1 predictions rather than scores - confirm this is the intended metric.
    auc = roc_auc_score(y_test, predictions)

    metrics.log_metric("accuracy",(score * 100.0))
    metrics.log_metric("framework", "xgboost")
    metrics.log_metric("dataset_size", len(df))
    metrics.log_metric("AUC", auc)

    dump(xgb_model_best, model.path + ".joblib")


In [9]:
@component(
    # this component evaluates BQML Logistic Regression
    packages_to_install=["google-cloud-bigquery", "pandas", "pyarrow", "matplotlib"],
    base_image="python:3.9",
    output_component_file="output_component/evaluate_bqml_model_logistic.yaml"
)
def evaluate_bqml_logistic(project_id: str,
                           data_set_id: str,
                           model_name: str,
                           training_view: str,
                           logistic_data_path: OutputPath("Dataset")
):
    """Export the ROC-curve table of the BQML logistic model as CSV plus a PNG plot.

    Args:
        project_id: Project used for the BigQuery client and to qualify names.
        data_set_id: Dataset containing the model and the evaluation view.
        model_name: Unqualified name of the model to evaluate.
        training_view: Unqualified name of the view passed to ML.ROC_CURVE.
        logistic_data_path: Output path; the CSV is written here and the
            precision/recall plot to the same path with ``.png`` appended.
    """
    from google.cloud import bigquery
    client = bigquery.Client(project=project_id)

    model_name = project_id+'.'+data_set_id+'.'+model_name
    training_set = project_id+'.'+data_set_id+'.'+training_view
    # SAFE_DIVIDE yields NULL instead of a division-by-zero error at thresholds
    # where nothing is predicted positive (true_positives + false_positives = 0).
    evaluate_model_query_bqml_logistic = '''    
    SELECT
      round(threshold, 2) as threshold,
      * except(threshold), 
      SAFE_DIVIDE(true_positives, true_positives + false_positives) AS precision
    FROM
      ML.ROC_CURVE(MODEL `{model_name}`,
                   TABLE `{table_name}`,
                   GENERATE_ARRAY(0,1, 0.01))

    ORDER BY threshold
    '''.format(model_name = model_name, table_name = training_set)

    query_job = client.query(evaluate_model_query_bqml_logistic)  # Make an API request.
    df_evaluation_logistic = query_job.result().to_dataframe()
    df_evaluation_logistic.to_csv(logistic_data_path)

    # Give the figure an explicit .png name so it can never replace the CSV above.
    graph = df_evaluation_logistic.plot(x='threshold', y=['precision', 'recall']).get_figure()
    graph.savefig(logistic_data_path + ".png")


In [10]:
@component(
    # this component evaluates BQML xgboost
    packages_to_install=["google-cloud-bigquery", "pandas", "pyarrow", "matplotlib"],
    base_image="python:3.9",
    output_component_file="output_component/evaluate_bqml_model_xgboost.yaml"
)
def evaluate_bqml_xgboost(project_id: str,
                          data_set_id: str,
                          model_name: str,
                          training_view: str,
                          xgboost_data_path: OutputPath("Dataset")
):
    """Export the ROC-curve table of the BQML boosted-tree model as CSV plus a PNG plot.

    Args:
        project_id: Project used for the BigQuery client and to qualify names.
        data_set_id: Dataset containing the model and the evaluation view.
        model_name: Unqualified name of the model to evaluate.
        training_view: Unqualified name of the view passed to ML.ROC_CURVE.
        xgboost_data_path: Output path; the CSV is written here and the
            precision/recall plot to the same path with ``.png`` appended.
    """
    from google.cloud import bigquery
    client = bigquery.Client(project=project_id)

    model_name = f"{project_id}.{data_set_id}.{model_name}"
    training_set = f"{project_id}.{data_set_id}.{training_view}"
    # SAFE_DIVIDE yields NULL instead of a division-by-zero error at thresholds
    # where nothing is predicted positive (true_positives + false_positives = 0).
    evaluate_model_query_bqml_xgboost = '''    
    SELECT
      round(threshold, 2) as threshold,
      * except(threshold), 
      SAFE_DIVIDE(true_positives, true_positives + false_positives) AS precision
    FROM
      ML.ROC_CURVE(MODEL `{model_name}`,
                   TABLE `{table_name}`,
                   GENERATE_ARRAY(0,1, 0.01))

    ORDER BY threshold
    '''.format(model_name = model_name, table_name = training_set)

    query_job = client.query(evaluate_model_query_bqml_xgboost)  # Make an API request.
    df_evaluation_xgboost = query_job.result().to_dataframe()
    df_evaluation_xgboost.to_csv(xgboost_data_path)

    # Give the figure an explicit .png name so it can never replace the CSV above.
    graph = df_evaluation_xgboost.plot(x='threshold', y=['precision', 'recall']).get_figure()
    graph.savefig(xgboost_data_path + ".png")

In [11]:
@component(
    # this component evaluates BQML autoML
    packages_to_install=["google-cloud-bigquery", "pandas", "pyarrow", "matplotlib"],
    base_image="python:3.9",
    output_component_file="output_component/evaluate_bqml_model_automl.yaml"
)
def evaluate_bqml_automl(project_id: str,
                         data_set_id: str,
                         model_name: str,
                         training_view: str,
                         automl_data_path: OutputPath("Dataset")
):
    """Export the ROC-curve table of the BQML AutoML model as CSV plus a PNG plot.

    Args:
        project_id: Project used for the BigQuery client and to qualify names.
        data_set_id: Dataset containing the model and the evaluation view.
        model_name: Unqualified name of the model to evaluate.
        training_view: Unqualified name of the view passed to ML.ROC_CURVE.
        automl_data_path: Output path; the CSV is written here and the
            precision/recall plot to the same path with ``.png`` appended.
    """
    from google.cloud import bigquery
    client = bigquery.Client(project=project_id)

    model_name = f"{project_id}.{data_set_id}.{model_name}"
    training_set = f"{project_id}.{data_set_id}.{training_view}"
    # SAFE_DIVIDE yields NULL instead of a division-by-zero error at thresholds
    # where nothing is predicted positive (true_positives + false_positives = 0).
    evaluate_model_query_bqml_automl = '''    
    SELECT
      round(threshold, 2) as threshold,
      * except(threshold), 
      SAFE_DIVIDE(true_positives, true_positives + false_positives) AS precision
    FROM
      ML.ROC_CURVE(MODEL `{model_name}`,
                   TABLE `{table_name}`,
                   GENERATE_ARRAY(0,1, 0.01))

    ORDER BY threshold
    '''.format(model_name = model_name, table_name = training_set)

    query_job = client.query(evaluate_model_query_bqml_automl)  # Make an API request.
    df_evaluation_automl = query_job.result().to_dataframe()
    df_evaluation_automl.to_csv(automl_data_path)

    # Give the figure an explicit .png name so it can never replace the CSV above.
    graph = df_evaluation_automl.plot(x='threshold', y=['precision', 'recall']).get_figure()
    graph.savefig(automl_data_path + ".png")

In [12]:
@component(
    # Deploys xgboost model
    # ("scikit-learn" replaces the deprecated "sklearn" PyPI alias)
    packages_to_install=["google-cloud-aiplatform", "joblib", "scikit-learn", "xgboost"],
    base_image="python:3.9",
    output_component_file="output_component/xgboost_deploy_component.yaml",
)
def deploy_xgb(
    model: Input[Model],
    project_id: str,
    vertex_endpoint: Output[Artifact],
    vertex_model: Output[Model]
):
    """Upload the trained XGBoost artifact to Vertex AI and deploy it to an endpoint.

    Args:
        model: Model artifact from the training step; the directory containing
            it is used as the upload ``artifact_uri``.
        project_id: Project used to initialise the Vertex AI SDK.
        vertex_endpoint: Output artifact whose URI is set to the endpoint resource name.
        vertex_model: Output artifact whose URI is set to the uploaded model resource name.
    """
    from google.cloud import aiplatform
    aiplatform.init(project=project_id)

    # Strip only the final path segment of the artifact URI. The original
    # `uri.replace("model", "")` removed every occurrence of "model", corrupting
    # the URI whenever a bucket or folder name contains that word.
    artifact_dir = model.uri.rpartition("/")[0] + "/"

    deployed_model = aiplatform.Model.upload(
        display_name="tai-propensity-test-pipeline",
        artifact_uri=artifact_dir,
        serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-4:latest"
    )
    endpoint = deployed_model.deploy(machine_type="n1-standard-4")

    # Save data to the output params
    vertex_endpoint.uri = endpoint.resource_name
    vertex_model.uri = deployed_model.resource_name


In [13]:
@dsl.pipeline(
    # Default pipeline root. You can override it when submitting the pipeline.
    pipeline_root=PIPELINE_ROOT,
    # A name for the pipeline.
    name="pipeline-test",
    description='Propensity BQML Test'
)
def pipeline():
    # DAG: build the input view, train four models on it, evaluate the three BQML
    # models, and deploy the XGBoost model produced by the custom training step.
    bq_target = dict(project_id=PROJECT_ID, data_set_id=DATA_SET_ID)
    on_view = dict(training_view=VIEW_NAME, **bq_target)

    view_task = create_input_view(view_name=VIEW_NAME,
                                  bucket_name=BUCKET_NAME,
                                  blob_path=BLOB_PATH,
                                  **bq_target)

    # Training steps.
    logistic_train = build_bqml_logistic(model_name='bqml_logistic_model', **on_view)
    xgboost_train = build_bqml_xgboost(model_name='bqml_xgboost_model', **on_view)
    automl_train = build_bqml_automl(model_name='bqml_automl_model', **on_view)
    xgb_train = build_xgb_xgboost(**on_view)

    # Evaluation steps for the BQML models.
    logistic_eval = evaluate_bqml_logistic(model_name='bqml_logistic_model', **on_view)
    xgboost_eval = evaluate_bqml_xgboost(model_name='bqml_xgboost_model', **on_view)
    automl_eval = evaluate_bqml_automl(model_name='bqml_automl_model', **on_view)

    # Deployment depends on training implicitly through the model artifact.
    deploy_xgb(project_id=PROJECT_ID, model=xgb_train.outputs["model"])

    # Explicit ordering: every trainer waits for the view, every evaluator for its trainer.
    for trainer in (logistic_train, xgboost_train, automl_train, xgb_train):
        trainer.after(view_task)

    for evaluator, trainer in ((logistic_eval, logistic_train),
                               (xgboost_eval, xgboost_train),
                               (automl_eval, automl_train)):
        evaluator.after(trainer)

In [14]:
# Compile the pipeline function into the job spec file that later cells submit and schedule.
pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=pipeline, package_path="pipeline.json")

In [15]:
# A timestamp suffix gives each submission its own job id.
TIMESTAMP = datetime.now().strftime("%Y%m%d%H%M%S")

# Pass project and region explicitly so the job does not depend on whatever
# defaults the notebook environment happens to be configured with.
run = pipeline_jobs.PipelineJob(
    display_name="test-pipeine",
    template_path="pipeline.json",
    job_id="test-{0}".format(TIMESTAMP),
    enable_caching=True,
    project=PROJECT_ID,
    location=REGION,
)

In [16]:
# Submit the pipeline job; the recorded output below shows it polling the job state.
run.run()

INFO:google.cloud.aiplatform.pipeline_jobs:Creating PipelineJob
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob created. Resource name: projects/449379755990/locations/us-central1/pipelineJobs/test-20210928201453
INFO:google.cloud.aiplatform.pipeline_jobs:To use this PipelineJob in another session:
INFO:google.cloud.aiplatform.pipeline_jobs:pipeline_job = aiplatform.PipelineJob.get('projects/449379755990/locations/us-central1/pipelineJobs/test-20210928201453')
INFO:google.cloud.aiplatform.pipeline_jobs:View Pipeline Job:
https://console.cloud.google.com/vertex-ai/locations/us-central1/pipelines/runs/test-20210928201453?project=449379755990
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/449379755990/locations/us-central1/pipelineJobs/test-20210928201453 current state:
PipelineState.PIPELINE_STATE_RUNNING
INFO:google.cloud.aiplatform.pipeline_jobs:PipelineJob projects/449379755990/locations/us-central1/pipelineJobs/test-20210928201453 current state:
PipelineState.

In [None]:
# Schedule the compiled pipeline on a cron-like cadence. This works by building an
# endpoint using Cloud Functions and then a Cloud Scheduler job.

from kfp.v2.google.client import AIPlatformClient

# Use the notebook's REGION constant instead of repeating the region literal.
api_client = AIPlatformClient(project_id=PROJECT_ID,
                              region=REGION
                              )

api_client.create_schedule_from_job_spec(
    job_spec_path='pipeline.json',
    schedule='0 * * * *',  # minute 0 of every hour
    enable_caching=False
)

In [None]:
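##################
# Junkyard: scratch cells kept for reference. They are not part of the pipeline
# above and are not expected to run top-to-bottom on a fresh kernel.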

In [None]:
from google.cloud import storage


    # The ID of your GCS bucket
    # bucket_name = "your-bucket-name"
    
    
    # The path to your file to upload
    # source_file_name = "local/path/to/file"
    # The ID of your GCS object
    # destination_blob_name = "storage-object-name"
    
    
#     storage_client = storage.Client()
#     bucket = storage_client.bucket(bucket_name)
#     blob = bucket.blob(destination_blob_name)

#     blob.upload_from_filename(source_file_name)
    
# Upload a local image file to a fixed object path in the assets bucket.
# (The original called `storage_client.bucket`, but the client is bound to `client`.)
client = storage.Client()
bucket = client.bucket('propensity_model_assets')
blob = bucket.blob('pipeline_root/449379755990/test-20210909214717/evaluate-model-logistic_-1619527851155914752')
blob.upload_from_filename('image.png')

In [None]:
# Render a matplotlib figure to an in-memory PNG and upload it to GCS without
# touching the local filesystem.
# NOTE(review): `graph` is not assigned at notebook top level in any visible cell
# (only inside the evaluate_* components), so this cell needs it defined first.
import io
from google.cloud import storage

client = storage.Client()
bucket = client.bucket('propensity_model_assets')
blob = bucket.blob('pipeline_root/449379755990/test-20210910193817/evaluate-model-xgboost_-6068239858067832832/xgboost_image_path/your-filename.png')

# temporarily save image to buffer
buf = io.BytesIO()
graph.savefig(buf, format='png')

# upload buffer contents to gcs
blob.upload_from_string(
    buf.getvalue(),
    content_type='image/png')

buf.close()

# gcs url to uploaded matplotlib image
url = blob.public_url

In [None]:
# Show the public URL of the blob uploaded in the previous cell.
url

In [None]:
# Console link to the uploaded object (kept as a comment so the cell is valid Python):
# https://pantheon.corp.google.com/storage/browser/_details/propensity_model_assets/pipeline_root/449379755990/test-20210910193817/evaluate-model-xgboost_-6068239858067832832/xgboost_image_path;tab=live_object?project=tai-demo-experimental-gke

In [None]:
# Display the figure object (requires `graph` to exist in the kernel).
graph

In [None]:
# Upload the local file image.png to a fixed object path in the assets bucket.
from google.cloud import storage


storage_client = storage.Client()
bucket = storage_client.bucket('propensity_model_assets')
blob = bucket.blob("pipeline_root/449379755990/test-20210909205951/evaluate-model-xgboost_7096063327712837632/test_image.png")
blob.upload_from_filename("image.png")



In [None]:
# Console link to the run folder in the bucket (kept as a comment so the cell is valid Python):
# https://pantheon.corp.google.com/storage/browser/propensity_model_assets/pipeline_root/449379755990/test-20210909205951/evaluate-model-xgboost_7096063327712837632;tab=objects?project=tai-demo-experimental-gke&prefix=&forceOnObjectsSortingFiltering=false

In [None]:
# create_input_view takes five arguments; the original call omitted bucket_name and blob_path.
# NOTE(review): this calls the @component-wrapped object, which may only build a
# pipeline task rather than run the Python body - confirm before relying on it.
create_input_view('test', 'propensity', 'tai-demo-experimental-gke', BUCKET_NAME, BLOB_PATH)

In [None]:
# Scratch experiment: pull the `test` view into pandas, one-hot encode it and train
# a small booster with the native xgboost API, then score it on a hold-out split.
# Relies on `pd` imported in the notebook's first cell.
from google.cloud import bigquery
import xgboost as xgb
# from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

client = bigquery.Client()

data_set = "tai-demo-experimental-gke.propensity.test"
build_df_for_xgboost = '''
SELECT * FROM `{data_set}`
'''.format(data_set = data_set)

job_config = bigquery.QueryJobConfig()
df = client.query(build_df_for_xgboost, job_config=job_config).to_dataframe()  # Make an API request.
# Drop the visitor id, then one-hot encode; one prefix is supplied per encoded column.
df = pd.get_dummies(df.drop(['fullVisitorId'], axis=1), prefix=['visited_dma', 'visited_daypart', 'visited_dow'])


X = df.drop(['label'], axis=1).values
y = df['label'].values

# Unseeded split: the resulting score varies between runs.
X_train, X_test, y_train, y_test  = train_test_split(X,y)
train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)

param = {
    'reg_lambda':1,
    'gamma':0,
    'max_depth':3,
    'objective': 'binary:hinge',
    'eta': .2
}
epochs = 3  # passed to xgb.train as the number of boosting rounds

model = xgb.train(param, train, epochs)
predictions = model.predict(test)
score = accuracy_score(y_test, predictions)

In [None]:
# Show the hold-out accuracy computed in the previous cell.
score

In [None]:
# from kfp.v2.google.client import AIPlatformClient

# api_client = AIPlatformClient(project_id=PROJECT_ID,
#                            region=REGION)

# api_client.create_schedule_from_job_spec(
#     job_spec_path=COMPILED_PIPELINE_PATH,
#     schedule=SCHEDULE,
#     time_zone=TIME_ZONE,
#     parameter_values=PIPELINE_PARAMETERS
# )

In [3]:
# Scratch setup: load the `test` view into pandas, one-hot encode it and create a
# train/test split for the hyper-parameter search in the next cell.
project_id = 'tai-demo-experimental-gke'
data_set_id = 'propensity'
model_name = 'bqml_xgboost_model' 
training_view = 'test'


from google.cloud import bigquery
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from joblib import dump
import pandas as pd
import pyarrow


client = bigquery.Client(project=project_id)

data_set = project_id+'.'+data_set_id+'.'+training_view
build_df_for_xgboost = '''
SELECT * FROM `{data_set}`
'''.format(data_set = data_set)

job_config = bigquery.QueryJobConfig()
df = client.query(build_df_for_xgboost, job_config=job_config).to_dataframe()  # Make an API request.
# Drop the visitor id, then one-hot encode; one prefix is supplied per encoded column.
df = pd.get_dummies(df.drop(['fullVisitorId'], axis=1), prefix=['visited_dma', 'visited_daypart', 'visited_dow'])


X = df.drop(['label'], axis=1).values
y = df['label'].values


# Unseeded split: downstream scores vary between runs.
X_train, X_test, y_train, y_test  = train_test_split(X,y)
train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)

In [21]:
# Scratch experiment: randomized hyper-parameter search over an XGBClassifier,
# scored on the hold-out split created in the previous cell.
params = {
    'reg_lambda':[0,1],
    'gamma': [1, 1.5, 2, 2.5, 3],
    'max_depth':[2,3,4,5,10,20],
    'learning_rate': [.1,.01]
}

# `silent` is what triggers the "might not be used" warning in the recorded output.
model = XGBClassifier(n_estimators=50, objective='binary:hinge',
                      silent=True, nthread=1,
                      eval_metric="auc")

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(model, param_distributions=params,
                                   n_iter=param_comb, scoring='precision',
                                   n_jobs=4, cv=skf.split(X_train,y_train), verbose=3,
                                   random_state=1001 )

random_search.fit(X_train, y_train)
model = random_search.best_estimator_

# The original first stored random_search.best_score_ in `auc` and then overwrote
# it two statements later; that dead store is removed.
predictions = model.predict(X_test)
score = accuracy_score(y_test, predictions)
auc = roc_auc_score(y_test, predictions)
precision_recall = precision_recall_curve(y_test, predictions)

Fitting 3 folds for each of 5 candidates, totalling 15 fits




Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [23]:
# Show the hold-out accuracy of the best estimator from the search above.
score

0.9771886559802713

In [6]:
# Show the precision/recall curve arrays. They were computed from model.predict()
# output, and the recorded result below contains a single threshold.
precision_recall

(array([0.02240033, 1.        ]), array([1., 0.]), array([0]))

In [None]:
# Show the estimator selected by the randomized search.
random_search.best_estimator_

In [None]:
# Scratch setup (repeat of the earlier load cell with consolidated imports): load
# the `test` view, one-hot encode it and create a train/test split.
project_id = 'tai-demo-experimental-gke'
data_set_id = 'propensity'
model_name = 'bqml_xgboost_model' 
training_view = 'test'

from google.cloud import bigquery
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_curve
from joblib import dump
import pandas as pd
import pyarrow

client = bigquery.Client(project=project_id)

data_set = project_id+'.'+data_set_id+'.'+training_view
build_df_for_xgboost = '''
SELECT * FROM `{data_set}`
'''.format(data_set = data_set)

job_config = bigquery.QueryJobConfig()
df = client.query(build_df_for_xgboost, job_config=job_config).to_dataframe()  # Make an API request.
# Drop the visitor id, then one-hot encode; one prefix is supplied per encoded column.
df = pd.get_dummies(df.drop(['fullVisitorId'], axis=1), prefix=['visited_dma', 'visited_daypart', 'visited_dow'])


X = df.drop(['label'], axis=1).values
y = df['label'].values

# Unseeded split: downstream scores vary between runs.
X_train, X_test, y_train, y_test  = train_test_split(X,y)
train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)


In [None]:

# Scratch experiment: randomized search with a binary:logistic objective over a
# smaller grid. The grid has 1*2*3*1 = 6 combinations and 5 of them are sampled.
params = {
    'reg_lambda':[1],
    'gamma': [1, 1.5],
    'max_depth':[2,3,4],
    'eta': [.2]
}

# NOTE(review): learning_rate is set here while the grid searches `eta` - confirm
# which of the two the estimator actually honours.
model = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1, use_label_encoder=False,
                         eval_metric="error")

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(model, param_distributions=params, 
                                   n_iter=param_comb, scoring='precision', 
                                   n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, 
                                   random_state=1001 )


random_search.fit(X_train, y_train)

In [None]:
# Score the best estimator from the search on the hold-out split.
# All three metrics are computed from model.predict() output.
model = random_search.best_estimator_
predictions = model.predict(X_test)
score = accuracy_score(y_test, predictions)
auc = roc_auc_score(y_test, predictions)
precision_recall = precision_recall_curve(y_test, predictions)

# Metric logging as done inside the pipeline component; disabled here because
# no `metrics` artifact exists in the notebook.
# metrics.log_metric("accuracy",(score * 100.0))
# metrics.log_metric("framework", "xgboost")
# metrics.log_metric("dataset_size", len(df))
# metrics.log_metric("AUC", auc)

In [None]:
# Persist the estimator with XGBoost's own save_model.
# NOTE(review): the file name says .joblib but this is not a joblib dump, and the
# next cell writes to the same path - confirm which format is wanted.
model.save_model("model.joblib")

In [None]:
# Persist the estimator with joblib (same file name as the previous cell).
dump(model, "model.joblib")

In [None]:
# Scratch version of the deploy component body, run at notebook level; the
# function signature is left commented out for reference.
# def deploy_model(
#     model: Input[Model],
#     project_id: str,
#     vertex_endpoint: Output[Artifact],
#     vertex_model: Output[Model]
# ):
    
    
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project only (no explicit location).
aiplatform.init(project=project_id)
#aiplatform.init(project=project, location=region)



In [None]:
# Upload and deploy the model from the notebook (copied from the deploy component).
# NOTE(review): at notebook level `model` is an estimator from the cells above, not
# a KFP artifact, so `model.uri` presumably fails here; `vertex_endpoint` and
# `vertex_model` are not defined in any visible cell either - confirm before running.
deployed_model = aiplatform.Model.upload(
display_name="tai-propensity-test-pipeline",
artifact_uri = model.uri.replace("model_path", ""),
serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/xgboost-cpu.1-4:latest"
)
endpoint = deployed_model.deploy(machine_type="n1-standard-4")

# Save data to the output params
vertex_endpoint.uri = endpoint.resource_name
vertex_model.uri = deployed_model.resource_name

In [3]:
# Scratch setup: load the `test` view into pandas and one-hot encode it
# (the split and training happen in the next cell).
project_id = 'tai-demo-experimental-gke'
data_set_id = 'propensity'
model_name = 'bqml_xgboost_model' 
training_view = 'test'


from google.cloud import bigquery
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, precision_recall_curve
from joblib import dump
import pandas as pd
import pyarrow

client = bigquery.Client(project=project_id)

data_set = project_id+'.'+data_set_id+'.'+training_view
build_df_for_xgboost = '''
SELECT * FROM `{data_set}`
'''.format(data_set = data_set)

job_config = bigquery.QueryJobConfig()
df = client.query(build_df_for_xgboost, job_config=job_config).to_dataframe()  # Make an API request.
# Drop the visitor id, then one-hot encode; one prefix is supplied per encoded column.
df = pd.get_dummies(df.drop(['fullVisitorId'], axis=1), prefix=['visited_dma', 'visited_daypart', 'visited_dow'])


In [5]:


# Scratch end-to-end run of the training-component logic: split, randomized
# search, hold-out metrics, then dump the best estimator to model_path.joblib.
X = df.drop(['label'], axis=1).values
y = df['label'].values

# Unseeded split: scores vary between runs.
X_train, X_test, y_train, y_test  = train_test_split(X,y)
train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)

params = {
    'reg_lambda':[1],
    'gamma': [1, 1.5],
    'max_depth':[2,3,4],
    'eta': [.2]
}

# `silent` triggers the "might not be used" warning shown in the recorded output.
xgb_model = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1, use_label_encoder=False,
                         eval_metric="error")

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb_model, param_distributions=params, 
                                   n_iter=param_comb, scoring='precision', 
                                   n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, 
                                   random_state=1001 )


random_search.fit(X_train, y_train)
xgb_model_best = random_search.best_estimator_
predictions = xgb_model_best.predict(X_test)
score = accuracy_score(y_test, predictions)
auc = roc_auc_score(y_test, predictions)
precision_recall = precision_recall_curve(y_test, predictions)

# Metric logging as done inside the pipeline component; disabled here because
# no `metrics` artifact exists in the notebook.
# metrics.log_metric("accuracy",(score * 100.0))
# metrics.log_metric("framework", "xgboost")
# metrics.log_metric("dataset_size", len(df))
# metrics.log_metric("AUC", auc)

# Local stand-in for `model.path + ".joblib"` used by the pipeline component.
dump(xgb_model_best, "model_path" + ".joblib")
#model.save_model(model_path)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




['model_path.joblib']