In [1]:
# Authenticate with Docker Registry
!gcloud auth configure-docker --quiet

# Create a /tmp directory (needed in next steps)
!mkdir /tmp

`docker` and `docker-credential-gcloud` need to be in the same PATH in order to work correctly together.
gcloud's Docker credential helper can be configured but it will not work until this is corrected.
Docker configuration file updated.


In [2]:
!pip install sklearn pandas xgboost fairing

Collecting sklearn
  Downloading https://files.pythonhosted.org/packages/1e/7a/dbb3be0ce9bd5c8b7e3d87328e79063f8b263b2b1bfa4774cb1147bfcd3f/sklearn-0.0.tar.gz
Collecting kubernetes>=9.0.0 (from fairing)
[?25l  Downloading https://files.pythonhosted.org/packages/00/f7/4f196c55f1c2713d3edc8252c4b45326306eef4dc10048f13916fe446e2b/kubernetes-9.0.0-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 16.8MB/s ta 0:00:01
Collecting google-cloud-storage>=1.13.2 (from fairing)
[?25l  Downloading https://files.pythonhosted.org/packages/e2/4e/aee59b19321eb1063317c2e6fa4c2f3cfe21740586de78578eedbd2bed3d/google_cloud_storage-1.16.1-py2.py3-none-any.whl (65kB)
[K    100% |████████████████████████████████| 71kB 27.7MB/s ta 0:00:01
Collecting google-auth>=1.6.2 (from fairing)
[?25l  Downloading https://files.pythonhosted.org/packages/c5/9b/ed0516cc1f7609fb0217e3057ff4f0f9f3e3ce79a369c6af4a6c5ca25664/google_auth-1.6.3-py2.py3-none-any.whl (73kB)
[K    100% |██████████

[?25l  Downloading https://files.pythonhosted.org/packages/10/d6/8b1e8d79a8a56649af3a094e3d90dd213278da942f36d831b57c0ca4a503/google_api_core-1.11.1-py2.py3-none-any.whl (66kB)
[K    100% |████████████████████████████████| 71kB 18.8MB/s ta 0:00:01
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/76/03/bb/589d421d27431bcd2c6da284d5f2286c8e3b2ea3cf1594c074
Successfully built sklearn
[31mkfp 0.1 has requirement google-auth==1.6.1, but you'll have google-auth 1.6.3 which is incompatible.[0m
[31mkfp 0.1 has requirement google-cloud-storage==1.13.0, but you'll have google-cloud-storage 1.16.1 which is incompatible.[0m
[31mkfp 0.1 has requirement kubernetes==8.0.0, but you'll have kubernetes 9.0.0 which is incompatible.[0m
[31mgoogle-cloud-bigquery 1.6.1 has requirement google-cloud-core<0.30dev,>=0.28.0, but you'll have google-cloud-core 1.0.2 which is incompatible.[0m


In [3]:
import argparse
import logging
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from xgboost import XGBRegressor
import urllib.request


# Where the training CSV is fetched from, and the local file it is saved as.
TRAINING_URL = "https://raw.githubusercontent.com/kubeflow/examples/master/xgboost_ames_housing/ames_dataset/train.csv"
TRAINING_FILE = "train.csv"

# Training knobs used by the functions below.
ESTIMATORS = 1000           # n_estimators given to XGBRegressor
LEARNING_RATE = 0.1         # learning_rate given to XGBRegressor
TEST_FRACTION_SIZE = 0.25   # default test_size for the train/test split
EARLY_STOPPING_ROUNDS = 50  # early_stopping_rounds given to model.fit

def run_training_and_eval():
    """Run the whole pipeline: load the data, fit the model, evaluate it.

    The held-out split returned by ``read_input`` is used both as the
    evaluation set during training and as the data the fitted model is
    scored on afterwards.
    """
    training_split, holdout_split = read_input()
    features_train, target_train = training_split
    features_holdout, target_holdout = holdout_split

    fitted_model = train_model(
        features_train,
        target_train,
        features_holdout,
        target_holdout,
        ESTIMATORS,
        LEARNING_RATE,
    )

    eval_model(fitted_model, features_holdout, target_holdout)

def download(url, file_name):
    """Fetch ``url`` and store the response body in ``file_name``.

    The body is read into memory in one go and written in binary mode;
    an existing file at ``file_name`` is overwritten.

    Args:
        url: Location handed to ``urllib.request.urlopen``.
        file_name: Path of the local file to write.
    """
    with urllib.request.urlopen(url) as remote:
        with open(file_name, "wb") as sink:
            payload = remote.read()
            sink.write(payload)

def read_input(test_size=TEST_FRACTION_SIZE):
    """Download the training CSV and return imputed train/test arrays.

    Rows without a ``SalePrice`` are discarded; ``SalePrice`` becomes the
    target and only non-object columns are kept as features. The data is
    split without shuffling, and missing feature values are filled by an
    imputer that is fitted on the training part only.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        ``((train_X, train_y), (test_X, test_y))``.
    """
    download(TRAINING_URL, TRAINING_FILE)

    frame = pd.read_csv(TRAINING_FILE)
    frame = frame.dropna(axis=0, subset=['SalePrice'])

    target = frame.SalePrice
    without_target = frame.drop(['SalePrice'], axis=1)
    features = without_target.select_dtypes(exclude=['object'])

    splits = train_test_split(
        features.values,
        target.values,
        test_size=test_size,
        shuffle=False,
    )
    train_X, test_X, train_y, test_y = splits

    # NOTE(review): Imputer was removed in scikit-learn 0.22; moving to
    # sklearn.impute.SimpleImputer needs the file-level import changed too.
    filler = Imputer()
    # Fit on the training rows only, then reuse those statistics for the test rows.
    train_X = filler.fit_transform(train_X)
    test_X = filler.transform(test_X)

    return (train_X, train_y), (test_X, test_y)

def train_model(
        train_X,
        train_y,
        test_X,
        test_y,
        n_estimators,
        learning_rate):
    """Fit an XGBRegressor with early stopping and return it.

    Args:
        train_X: Training features.
        train_y: Training targets.
        test_X: Features of the evaluation set watched for early stopping.
        test_y: Targets of the evaluation set watched for early stopping.
        n_estimators: Forwarded to ``XGBRegressor`` as ``n_estimators``.
        learning_rate: Forwarded to ``XGBRegressor`` as ``learning_rate``.

    Returns:
        The fitted ``XGBRegressor``.
    """
    regressor = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate)

    # The (test_X, test_y) pair is the only evaluation set passed to fit.
    watched_sets = [(test_X, test_y)]
    regressor.fit(
        train_X,
        train_y,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        eval_set=watched_sets,
    )

    best_rmse = regressor.best_score
    # Logged as best_iteration + 1, i.e. as a count of rounds.
    rounds_used = regressor.best_iteration + 1
    logging.info("Best RMSE on eval: %.2f with %d rounds", best_rmse, rounds_used)
    return regressor

def eval_model(model, test_X, test_y):
    """Log the mean absolute error of ``model`` on held-out data.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        test_X: Features to predict on.
        test_y: Ground-truth targets matching ``test_X``.
    """
    predictions = model.predict(test_X)
    # scikit-learn's signature is mean_absolute_error(y_true, y_pred), so the
    # ground truth goes first.
    mae = mean_absolute_error(test_y, predictions)
    logging.info("mean_absolute_error=%.2f", mae)

In [4]:
import fairing

# Keep the fairing-wrapped callable under its own name. Rebinding
# run_training_and_eval to its wrapper would make this cell wrap the wrapper
# again on every re-run instead of wrapping the original function.
remote_training_and_eval = fairing.config.fn(run_training_and_eval)
remote_training_and_eval()

Building image...
/opt/conda/lib/python3.6/site-packages/fairing/__init__.py already exists in Fairing context, skipping...
Loading Docker credentials for repository 'gcr.io/kubeflow-images-public/fairing:dev'
Invoking 'docker-credential-gcloud' to obtain Docker credentials.
Successfully obtained Docker credentials.
Image successfully built in 0.6095852610014845s.
Pushing image gcr.io/pipelineai2/fairing-job:7ABA4B88...
Loading Docker credentials for repository 'gcr.io/pipelineai2/fairing-job:7ABA4B88'
Invoking 'docker-credential-gcloud' to obtain Docker credentials.
Successfully obtained Docker credentials.
Uploading gcr.io/pipelineai2/fairing-job:7ABA4B88
Layer sha256:830c3b063b0f1cffb88718599fbfdadb89d1c857fcb168920315be8a438d1976 exists, skipping
Layer sha256:c7849095338ab97d368a10ead4c5e4e63a64b87244988af729a8e15840d1fd17 exists, skipping
Layer sha256:fa3f2f277e67c5cbbf1dac21dc27111a60d3cd2ef494d94aa1515d3319f2a245 exists, skipping
Layer sha256:398d32b153e84fe343f0c5b07d65e89b0555

[0]	validation_0-rmse:177514
Will train until validation_0-rmse hasn't improved in 50 rounds.
[1]	validation_0-rmse:161858
[2]	validation_0-rmse:147237
[3]	validation_0-rmse:134132
[4]	validation_0-rmse:122224
[5]	validation_0-rmse:111538
[6]	validation_0-rmse:102142
[7]	validation_0-rmse:93392.2
[8]	validation_0-rmse:85824.6
[9]	validation_0-rmse:79667.6
[10]	validation_0-rmse:73463.4
[11]	validation_0-rmse:68059.4
[12]	validation_0-rmse:63350.5
[13]	validation_0-rmse:59732.1
[14]	validation_0-rmse:56260.7
[15]	validation_0-rmse:53392.6
[16]	validation_0-rmse:50770.8
[17]	validation_0-rmse:48107.8
[18]	validation_0-rmse:45923.9
[19]	validation_0-rmse:44154.2
[20]	validation_0-rmse:42488.1
[21]	validation_0-rmse:41263.3
[22]	validation_0-rmse:40212.8
[23]	validation_0-rmse:39089.1
[24]	validation_0-rmse:37691.1
[25]	validation_0-rmse:36875.2
[26]	validation_0-rmse:36276.2
[27]	validation_0-rmse:35444.1
[28]	validation_0-rmse:34831.5
[29]	validation_0-rmse:34205.4
[30]	validation_0-rmse

Cleaning up job fairing-job-9zmbk...
