In [104]:
import pandas as pd
import numpy as np
import yaml
from sklearn.model_selection import train_test_split, cross_val_score

%load_ext autoreload
%autoreload 2

SEED = 123

# Data transformation
# This data has an interesting format
# Every second row contains the last 2 features and the housing
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
cleaned_df = raw_df.dropna()  # Drop missing data
data = np.hstack([cleaned_df.values[::2, :], cleaned_df.values[1::2, :2]])
target = cleaned_df.values[1::2, 2]

In [52]:
# Data Preprocessing

def preprocess_data(data, target, test_size=0.2, random_state=SEED):


    # Check Missing data

    data = np.hstack([cleaned_df.values[::2, :], cleaned_df.values[1::2, :2]])
    target = cleaned_df.values[1::2, 2]

    # Feature Scalling/Normatiization 
    # XGBoost is not sensitive to the scale of features
    # Log Transformation of the target varaible Price as price tends to be right-skewed
    target = np.log1p(target)

    x_train, x_test, y_train, y_test = train_test_split(data, target, 
                                                        test_size=test_size, 
                                                        random_state=random_state)
    
    return x_train, x_test, y_train, y_test

x_train, x_test, y_train, y_test = preprocess_data(data, target, test_size)

assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)



In [28]:
import pandas as pd
data_url = "http://lib.stat.cmu.edu/datasets/boston"

raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
raw_df.to_csv('boston_housing.csv', index=False)

raw_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.00,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3
1,396.90000,4.98,24.00,,,,,,,,
2,0.02731,0.00,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.90000,9.14,21.60,,,,,,,,
4,0.02729,0.00,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
...,...,...,...,...,...,...,...,...,...,...,...
1007,396.90000,5.64,23.90,,,,,,,,
1008,0.10959,0.00,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0
1009,393.45000,6.48,22.00,,,,,,,,
1010,0.04741,0.00,11.93,0.0,0.573,6.030,80.8,2.5050,1.0,273.0,21.0


In [29]:
raw_df = pd.read_csv('boston_housing.csv', sep="\s+", header=None)

In [30]:
raw_df

Unnamed: 0,0
0,012345678910
1,"0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1...."
2,"396.9,4.98,24.0,,,,,,,,"
3,"0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2..."
4,"396.9,9.14,21.6,,,,,,,,"
...,...
1008,"396.9,5.64,23.9,,,,,,,,"
1009,"0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,..."
1010,"393.45,6.48,22.0,,,,,,,,"
1011,"0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1...."


In [24]:
%load_ext autoreload
%autoreload 2

from XGBoostTunner import XGBoostTunner
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'vertex-ai-key.json'
tunner = XGBoostTunner(project_id="ml-coding-439503", region="australia-southeast2", staging_bucket="ml-coding-au")
tunner.prepare_training_data()
tunner.train_and_tune()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Creating HyperparameterTuningJob


InvalidArgument: 400 You do not have permission to act as service_account: vertex-ai-sa2@ml-coding-439503.iam.gserviceaccount.com. (or it may not exist).

In [86]:
!/home/kelvin/ML_task/.venv/bin/pip install xgboost

/home/kelvin/.bashrc: line 52: unexpected argument `]]' to conditional unary operator
/home/kelvin/.bashrc: line 52: syntax error near `]]'
/home/kelvin/.bashrc: line 52: `[[ -s  ]] && . '
Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.1-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.1


In [78]:
from google.cloud import aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt