In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.cloud import bigquery

In [2]:
query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""
df = bigquery.Client().query(query).to_dataframe()
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,7.561856,False,30,1,40
1,7.749249,False,34,1,40
2,8.313632,True,27,1,38
3,7.438397,True,27,1,37
4,9.124933,True,20,1,41


In [3]:
df.describe()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
count,9990.0,10000,10000.0,10000.0,9951.0
unique,,2,,,
top,,True,,,
freq,,5136,,,
mean,7.260462,,27.352,1.0344,38.657321
std,1.324675,,6.140233,0.191886,2.579655
min,0.500449,,13.0,1.0,17.0
25%,6.624891,,22.0,1.0,38.0
50%,7.364542,,27.0,1.0,39.0
75%,8.062305,,32.0,1.0,40.0


In [4]:
df['is_male'].value_counts()

True     5136
False    4864
Name: is_male, dtype: Int64

In [5]:
df.isna().sum()

weight_pounds      10
is_male             0
mother_age          0
plurality           0
gestation_weeks    49
dtype: int64

In [6]:
df = df.dropna()
df = shuffle(df, random_state=2)

In [7]:
labels = df['weight_pounds']
data = df.drop(columns=['weight_pounds'])

In [8]:
data['is_male'] = data['is_male'].astype(int)

In [9]:
x,y = data,labels
x_train,x_test,y_train,y_test = train_test_split(x,y)

In [10]:
model = xgb.XGBRegressor(
    objective='reg:squarederror'
)

In [14]:
y_train.dtypes

dtype('float64')

In [17]:
model.fit( x_train.values, y_train.values )

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
y_pred = model.predict(x_test.values)

In [19]:
for i in range(20):
    print('Predicted weight: ', y_pred[i])
    print('Actual weight: ', y_test.iloc[i])
    print()

Predicted weight:  7.1952724
Actual weight:  7.25100379718

Predicted weight:  7.5855474
Actual weight:  8.375361333379999

Predicted weight:  7.4657526
Actual weight:  8.375361333379999

Predicted weight:  6.9071813
Actual weight:  7.5618555866

Predicted weight:  7.649332
Actual weight:  8.12623897732

Predicted weight:  7.4163094
Actual weight:  5.98995965854

Predicted weight:  7.1451097
Actual weight:  5.93704871566

Predicted weight:  8.054473
Actual weight:  6.4992274837599995

Predicted weight:  7.709208
Actual weight:  7.10770332688

Predicted weight:  7.8527303
Actual weight:  6.62709559572

Predicted weight:  8.13757
Actual weight:  7.8374334140999995

Predicted weight:  7.3027678
Actual weight:  6.4374980503999994

Predicted weight:  7.401359
Actual weight:  7.8837304891199995

Predicted weight:  6.4014626
Actual weight:  5.2029093832

Predicted weight:  7.35351
Actual weight:  6.8122838958

Predicted weight:  7.571242
Actual weight:  7.12534030784

Predicted weight:  7.404

In [20]:
model.save_model('model.bst')

In [21]:
!gcloud config list project --format "value(core.project)"

turnkey-banner-371806


In [22]:
# Update these to your own GCP project, model, and version names
GCP_PROJECT = 'turnkey-banner-371806'
MODEL_BUCKET = 'gs://'+GCP_PROJECT
VERSION_NAME = 'v1'
MODEL_NAME = 'baby_weight'

In [23]:
!gsutil mb $MODEL_BUCKET

Creating gs://turnkey-banner-371806/...


In [24]:
!gsutil cp ./model.bst $MODEL_BUCKET

Copying file://./model.bst [Content-Type=application/octet-stream]...
/ [1 files][314.1 KiB/314.1 KiB]                                                
Operation completed over 1 objects/314.1 KiB.                                    


In [27]:
!gcloud ai-platform models create $MODEL_NAME --region=us-central1

Using endpoint [https://us-central1-ml.googleapis.com/]
Created ai platform model [projects/turnkey-banner-371806/models/baby_weight].


In [28]:
!gcloud ai-platform versions create $VERSION_NAME \
--model=$MODEL_NAME \
--framework='XGBOOST' \
--runtime-version=2.5 \
--origin=$MODEL_BUCKET \
--python-version=3.7 \
--project=$GCP_PROJECT \
--region=us-central1

Using endpoint [https://us-central1-ml.googleapis.com/]
Creating version (this might take a few minutes)......done.                    


In [29]:
%%writefile predictions.json
[0.0, 33.0, 1.0, 27.0]
[1.0, 26.0, 1.0, 40.0]


Writing predictions.json


In [30]:
prediction = !gcloud ai-platform predict --model=$MODEL_NAME --json-instances=predictions.json --version=$VERSION_NAME
print(prediction.s)

Using endpoint [https://us-central1-ml.googleapis.com/] [1.6867361068725586, 7.945723056793213]
