# Prerequisites

In [185]:
import sys
!{sys.executable} -m pip install mlrun



# Create an MLRun project and configuration

In [186]:
from os import path, getenv
from mlrun import new_project

# Project name: the fixed 'nyc-taxi' prefix plus the V3IO user name when that
# environment variable is set; filter(None, ...) drops the missing part so no
# dangling '-' is appended.
project_name = '-'.join(filter(None, ['nyc-taxi', getenv('V3IO_USERNAME', None)]))
# Absolute path of the 'conf' directory, resolved against the current working directory
project_path = path.abspath('conf')
project = new_project(project_name, project_path, init_git=True)

print(f'Project path: {project_path}\nProject name: {project_name}')

Project path: /User/code-migration-mlrun/conf
Project name: nyc-taxi-edmond


In [187]:
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io

# Target location for storing pipeline artifacts: the 'jobs' directory,
# resolved against the current working directory
artifact_path = path.abspath('jobs')
# MLRun DB path or API service URL: keep an already configured value and
# fall back to the default service address only when none is set
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

print(f'Artifacts path: {artifact_path}\nMLRun DB path: {mlconf.dbpath}')

Artifacts path: /User/code-migration-mlrun/jobs
MLRun DB path: http://mlrun-api:8080


# Define Nuclio and MLRun functions

In [188]:
import nuclio

In [189]:
# nuclio: start-code

In [190]:
%%nuclio cmd 
pip install lightgbm



In [191]:
%nuclio config spec.build.baseImage = "mlrun/mlrun"
%nuclio config spec.image = "mlrun/ml-models"
%nuclio config kind = "job"

%nuclio: setting spec.build.baseImage to 'mlrun/mlrun'
%nuclio: setting spec.image to 'mlrun/ml-models'
%nuclio: setting kind to 'job'


In [192]:
import numpy as np 
import pandas as pd
import scipy as scipy
import datetime as dt
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
import os
import gc
from os import path, getenv
from mlrun.run import get_dataitem
from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem
from mlrun.artifacts import get_model, update_model
from pickle import dumps

In [193]:
def clean_df(df):
    """
    Return the rows of df that have a plausible fare and non-zero coordinates.

    A row is kept when 0 < fare_amount <= 500 and none of the pickup/dropoff
    longitude/latitude values equals 0. passenger_count is not filtered.
    """
    fare_is_valid = df['fare_amount'].gt(0) & df['fare_amount'].le(500)

    coordinate_columns = ['pickup_longitude', 'pickup_latitude',
                          'dropoff_longitude', 'dropoff_latitude']
    coordinates_are_set = df[coordinate_columns].ne(0).all(axis=1)

    return df[fare_is_valid & coordinates_are_set]

In [194]:
# Haversine distance helper
def sphere_dist(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the haversine distance, in km, between pickup and dropoff coordinates.

    All four coordinates are given in degrees and are converted to radians
    before the distance is computed.
    """
    earth_radius_km = 6371

    lat_from, lon_from, lat_to, lon_to = (
        np.radians(angle)
        for angle in (pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
    )

    # Differences along the latitude and longitude dimensions
    lat_delta = lat_to - lat_from
    lon_delta = lon_to - lon_from

    lat_term = np.sin(lat_delta/2.0)**2
    lon_term = np.cos(lat_from) * np.cos(lat_to) * np.sin(lon_delta/2.0)**2
    haversine = lat_term + lon_term
    return 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))

In [195]:
def sphere_dist_bear(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the bearing angle, in radians, between pickup and dropoff coordinates.

    All four coordinates are given in degrees. The result comes from
    np.arctan2 and therefore lies in [-pi, pi].

    The longitude difference is taken as pickup minus dropoff (kept from the
    original implementation), so the sign of the angle is mirrored compared
    with a formula that uses dropoff minus pickup.
    """
    #Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon,
                                                              dropoff_lat, dropoff_lon])
    #Longitude difference (pickup - dropoff)
    dlon = pickup_lon - dropoff_lon

    #Compute bearing: the cosine of the dropoff latitude scales sin(dlon);
    #it must not be applied to dlon inside the sine. For short trips the two
    #forms are nearly equal, so previously computed values shift only slightly.
    y = np.sin(dlon) * np.cos(dropoff_lat)
    x = (np.cos(pickup_lat) * np.sin(dropoff_lat)
         - np.sin(pickup_lat) * np.cos(dropoff_lat) * np.cos(dlon))
    return np.arctan2(y, x)

In [196]:
def radian_conv(degree):
    """
    Convert an angle (or array of angles) from degrees to radians.
    """
    in_radians = np.radians(degree)
    return in_radians

In [197]:
def add_airport_dist(dataset):
    """
    Add, per reference landmark, the combined pickup and dropoff distance.

    For each landmark a '<name>_dist' column is written to dataset. Its value is
    the distance from the pickup point to the landmark plus the distance from
    the landmark to the dropoff point, both computed with sphere_dist.
    JFK: John F. Kennedy International Airport
    EWR: Newark Liberty International Airport
    LGA: LaGuardia Airport
    SOL: Statue of Liberty
    NYC: New York Central

    The frame is modified in place and is also returned.
    """
    # Target column -> (latitude, longitude) of the landmark, in degrees
    landmarks = {
        'jfk_dist': (40.639722, -73.778889),
        'ewr_dist': (40.6925, -74.168611),
        'lga_dist': (40.77725, -73.872611),
        'sol_dist': (40.6892,-74.0445),
        'nyc_dist': (40.7141667,-74.0063889),
    }

    pickup_lat, pickup_lon = dataset['pickup_latitude'], dataset['pickup_longitude']
    dropoff_lat, dropoff_lon = dataset['dropoff_latitude'], dataset['dropoff_longitude']

    # Compute every distance first, then attach the columns in declaration order
    combined = {}
    for column, (landmark_lat, landmark_lon) in landmarks.items():
        to_landmark = sphere_dist(pickup_lat, pickup_lon, landmark_lat, landmark_lon)
        from_landmark = sphere_dist(landmark_lat, landmark_lon, dropoff_lat, dropoff_lon)
        combined[column] = to_landmark + from_landmark

    for column, values in combined.items():
        dataset[column] = values

    return dataset

In [198]:
def add_datetime_info(dataset):
    """
    Parse 'pickup_datetime' and add hour/day/month/weekday/year columns.

    The 'pickup_datetime' column is expected in the "%Y-%m-%d %H:%M:%S UTC"
    text format and is replaced by its parsed datetime values. The frame is
    modified in place and is also returned.
    """
    pickup_time = pd.to_datetime(dataset['pickup_datetime'],format="%Y-%m-%d %H:%M:%S UTC")
    dataset['pickup_datetime'] = pickup_time

    # One new column per datetime component, added in this order
    for component in ('hour', 'day', 'month', 'weekday', 'year'):
        dataset[component] = getattr(pickup_time.dt, component)

    return dataset

In [199]:
def fetch_data(context : MLClientCtx, csv_path):
    """
    Read the input csv into a dataframe and log it as 'nyc-taxi-dataset'.

    :param context:  execution context, used for its logger, its artifact path
                     and for logging the dataset
    :param csv_path: input object exposing as_df() (presumably an MLRun
                     DataItem built from the run inputs; verify against caller)
    """
    log = context.logger
    log.info('Reading data from {}'.format(csv_path))

    taxi_df = csv_path.as_df()

    # The dataset goes under a 'data' sub-directory of the run's artifact path
    data_dir = path.join(context.artifact_path, 'data')
    log.info('Saving nyc taxi dataset to {} ...'.format(data_dir))

    # Store the data set as a csv artifact, without the dataframe index
    context.log_dataset(
        'nyc-taxi-dataset',
        df=taxi_df,
        format='csv',
        index=False,
        artifact_path=data_dir,
    )

In [200]:
def transform_dataset(context : MLClientCtx, input_ds: str):
    """
    Clean and enrich the taxi dataset, then log it as 'nyc-taxi-dataset-transformed'.

    :param context:  execution context, used for its logger, its artifact path
                     and for logging the resulting dataset
    :param input_ds: location of the input dataset, resolved with get_dataitem
    """
    log = context.logger
    log.info('Begin dataset transform')

    raw_df = get_dataitem(input_ds).as_df()

    # Drop rows with null values, then rows rejected by clean_df
    rides = clean_df(raw_df.dropna(how = 'any', axis = 'rows'))

    # Enrich with datetime components and landmark distances
    rides = add_airport_dist(add_datetime_info(rides))

    # Distance and bearing are computed while the coordinates are still in degrees
    coordinate_columns = ('pickup_latitude', 'pickup_longitude',
                          'dropoff_latitude', 'dropoff_longitude')
    rides['distance'] = sphere_dist(*(rides[column] for column in coordinate_columns))
    rides['bearing'] = sphere_dist_bear(*(rides[column] for column in coordinate_columns))

    # Only afterwards are the coordinate columns themselves converted to radians
    for column in coordinate_columns:
        rides[column] = radian_conv(rides[column])

    rides = rides.drop(columns=['key', 'pickup_datetime'])

    # Save dataset to artifact
    data_dir = path.join(context.artifact_path, 'data')
    context.log_dataset('nyc-taxi-dataset-transformed', df=rides, artifact_path=data_dir, format='csv')

    log.info('End dataset transform')

In [201]:
import pandas as pd
import numpy as np

def prepare_test_df(csv_path):
    """
    Read a csv of taxi rides and build the feature frame used for prediction.

    The same enrichment steps as in transform_dataset are applied: datetime
    components, landmark distances, trip distance and bearing, and conversion
    of the coordinate columns to radians. No row filtering is done here.

    :param csv_path: path or url of the csv file, passed to pd.read_csv
    :return: the enriched dataframe without the 'key' and 'pickup_datetime' columns
    """
    test_df =  pd.read_csv(csv_path)

    test_df = add_datetime_info(test_df)
    test_df = add_airport_dist(test_df)

    # Distance and bearing are computed while the coordinates are still in degrees
    test_df['distance'] = sphere_dist(test_df['pickup_latitude'], test_df['pickup_longitude'],
                                      test_df['dropoff_latitude'], test_df['dropoff_longitude'])
    test_df['bearing'] = sphere_dist_bear(test_df['pickup_latitude'], test_df['pickup_longitude'],
                                          test_df['dropoff_latitude'] , test_df['dropoff_longitude'])

    # Only afterwards are the coordinate columns themselves converted to radians
    for column in ('pickup_latitude', 'pickup_longitude',
                   'dropoff_latitude', 'dropoff_longitude'):
        test_df[column] = radian_conv(test_df[column])

    # The former 'test_key' local was never used and has been removed
    return test_df.drop(columns=['key', 'pickup_datetime'])

In [202]:
# LightGBM training parameters; train_model passes this dict to lgbm.train().
# NOTE(review): 'subsample' and 'bagging_fraction' look like two names for the
# same LightGBM setting but carry different values (0.8 vs 1) - confirm which
# one is actually applied.
# NOTE(review): 'num_rounds' (50000) may conflict with the num_boost_round=10000
# argument given in train_model - confirm which one takes effect.
params = {
        'boosting_type':'gbdt',
        'objective': 'regression',
        'nthread': 4,
        'num_leaves': 31,
        'learning_rate': 0.05,
        'max_depth': -1,
        'subsample': 0.8,
        'bagging_fraction' : 1,
        'max_bin' : 5000 ,
        'bagging_freq': 20,
        'colsample_bytree': 0.6,
        'metric': 'rmse',
        'min_split_gain': 0.5,
        'min_child_weight': 1,
        'min_child_samples': 10,
        'scale_pos_weight':1,
        'zero_as_missing': True,
        'seed':0,
        'num_rounds':50000
    }

In [203]:
def train_model(context: MLClientCtx, input_ds: str, model_path: str):
    """
    Train a LightGBM fare model on the transformed dataset and log it.

    :param context:    execution context, used for its logger and for logging the model
    :param input_ds:   location of the transformed dataset, resolved with get_dataitem
    :param model_path: directory prefix; the booster is saved to model_path + '/FareModel.pkl'
    """
    log = context.logger
    log.info('Begin training')
    log.info('LGBM version is ' + str(lgbm.__version__))

    dataset = get_dataitem(input_ds).as_df()

    # Separate the target from the features
    target = dataset['fare_amount']
    features = dataset.drop(columns=['fare_amount'])
    # Drop the first remaining column (presumably the index column written
    # with the transformed csv; verify against transform_dataset)
    features = features.drop(features.columns[[0]], axis=1)

    # Hold out 10% of the rows for validation
    x_train, x_valid, y_train, y_valid = train_test_split(
        features, target, random_state=123, test_size=0.10)

    categorical = ['year','month','day','weekday']
    train_set = lgbm.Dataset(x_train, y_train, silent=False,
                             categorical_feature=list(categorical))
    valid_set = lgbm.Dataset(x_valid, y_valid, silent=False,
                             categorical_feature=list(categorical))

    model = lgbm.train(
        params,
        train_set = train_set,
        num_boost_round=10000,
        early_stopping_rounds=500,
        verbose_eval=500,
        valid_sets=valid_set,
    )

    # Write the booster with LightGBM's own save_model next to the notebook code
    model.save_model(model_path + '/FareModel.pkl')

    # Also log a pickled copy of the booster as the 'FareModel' artifact
    pickled_model = dumps(model)
    models_dir = context.artifact_subpath("models")
    context.log_model('FareModel',
                      body=pickled_model,
                      artifact_path=models_dir,
                      model_file="FareModel.pkl")

    log.info('End training')

In [204]:
import time

def test_func(context):
    """Log the LightGBM version seen by the runtime that executes this handler."""
    version_message = 'LGBM version is ' + str(lgbm.__version__)
    context.logger.info(version_message)

##### Make sure to use the absolute path of the pkl file. It must be under /User

In [205]:
# Absolute path of the saved model file; LGBMModel.load() reads the booster from it
model_file = '/User/code-migration-mlrun/FareModel.pkl'

In [206]:
import mlrun
import numpy as np
import lightgbm as lgbm

class LGBMModel(mlrun.runtimes.MLModelServer):
    """Model server that answers prediction requests with a LightGBM booster."""

    def load(self):
        """Load the booster from the file named by the module-level model_file."""
        self.model = lgbm.Booster(model_file=model_file)

    def predict(self, body):
        """
        Return the model predictions for the rows in body['instances'].

        :param body: request payload; its 'instances' entry holds the feature rows
        :return: the predictions as a plain list
        :raises Exception: when reading the payload or predicting fails; the
                           original error is attached as the cause
        """
        try:
            feats = np.asarray(body['instances'])
            result = self.model.predict(feats)
            return result.tolist()
        except Exception as e:
            # Chain explicitly so the root cause is reported as the direct cause
            raise Exception("Failed to predict %s" % e) from e

In [207]:
# nuclio: end-code

## Run fetch_data locally

In [208]:
# Run the fetch_data handler in the notebook process; the csv url is passed as
# the 'csv_path' input and artifacts are written under artifact_path
fetch_data_run = run_local(name='fetch_data',
                         handler=fetch_data,
                         inputs={'csv_path': 'https://s3.wasabisys.com/iguazio/data/Taxi/ny_taxi_train_subset.csv'},
                         project=project_name, artifact_path=artifact_path)

> 2020-10-13 08:56:02,428 [info] starting run fetch_data uid=269c6eacddf34436866c3024954babd1  -> http://mlrun-api:8080
> 2020-10-13 08:56:02,710 [info] Reading data from https://s3.wasabisys.com/iguazio/data/Taxi/ny_taxi_train_subset.csv
> 2020-10-13 08:56:06,331 [info] Saving nyc taxi dataset to /User/code-migration-mlrun/jobs/data ...


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...954babd1,0,Oct 13 08:56:02,completed,fetch_data,v3io_user=edmondkind=handlerowner=edmondhost=jupyter-edmond-86984dbb4c-nsnll,csv_path,,,nyc-taxi-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 269c6eacddf34436866c3024954babd1 --project nyc-taxi-edmond , !mlrun logs 269c6eacddf34436866c3024954babd1 --project nyc-taxi-edmond
> 2020-10-13 08:56:25,884 [info] run executed, status=completed


In [209]:
fetch_data_run.outputs

{'nyc-taxi-dataset': 'store://nyc-taxi-edmond/fetch_data_nyc-taxi-dataset#269c6eacddf34436866c3024954babd1'}

## Run fetch_data on a cluster

#### Prepare cluster function

In [210]:
from mlrun import code_to_function

# Convert the notebook code (presumably the cells between the 'nuclio: start-code'
# and 'nuclio: end-code' markers) into an MLRun function named 'cluster-function'
gen_func = code_to_function(name='cluster-function')
# Register the function in the project, then mount the user's v3io directory at /User
project.set_function(gen_func)
# NOTE(review): getenv returns None when V3IO_USERNAME is unset, which makes the
# "users/" + ... concatenation fail - confirm the variable is always defined
cluster_func = project.func('cluster-function').apply(mount_v3io(remote="users/" + getenv('V3IO_USERNAME', None), mount_path="/User"))

In [211]:
# ===========================================

In [212]:
# Run fetch_data through cluster_func; the handler is given by name here.
# This rebinds fetch_data_run, so later cells use the outputs of this run.
# NOTE(review): the csv url differs from the one used in the local run - confirm
# both point to the same data
fetch_data_run = cluster_func.run(name='fetch_data',
                                 handler='fetch_data',
                                 inputs={'csv_path': 'http://iguazio-sample-data.s3.amazonaws.com/ny_taxi_train_subset.csv'},
                                 artifact_path=artifact_path)

> 2020-10-13 08:57:57,183 [info] starting run fetch_data uid=2477d17301e841e9a3b3cc69a71f0405  -> http://mlrun-api:8080
> 2020-10-13 08:57:57,583 [info] Job is running in the background, pod: fetch-data-999v7
> 2020-10-13 08:58:05,069 [info] Reading data from http://iguazio-sample-data.s3.amazonaws.com/ny_taxi_train_subset.csv
> 2020-10-13 08:58:08,652 [info] Saving nyc taxi dataset to /User/code-migration-mlrun/jobs/data ...
> 2020-10-13 08:58:28,893 [info] run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...a71f0405,0,Oct 13 08:58:04,completed,fetch_data,v3io_user=edmondkind=jobowner=edmondhost=fetch-data-999v7,csv_path,,,nyc-taxi-dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 2477d17301e841e9a3b3cc69a71f0405 --project nyc-taxi-edmond , !mlrun logs 2477d17301e841e9a3b3cc69a71f0405 --project nyc-taxi-edmond
> 2020-10-13 08:58:37,105 [info] run executed, status=completed


In [213]:
fetch_data_run.outputs

{'nyc-taxi-dataset': 'store://nyc-taxi-edmond/fetch_data_nyc-taxi-dataset#2477d17301e841e9a3b3cc69a71f0405'}

## Run transform_dataset locally

In [214]:
# Run the transform_dataset function locally in Jupyter Notebook, using the
# 'nyc-taxi-dataset' output of the latest fetch_data run as its input
transform_dataset_run = run_local(name='transform_dataset',
                         handler=transform_dataset,
                         inputs={'input_ds': fetch_data_run.outputs['nyc-taxi-dataset']},
                         project=project_name, artifact_path=artifact_path)

> 2020-10-13 08:58:41,969 [info] starting run transform_dataset uid=542fa3fc8ed24e358d745435fbf0abe1  -> http://mlrun-api:8080
> 2020-10-13 08:58:42,470 [info] Begin dataset transform
> 2020-10-13 08:59:33,373 [info] End dataset transform


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...fbf0abe1,0,Oct 13 08:58:42,completed,transform_dataset,v3io_user=edmondkind=handlerowner=edmondhost=jupyter-edmond-86984dbb4c-nsnll,input_ds,,,nyc-taxi-dataset-transformed


to track results use .show() or .logs() or in CLI: 
!mlrun get run 542fa3fc8ed24e358d745435fbf0abe1 --project nyc-taxi-edmond , !mlrun logs 542fa3fc8ed24e358d745435fbf0abe1 --project nyc-taxi-edmond
> 2020-10-13 08:59:33,634 [info] run executed, status=completed


## Run transform_dataset on a cluster

In [215]:
# Run transform_dataset through cluster_func; this rebinds transform_dataset_run,
# so the training cells below use the outputs of this run
transform_dataset_run = cluster_func.run(name='transform_dataset',
                                 handler='transform_dataset',
                                 inputs={'input_ds': fetch_data_run.outputs['nyc-taxi-dataset']},
                                 artifact_path=artifact_path)

> 2020-10-13 08:59:36,251 [info] starting run transform_dataset uid=627f0bc7aa6148e5b560588efcc2f5c8  -> http://mlrun-api:8080
> 2020-10-13 08:59:36,654 [info] Job is running in the background, pod: transform-dataset-tl7dw
> 2020-10-13 08:59:43,175 [info] Begin dataset transform
> 2020-10-13 09:00:37,475 [info] End dataset transform
> 2020-10-13 09:00:37,690 [info] run executed, status=completed
final state: succeeded


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...fcc2f5c8,0,Oct 13 08:59:42,completed,transform_dataset,v3io_user=edmondkind=jobowner=edmondhost=transform-dataset-tl7dw,input_ds,,,nyc-taxi-dataset-transformed


to track results use .show() or .logs() or in CLI: 
!mlrun get run 627f0bc7aa6148e5b560588efcc2f5c8 --project nyc-taxi-edmond , !mlrun logs 627f0bc7aa6148e5b560588efcc2f5c8 --project nyc-taxi-edmond
> 2020-10-13 09:00:46,442 [info] run executed, status=completed


In [216]:
transform_dataset_run.outputs

{'nyc-taxi-dataset-transformed': 'store://nyc-taxi-edmond/transform_dataset_nyc-taxi-dataset-transformed#627f0bc7aa6148e5b560588efcc2f5c8'}

## Train model locally

In [179]:
# Run the train_model function locally in Jupyter Notebook.
# 'model_path' is the current working directory, so train_model saves
# FareModel.pkl there.
train_model_run = run_local(name='train_model',
                            handler=train_model,
                            inputs={'input_ds': transform_dataset_run.outputs['nyc-taxi-dataset-transformed'], 
                                    'model_path' : path.abspath("")},
                            project=project_name, artifact_path=artifact_path)

> 2020-10-07 09:23:42,694 [info] starting run train_model uid=9f546507af984c39b614584414fe2068  -> http://mlrun-api:8080
> 2020-10-07 09:23:43,253 [info] Begin training
> 2020-10-07 09:23:43,253 [info] LGBM version is 2.3.0




Training until validation scores don't improve for 500 rounds
[500]	valid_0's rmse: 3.93997
[1000]	valid_0's rmse: 3.89675
[1500]	valid_0's rmse: 3.88275
[2000]	valid_0's rmse: 3.87709
[2500]	valid_0's rmse: 3.87271
[3000]	valid_0's rmse: 3.86834
[3500]	valid_0's rmse: 3.86719
[4000]	valid_0's rmse: 3.86519
[4500]	valid_0's rmse: 3.86336
[5000]	valid_0's rmse: 3.86383
Early stopping, best iteration is:
[4653]	valid_0's rmse: 3.863
> 2020-10-07 09:26:26,802 [info] End training


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...14fe2068,0,Oct 07 09:23:42,completed,train_model,v3io_user=edmondkind=handlerowner=edmondhost=jupyter-edmond-86984dbb4c-nsnll,input_dsmodel_path,,,FareModel


to track results use .show() or .logs() or in CLI: 
!mlrun get run 9f546507af984c39b614584414fe2068 --project nyc-taxi-edmond , !mlrun logs 9f546507af984c39b614584414fe2068 --project nyc-taxi-edmond
> 2020-10-07 09:26:27,058 [info] run executed, status=completed


## Train model on a cluster

In [217]:
# Run train_model through cluster_func. 'model_path' is the notebook's current
# working directory, evaluated here in the notebook and passed to the job.
train_model_run = cluster_func.run(name='train_model',
                                    handler='train_model',
                                    inputs={'input_ds': transform_dataset_run.outputs['nyc-taxi-dataset-transformed'], 
                                            'model_path': path.abspath("")},
                                    artifact_path=artifact_path)

> 2020-10-13 09:00:55,332 [info] starting run train_model uid=6584d7d0287148519625945450d77667  -> http://mlrun-api:8080
> 2020-10-13 09:00:56,006 [info] Job is running in the background, pod: train-model-5dkhp
> 2020-10-13 09:01:00,463 [info] Begin training
> 2020-10-13 09:01:00,463 [info] LGBM version is 3.0.0
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 55092
[LightGBM] [Info] Number of data points in the train set: 882018, number of used features: 17
[LightGBM] [Info] Start training from score 11.329779
Training until validation scores don't improve for 500 rounds
[500]	valid_0's rmse: 3.94808
[1000]	valid_0's rmse: 3.91576
[1500]	valid_0's rmse: 3.90318
[2000]	valid_0's rmse: 3.8973
[2500]	valid_0's rmse: 3.89285
[3000]	valid_0's rmse: 3.8871
[3500]	valid_0's rmse: 3.88441
[4000]	valid_0's rmse: 3.88309
[4500]	valid_0's rmse: 3.88056
[5000]	valid_0's rmse: 3.87909
[5500]	valid

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
nyc-taxi-edmond,...50d77667,0,Oct 13 09:01:00,completed,train_model,v3io_user=edmondkind=jobowner=edmondhost=train-model-5dkhp,input_dsmodel_path,,,FareModel


to track results use .show() or .logs() or in CLI: 
!mlrun get run 6584d7d0287148519625945450d77667 --project nyc-taxi-edmond , !mlrun logs 6584d7d0287148519625945450d77667 --project nyc-taxi-edmond
> 2020-10-13 09:04:15,915 [info] run executed, status=completed


# Test the model

In [218]:
# Build the prediction features from the test csv and preview the first rows
test_df =  prepare_test_df('https://s3.wasabisys.com/iguazio/data/Taxi/ny_taxi_train_test.csv')
test_df.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,-1.291078,0.711463,-1.29122,0.711114,1,13,27,1,1,2015,42.055277,35.042878,18.501273,18.309374,10.095252,2.32326,2.843096
1,-1.291314,0.710687,-1.291524,0.711033,1,13,27,1,1,2015,41.244373,30.827012,23.023659,12.664639,4.599471,2.425353,0.430894
2,-1.291239,0.711244,-1.291189,0.711154,1,11,8,10,5,2011,41.831497,33.993679,19.353797,17.018308,8.797452,0.618628,-2.740065
3,-1.291215,0.711532,-1.291377,0.71125,1,21,1,12,5,2012,43.964285,34.268523,19.525661,18.544904,10.709378,1.961033,2.731208
4,-1.290951,0.711916,-1.291344,0.711124,1,21,1,12,5,2012,44.128523,36.440152,18.414857,20.732468,12.752865,5.387301,2.781733


In [219]:
# Load the booster from FareModel.pkl in the current working directory, the
# location the train_model runs above were given as model_path
bst = lgbm.Booster(model_file=path.abspath("") + '/FareModel.pkl')

In [220]:
ypred = bst.predict(test_df)

In [221]:
ypred

array([10.29027249, 10.91768024,  4.78238741, ..., 52.35993969,
       19.02938427,  6.02166607])

# Serve the model

In [222]:
from mlrun import new_model_server
# Define a model-server function that serves the LGBMModel class, set its
# description, categories and labels, then export the function spec
fn = new_model_server('lgbm-model-server', model_class='LGBMModel')
fn.spec.description = "LGBMModel model server"
fn.metadata.categories = ['serving', 'ml']
fn.metadata.labels = {'author': 'edmondg', 'framework': 'LGBM'}
fn.export()

> 2020-10-13 09:04:48,933 [info] function spec saved to path: function.yaml


<mlrun.runtimes.function.RemoteRuntime at 0x7f525e4219b0>

In [223]:
from mlrun import mount_v3io
# Mount v3io on the function and register a model named 'nyc_taxi' through the
# SERVING_MODEL_nyc_taxi variable, set to the current working directory
fn.apply(mount_v3io())
fn.set_envs({'SERVING_MODEL_nyc_taxi': path.abspath("")})
#fn.verbose = True
# NOTE(review): the project is hard-coded to 'edmond' instead of project_name -
# confirm this is intended
address = fn.deploy(project='edmond')

> 2020-10-13 09:04:50,091 [info] deploy started
[nuclio] 2020-10-13 09:05:02,482 (info) Build complete
[nuclio] 2020-10-13 09:05:14,770 (info) Function deploy complete
[nuclio] 2020-10-13 09:05:14,784 done creating edmond-lgbm-model-server, function address: 34.202.248.16:32708


In [224]:
# Send one prediction request to the deployed 'nyc_taxi' model.
# The payload holds a single instance with 17 feature values.
# NOTE(review): the numbers look like placeholders rather than real ride features - confirm
predict_url = address+"/nyc_taxi/predict"
my_data = '''{"instances":[[5.1, 3.5, 1.4, 3, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 5.1, 3.5, 1.4, 0.2, 1]]}'''
!curl {predict_url} -d '{my_data}'

[69.35072877508459]