In [1]:
import os
import joblib
import marshal
import tarfile
import subprocess
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import mlflow
import warnings

# Suppress every warning category for the rest of the session so cell output stays compact.
warnings.simplefilter("ignore")

# Relative directory that fetch_housing_data() uses by default to locate housing.tgz.
HOUSING_PATH = "../dataset"

In [2]:
def fetch_housing_data(housing_path=HOUSING_PATH):
    """Unpack ``housing.tgz`` found in ``housing_path`` and load ``housing.csv``.

    Parameters
    ----------
    housing_path : str
        Directory that contains ``housing.tgz``. The archive is extracted
        into this same directory.

    Returns
    -------
    pandas.DataFrame
        The contents of ``<housing_path>/housing.csv`` as parsed by
        ``pd.read_csv``.

    Raises
    ------
    Any exception raised by ``tarfile.open``, ``extractall`` or
    ``pd.read_csv`` is propagated unchanged (for example when the archive
    or the CSV is missing).
    """
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # The context manager closes the archive even when extraction fails,
    # so the file handle is never leaked.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only run this on a housing.tgz from a trusted source.
        housing_tgz.extractall(path=housing_path)
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

# Load the housing table once; later cells select their columns from `housing`.
housing = fetch_housing_data(housing_path=HOUSING_PATH)

In [3]:
# Column roles: the regression target, the categorical feature(s), the numeric features.
targetCol = "median_house_value"
catCols = ["ocean_proximity"]
numCols = [
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]

# Feature frame keeps the numeric columns first, followed by the categorical ones.
x = housing[[*numCols, *catCols]]
y = housing[targetCol]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [5]:
# Score the first `batch_size` test rows against the model server listening on
# 127.0.0.1:31236 and report the RMSE against the matching targets.
batch_size = 80

# Slice features and targets positionally with the same bound so rows stay aligned.
x_batch = x_test.iloc[:batch_size]
y_batch = y_test.iloc[:batch_size]
input_json = x_batch.to_json(orient="split")

proc = subprocess.run(
    [
        "curl",
        "-X", "POST",
        "-H", "Content-Type:application/json; format=pandas-split",
        "--data", input_json,
        "http://127.0.0.1:31236/invocations",
    ],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,  # captured so a curl failure can be reported below
    encoding='utf-8',
)

# A non-zero exit means curl could not complete the request (e.g. connection
# refused); fail here instead of trying to parse an empty body.
if proc.returncode != 0:
    raise RuntimeError(
        f"curl exited with code {proc.returncode}: {proc.stderr.strip()}"
    )

output = proc.stdout
try:
    response = json.loads(output)
except json.JSONDecodeError as err:
    raise RuntimeError(
        f"Scoring server returned a non-JSON body: {output!r}"
    ) from err

# Anything other than a JSON list (e.g. an error object) cannot be turned into
# one prediction per row, so surface the raw body instead of a shape error.
if not isinstance(response, list):
    raise RuntimeError(
        f"Scoring server did not return a list of predictions: {output}"
    )

predictions = np.array(response).reshape(-1,1)
if len(predictions) != len(y_batch):
    raise RuntimeError(
        f"Expected {len(y_batch)} predictions, got {len(predictions)}"
    )

# NOTE(review): the RMSE recorded under this cell (~2.3e5) looks large for this
# target; confirm the served model expects these raw feature columns.
print(f'RMSE : {np.sqrt(mean_squared_error(y_batch, predictions))}')

RMSE : 232556.79494488545
