# Import Libraries

In [23]:
import json
import requests
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import mlflow
import mlflow.sklearn

# Loading Dataset

In [24]:
# Read the listings CSV (path is relative to the working directory) into a DataFrame.
airbnb = pd.read_csv('data/AB_NYC_2019.csv')

In [25]:
# Display (rows, columns) of the loaded frame.
airbnb.shape

(48895, 16)

# EDA and Data Cleaning

In [26]:
# Count rows that are exact duplicates of an earlier row.
airbnb.duplicated().sum()

0

In [27]:
# Remove exact duplicate rows; rebinding the name keeps the cell free of in-place mutation.
airbnb = airbnb.drop_duplicates()

In [28]:
# Count missing values per column.
airbnb.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [29]:
# Discard columns that the rest of this notebook does not use.
unused_columns = ['name', 'id', 'host_name', 'last_review']
airbnb = airbnb.drop(columns=unused_columns)

In [30]:
# Preview the first three rows after dropping columns.
airbnb.head(3)

Unnamed: 0,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,1,365


In [31]:
# Replace missing reviews_per_month values with 0.
airbnb = airbnb.fillna({'reviews_per_month': 0})
# Examining the change: the column should now contain no nulls.
airbnb['reviews_per_month'].isnull().sum()

0

In [32]:
# Drop every row that still contains a missing value, then summarise the
# remaining columns (dtype and non-null count per column).
airbnb = airbnb.dropna(how='any')
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48895 entries, 0 to 48894
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   host_id                         48895 non-null  int64  
 1   neighbourhood_group             48895 non-null  object 
 2   neighbourhood                   48895 non-null  object 
 3   latitude                        48895 non-null  float64
 4   longitude                       48895 non-null  float64
 5   room_type                       48895 non-null  object 
 6   price                           48895 non-null  int64  
 7   minimum_nights                  48895 non-null  int64  
 8   number_of_reviews               48895 non-null  int64  
 9   reviews_per_month               48895 non-null  float64
 10  calculated_host_listings_count  48895 non-null  int64  
 11  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(6), object(

In [33]:
# Summary statistics for the numeric columns.
airbnb.describe()

Unnamed: 0,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0
mean,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.09091,7.143982,112.781327
std,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.597283,32.952519,131.622289
min,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.0,1.0,0.0
25%,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.04,1.0,0.0
50%,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.37,1.0,45.0
75%,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,1.58,2.0,227.0
max,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [34]:
# List the remaining column names (and therefore their positional order).
airbnb.columns

Index(['host_id', 'neighbourhood_group', 'neighbourhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

# Encode the input Variables

In [35]:

def Encode(airbnb):
    """Return a copy of ``airbnb`` with its categorical columns integer-encoded.

    The columns ``neighbourhood_group`` and ``room_type`` (whichever of them
    are present) are replaced by the integer codes produced by
    ``Series.factorize()``: each distinct value gets a code in order of first
    appearance, starting at 0. All other columns are left as they are.

    Parameters
    ----------
    airbnb : pandas.DataFrame
        Frame to encode. It is not modified; the encoding is applied to a copy.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, where the two categorical columns
        hold integer codes.
    """
    # Work on a copy so the caller's frame keeps its original string values.
    encoded = airbnb.copy()
    for column in ('neighbourhood_group', 'room_type'):
        if column in encoded.columns:
            encoded[column] = encoded[column].factorize()[0]
    return encoded

# Encode a copy so that `airbnb` itself keeps its string-valued categories.
airbnb_en = Encode(airbnb.copy())

# Defining the independent variables and dependent variables

In [36]:

# Select the model inputs by name instead of by position, so the selection
# cannot silently shift if the frame's column order ever changes.
# NOTE(review): host_id is an identifier; confirm it is meant to be a predictor.
feature_columns = ['host_id', 'neighbourhood_group', 'latitude', 'longitude', 'room_type']
x = airbnb_en[feature_columns]
# Target: the listing price.
y = airbnb_en['price']

# Splitting dataset into train and test set

In [37]:
# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=353,
)
x_train.head()

Unnamed: 0,host_id,neighbourhood_group,latitude,longitude,room_type
17736,29513490,0,40.6837,-73.93325,1
24351,18632318,1,40.8023,-73.96688,0
10069,40608098,1,40.78502,-73.94763,1
11679,47406119,1,40.72558,-74.00195,0
16315,10676792,0,40.68457,-73.9262,1


In [38]:
# Preview the training targets; the index labels should line up with x_train's.
y_train.head()

17736    125
24351    101
10069    189
11679    145
16315    120
Name: price, dtype: int64

In [39]:
# Display (rows, feature columns) of the training split.
x_train.shape

(44005, 5)

# Setup MLflow Tracking
Make sure the MLflow tracking service is running before running the following cell.
If the service is not up, run `docker-compose up --build -d` in the project root to start it.

In [40]:
# Point the MLflow client at the tracking server, then select the experiment
# under which subsequent runs are recorded.
tracking_uri = 'http://0.0.0.0:5000'
experiment_name = 'airbnb-predict-price'
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

2022/03/30 01:16:59 INFO mlflow.tracking.fluent: Experiment with name 'airbnb-predict-price' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', experiment_id='1', lifecycle_stage='active', name='airbnb-predict-price', tags={}>

# Fit Model and Log to MLflow

In [41]:
# Fit a linear regression inside an MLflow run so that the evaluation metric
# and the fitted model are logged to the same run. `run` is kept because the
# registration cell reads run.info.run_id from it.
with mlflow.start_run() as run:
    reg = LinearRegression()
    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)

    # r2_score comes from the notebook's import cell; no local import needed.
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("r2", r2)
    mlflow.sklearn.log_model(reg, "model")

# Registering the Model

In [42]:
# Register the "model" artifact of the training run under a fixed registry name.
model_uri = f"runs:/{run.info.run_id}/model"
result = mlflow.register_model(model_uri, "sklearn-linear-regression-model")

Successfully registered model 'sklearn-linear-regression-model'.
2022/03/30 01:17:03 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: sklearn-linear-regression-model, version 1
Created version '1' of model 'sklearn-linear-regression-model'.


# Serving Model

In [43]:
%%bash -s "$run.info.run_id"
# $1 is the MLflow run id passed in on the magic line above.
# Expansions are quoted so a project path containing spaces is not word-split,
# and a dedicated variable is used instead of overwriting the shell's own PWD.
project_root="$(pwd)"
docker run -d --rm -v "${project_root}/minio/data/mlflow:/data" -p 3000:3000 -e MLFLOW_RUN_ID="${1}" --name mlflow-serving --entrypoint "./serve_entrypoint.sh" dyson-test_mlflow:latest

292c869570079065343500ec55addab8f6440845b54eb6d77a5fd29b361a4e12


# Send Request to Model Server

In [44]:
# Build a single-row payload in "columns"/"data" form. The column names must
# match the names the model was trained on, including the British spelling
# "neighbourhood_group" used by the dataset.
data = {
    "columns": [
        "host_id",
        "neighbourhood_group",
        "latitude",
        "longitude",
        "room_type"
    ],
    "data": [
        [8072802, 0, 40.71790, -73.95103, 0]
    ]
}
# A timeout keeps the cell from hanging indefinitely if the serving container
# is not reachable.
r = requests.post(
    "http://0.0.0.0:3000/invocations",
    data=json.dumps(data),
    headers={"Content-Type": "application/json"},
    timeout=30,
)
print(r.status_code)
print(json.loads(r.text))


200
[93.28681328838866]
