In [1]:
import os
import uuid
import pickle

import pandas as pd
import numpy as np

import mlflow

In [2]:
# MLflow tracking server used to resolve the `runs:/...` model URI further down.
# The MLFLOW_TRACKING_URI environment variable takes precedence when it is set,
# so the notebook can be pointed at another server without editing code (the
# same override pattern RUN_ID uses); the fallback is the local dev server.
TRACKING_URL = os.getenv("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")
mlflow.set_tracking_uri(TRACKING_URL)

print(f"Tracking URI: '{mlflow.get_tracking_uri()}'")

Tracking URI: 'http://127.0.0.1:5000'


In [None]:
# Snapshot to score. With the values below the source URL resolves to:
# 'https://data.insideairbnb.com/united-states/ny/albany/2024-09-05/visualisations/listings.csv'
city = 'albany'
year, month, day = 2024, 9, 5

# MLflow run that supplies the model; the RUN_ID environment variable overrides
# the default id.
RUN_ID = os.getenv('RUN_ID', '1936d050006746eeaa60c76db167d18c')

# Remote listings CSV for the snapshot and the local CSV the predictions are
# written to (the output name keeps only the year and month of the snapshot).
input_file = f'https://data.insideairbnb.com/united-states/ny/{city}/{year:04d}-{month:02d}-{day:02d}/visualisations/listings.csv'
output_file = f'output/{city}/{year:04d}-{month:02d}-listings.csv'

print("Input file: ", input_file)
print("output_file: ", output_file)

Input file:  https://data.insideairbnb.com/united-states/ny/albany/2024-09-05/visualisations/listings.csv
output_file:  output/albany/2024-09-listings.csv


In [4]:
def generate_uuids(n):
    """Return a list of ``n`` freshly generated random UUID strings.

    Every element is ``str(uuid.uuid4())``. A non-positive ``n`` yields an
    empty list.
    """
    return [str(uuid.uuid4()) for _ in range(n)]

def read_dataframe(filename):
    """Load a listings CSV and prepare it for scoring.

    Processing steps:
      * rows without a ``price`` are dropped;
      * missing ``reviews_per_month`` values are replaced by 0;
      * ``last_review`` is parsed with ``pd.to_datetime``;
      * a ``request_id`` column holding one random UUID string per remaining
        row is appended.

    Parameters
    ----------
    filename
        Anything ``pandas.read_csv`` accepts (local path, URL, buffer).

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The index is whatever ``dropna`` leaves behind;
        it is not reset here.
    """
    raw = pd.read_csv(filename)

    # Build the cleaned frame with ``assign`` rather than writing columns into
    # the result of ``dropna``: item assignment on that derived frame can emit
    # pandas' SettingWithCopyWarning, whereas ``assign`` always returns a new
    # frame. Existing columns keep their position; ``request_id`` goes last.
    return raw.dropna(subset=['price']).assign(
        reviews_per_month=lambda d: d['reviews_per_month'].fillna(0),
        last_review=lambda d: pd.to_datetime(d['last_review']),
        request_id=lambda d: generate_uuids(len(d)),
    )


def preprocess(df):
    """Drop price outliers using the 1.5 * IQR rule.

    Rows are kept when ``price`` lies inside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (both ends inclusive), where Q1 and
    Q3 are the 25th and 75th percentiles of ``price``. The returned frame has
    a fresh 0..n-1 index; the input frame is not modified.
    """
    prices = df['price']
    first_quartile = prices.quantile(0.25)
    third_quartile = prices.quantile(0.75)

    # Allowed distance beyond each quartile.
    whisker = 1.5 * (third_quartile - first_quartile)

    # ``between`` includes both bounds by default.
    within_bounds = prices.between(first_quartile - whisker, third_quartile + whisker)

    return df.loc[within_bounds].reset_index(drop=True)

def prepare_dictionaries(df: pd.DataFrame):
    """Convert listing rows into the list of feature dicts passed to the model.

    Each dict holds the categorical features (``room_type``,
    ``neighbourhood``) cast to ``str``, followed by the numerical features
    taken as they are. Columns outside these two lists are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every categorical and numerical feature column. It is
        NOT modified: the string cast is applied to a selection, not written
        back into ``df``.

    Returns
    -------
    list[dict]
        One record per row, keys ordered categorical first, then numerical.
    """
    # Feature selection
    numerical = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365']
    categorical = ['room_type', 'neighbourhood']

    # Select first (this yields a new frame), then cast, so the conversion to
    # str never leaks into the caller's DataFrame.
    features = df[categorical + numerical].astype({col: str for col in categorical})

    return features.to_dict(orient='records')

In [None]:
def load_model(run_id):
    """Fetch the model logged under ``runs:/<run_id>/model`` from MLflow.

    The artifact is loaded through ``mlflow.sklearn.load_model`` rather than
    as a generic ``mlflow.pyfunc`` model.
    """
    model_uri = f'runs:/{run_id}/model'
    return mlflow.sklearn.load_model(model_uri)


def apply_model(input_file, run_id, output_file):
    """Score one listings file with the model of an MLflow run and save a CSV.

    The input is read and cleaned with ``read_dataframe`` / ``preprocess``,
    converted to feature dicts with ``prepare_dictionaries`` and passed to the
    model returned by ``load_model(run_id)``.

    Parameters
    ----------
    input_file
        Listings CSV to score (anything ``read_dataframe`` accepts).
    run_id : str
        MLflow run id of the model; also stored in the ``model_version``
        column of the output.
    output_file : str or os.PathLike
        Destination CSV. Missing parent directories are created first.

    Returns
    -------
    None
        The result is only written to ``output_file``. Its columns are the
        request id, the model features, ``actual_price``, ``predicted_price``,
        ``diff`` (actual minus predicted) and ``model_version``.
    """
    df = preprocess(read_dataframe(input_file))
    dicts = prepare_dictionaries(df)

    model = load_model(run_id)
    y_pred = model.predict(dicts)
    # Predictions are mapped back with 10 ** y_pred, rounded to 2 decimals.
    # NOTE(review): presumably the model was trained on log10(price) -
    # confirm against the training code.
    predicted_price = np.round(np.power(10, y_pred), 2)

    # Columns copied through unchanged, in the order they appear in the output.
    passthrough = [
        'request_id',
        'room_type',
        'neighbourhood',
        'latitude',
        'longitude',
        'minimum_nights',
        'number_of_reviews',
        'reviews_per_month',
        'availability_365',
    ]
    df_result = df[passthrough].copy()
    df_result['actual_price'] = df['price']
    df_result['predicted_price'] = predicted_price
    df_result['diff'] = df_result['actual_price'] - df_result['predicted_price']
    df_result['model_version'] = run_id

    # to_csv does not create directories, so a first run for a new city would
    # fail on the missing 'output/<city>/' folder. Only path-like targets are
    # handled; buffers are passed to to_csv untouched.
    if isinstance(output_file, (str, os.PathLike)):
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    df_result.to_csv(output_file, index=False)

In [6]:
# Optional sanity check - uncomment to preview the cleaned input before scoring:
# df = read_dataframe(input_file)
# df = preprocess(df)
# df.head()

In [7]:
# Run the batch scoring: read input_file, predict with the model from RUN_ID,
# and write the predictions to output_file.
apply_model(input_file=input_file, run_id=RUN_ID, output_file=output_file)

In [8]:
!ls output/albany/

2024-09-listings.csv


In [9]:
# Show the path of the CSV that apply_model was asked to write.
print(output_file)

output/albany/2024-09-listings.csv


In [10]:
# Read the written predictions back from disk and preview the first rows.
result = pd.read_csv(output_file)
result.head()

Unnamed: 0,request_id,room_type,neighbourhood,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,availability_365,actual_price,predicted_price,diff,model_version
0,b2dd609d-f761-4c88-8693-512d130793d4,Entire home/apt,THIRD WARD,42.65789,-73.7537,28,9,0.07,158,70.0,107.49,-37.489998,7b5744464f544451aee4a9308d1971ad
1,c7f29750-31ab-468c-9d40-6263f5d0cce2,Entire home/apt,SIXTH WARD,42.65222,-73.76724,2,305,2.49,336,116.0,107.75,8.25,7b5744464f544451aee4a9308d1971ad
2,78b67f9c-dbba-476e-a31e-849f4fea930c,Entire home/apt,SECOND WARD,42.64615,-73.75966,2,366,3.22,34,75.0,98.79,-23.790001,7b5744464f544451aee4a9308d1971ad
3,0448df82-25bc-43b1-b239-7184a867c464,Entire home/apt,SIXTH WARD,42.65222,-73.76724,2,328,2.91,279,116.0,104.04,11.959999,7b5744464f544451aee4a9308d1971ad
4,3b9b9d92-b481-4665-9f96-8fc241ab1611,Private room,TENTH WARD,42.66063,-73.78115,3,18,0.17,179,199.0,73.69,125.309998,7b5744464f544451aee4a9308d1971ad
