In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor

In [27]:
# Read the house-sales CSV, using the `id` column as the row index,
# then preview the first rows.
df = pd.read_csv('kc_house_data.csv', index_col='id')
df.head()

Unnamed: 0_level_0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [28]:
# Print the per-column dtype and non-null count summary of the loaded frame.
df.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 21613 entries, 7129300520 to 1523300157
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           21613 non-null  object 
 1   price          21613 non-null  float64
 2   bedrooms       21613 non-null  int64  
 3   bathrooms      21613 non-null  float64
 4   sqft_living    21613 non-null  int64  
 5   sqft_lot       21613 non-null  int64  
 6   floors         21613 non-null  float64
 7   waterfront     21613 non-null  int64  
 8   view           21613 non-null  int64  
 9   condition      21613 non-null  int64  
 10  grade          21613 non-null  int64  
 11  sqft_above     21613 non-null  int64  
 12  sqft_basement  21613 non-null  int64  
 13  yr_built       21613 non-null  int64  
 14  yr_renovated   21613 non-null  int64  
 15  zipcode        21613 non-null  int64  
 16  lat            21613 non-null  float64
 17  long           21613 non-null  float64
 1

In [29]:
# Separate the features from the regression target; `y` stays a
# one-column DataFrame.
target_col = 'price'
X = df.drop(columns=target_col)
y = df[[target_col]]

# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Replace the raw date strings with the int64 form of the parsed timestamps
# so the column is numeric. `assign` overwrites `date` at its existing
# position, so the column order is unchanged.
X_train = X_train.assign(date=pd.to_datetime(X_train['date']).astype('int64'))
X_valid = X_valid.assign(date=pd.to_datetime(X_valid['date']).astype('int64'))

# First raw feature row (date still a string) as a plain dict; reused later
# as an example payload for single-row inference.
sample_row = X.iloc[0].to_dict()


{'date': '20141013T000000', 'bedrooms': 3, 'bathrooms': 1.0, 'sqft_living': 1180, 'sqft_lot': 5650, 'floors': 1.0, 'waterfront': 0, 'view': 0, 'condition': 3, 'grade': 7, 'sqft_above': 1180, 'sqft_basement': 0, 'yr_built': 1955, 'yr_renovated': 0, 'zipcode': 98178, 'lat': 47.5112, 'long': -122.257, 'sqft_living15': 1340, 'sqft_lot15': 5650}


In [30]:
# Fit a LightGBM regressor with default hyper-parameters on the training
# split and score the held-out rows.
model = LGBMRegressor()
model.fit(X_train, y_train)
preds = model.predict(X_valid)

# Report each validation metric as "<label>: <value>".
for label, metric in (('mse', mean_squared_error), ('mae', mean_absolute_error)):
    print(f'{label}: {metric(y_valid, preds)}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2576
[LightGBM] [Info] Number of data points in the train set: 15129, number of used features: 19
[LightGBM] [Info] Start training from score 540606.716108
mse: 15590471696.584267
mae: 65132.62680667003


In [37]:
import datetime
from typing import Optional
from pydantic import BaseModel, ValidationError

class House(BaseModel):
    """Validated feature record for a single house.

    The fields are declared in the same order as the feature columns of the
    training frame (every CSV column except ``price``), so a DataFrame built
    from ``model_dump()`` lines up with what the model was fitted on. Keep
    that order when editing this class.
    """

    # Sale date as it appears in the CSV, e.g. '20141013T000000'. Typed
    # `object`, so any value passes validation here; the callers convert it
    # with `pd.to_datetime` afterwards.
    date: object
    bedrooms: int
    bathrooms: float
    sqft_living: int
    sqft_lot: int
    floors: float
    waterfront: int
    view: int
    condition: int
    grade: int
    sqft_above: int
    sqft_basement: int
    yr_built: int
    # May be omitted by the caller. Defaults to 0 instead of None: a None
    # would turn this column into object dtype in the one-row DataFrame,
    # which the fitted model cannot score. 0 is the value the sample rows
    # carry when no renovation year is recorded (presumably "never
    # renovated" - confirm against the dataset description).
    yr_renovated: Optional[int] = 0
    zipcode: int
    lat: float
    long: float
    sqft_living15: int
    sqft_lot15: int

# Smoke-test the schema and the model with the example row captured earlier.
print(sample_row)
house = House(**sample_row)

# One-row frame built from the validated record, with the date encoded the
# same way as for the training data.
inputs = pd.DataFrame([house.model_dump()]).assign(
    date=lambda frame: pd.to_datetime(frame['date']).astype('int64')
)

pred = model.predict(inputs)[0]
float(pred)



{'date': '20141013T000000', 'bedrooms': 3, 'bathrooms': 1.0, 'sqft_living': 1180, 'sqft_lot': 5650, 'floors': 1.0, 'waterfront': 0, 'view': 0, 'condition': 3, 'grade': 7, 'sqft_above': 1180, 'sqft_basement': 0, 'yr_built': 1955, 'yr_renovated': 0, 'zipcode': 98178, 'lat': 47.5112, 'long': -122.257, 'sqft_living15': 1340, 'sqft_lot15': 5650}


225382.87435824506

In [None]:
from flask import Flask, jsonify, request

# Flask application object; the route handlers in this cell are registered on it.
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the price of one house from a JSON feature payload.

    The POST body must be a JSON object whose keys are the ``House`` fields.

    Returns:
        A ``(response, status)`` pair:

        * ``{'predict': <float>}`` with 200 on success;
        * a JSON list of error dicts with 400 when the body is not a JSON
          object, fails ``House`` validation, or carries a date that pandas
          cannot parse.
    """
    payload = request.json

    # `House(**payload)` needs a mapping. A JSON array, string, number or
    # null would raise TypeError there and surface as a 500.
    if not isinstance(payload, dict):
        return jsonify([{
            'type': 'invalid_body',
            'loc': [],
            'msg': 'Request body must be a JSON object',
        }]), 400

    try:
        house = House(**payload)
    except ValidationError as e:
        return jsonify(e.errors()), 400

    features = house.model_dump()
    # An explicit null passes validation (the field is Optional) but would
    # give the column object dtype, which the model cannot score. Fall back
    # to 0, the value the training rows carry when no renovation year is
    # recorded (presumably "never renovated" - confirm).
    if features['yr_renovated'] is None:
        features['yr_renovated'] = 0

    inputs = pd.DataFrame([features])
    try:
        inputs['date'] = pd.to_datetime(inputs['date']).astype('int64')
    except (ValueError, TypeError) as exc:
        # `date` is typed `object`, so malformed values only fail here;
        # report them as a client error instead of a 500.
        return jsonify([{
            'type': 'date_parsing',
            'loc': ['date'],
            'msg': f'Could not parse date: {exc}',
        }]), 400

    pred = model.predict(inputs)[0]
    return jsonify({'predict': float(pred)}), 200

# Start Flask's built-in server when this code runs as the main module.
# NOTE(review): inside a notebook kernel `__name__` is also '__main__', so
# running this cell blocks until the server is interrupted.
if __name__ == '__main__':
    app.run()