In [1]:
import ray.data
import ray
import pandas as pd
from prophet import Prophet
import logging
import os
# for testing
import time
import joblib
import warnings
from sklearn.preprocessing import KBinsDiscretizer,LabelEncoder
from dotenv import load_dotenv
import os
import psycopg2
import uuid
import pandas.io.sql as psql
import mlflow
from sklearn.metrics import root_mean_squared_error,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error,median_absolute_error
import matplotlib.pyplot as plt
# Load environment variables from the .env file (if present)
load_dotenv()
warnings.filterwarnings("ignore")

%matplotlib inline

#### predict ec price based on the area and the year after lease commencing

In [2]:
def get_dataframe_from_sql(table_name: str, conn_params: dict):
    connection = psycopg2.connect(**conn_params)
    cursor = connection.cursor()
    df = psql.read_sql(f"Select * from {table_name}", connection)
    return df

def fetch_raw_data(raw_data_path: str ="../data/raw/transaction.csv"): 
    load_dotenv()

    database = os.getenv('database')
    dbuser = os.getenv('dbuser')
    dbpassword = os.getenv('dbpassword')

    conn_params = {
        "host":"localhost",
        "database":database,
        "user":dbuser,
        "password":dbpassword

    }

    table_name = "propertypricetable"
    df_fetch = get_dataframe_from_sql(table_name, conn_params)
    df_fetch.to_csv(raw_data_path,index=False)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder

In [4]:
def preprocess_ec_data(raw_data_path: str ="../data/raw/transaction.csv") -> pd.DataFrame:
    df_raw = pd.read_csv(raw_data_path,parse_dates=['contract_date'])
    df_ec = df_raw.loc[(df_raw['property_type']=='Executive Condominium')]
    df_ec['tenure_start'] = df_ec['tenure'].apply(lambda x: int(x.split(' ')[-1]))
    #print(df_ec['tenure_start'].unique())
    df_ec['year'] = df_ec['contract_date'].dt.year
    df_ec['month'] = df_ec['contract_date'].dt.month
    df_ec['year_after_lease_com'] =  df_ec['year']  - df_ec['tenure_start']
    df_ec['year_after_lease_com'] = df_ec['year_after_lease_com'].astype(int)
    #feature_cols = ['contract_date','area','year_after_lease_com']
    #target_cols = ['price']
    #df_ec = df_ec[feature_cols+target_cols]
    df_ec = df_ec[df_ec['type_of_sale']==3].drop(columns=['id','contract_date','no_of_units','type_of_sale','tenure_start','property_type','tenure','type_of_area','nett_price','project','street','x','y'])

    
    return df_ec

def prep_ec_data(
    df: pd.DataFrame, 
    area: int = 0, 
    year_after_lease_com: int = 1,
    ) -> pd.DataFrame:
    df_select = df[
        (df['area'] == area) &\
        (df['year_after_lease_com'] == year_after_lease_com)
    ].reset_index(drop=True)
    df_select['contract_date'] = pd.to_datetime(df_select['contract_date'])
    df_select.rename(columns= {'contract_date': 'ds', 'price': 'y'}, inplace=True)
    return df_select[['ds','y']].sort_values('ds', ascending=True) 

  

In [66]:

file_path = "../data/raw/transaction.csv"
if os.path.exists(file_path):
    logging.info('Dataset found, reading into pandas dataframe.')
    df = preprocess_ec_data(file_path)
else:
    logging.info('Dataset not found, downloading ...')
    fetch_raw_data(file_path)
    logging.info('Reading dataset into pandas dataframe.')
    df = preprocess_ec_data(file_path)

df.head()

Unnamed: 0,area,floor_range,price,district,market_segment,year,month,year_after_lease_com
53105,205.0,16-20,2118000.0,19,OCR,2023,9,13
53106,100.0,01-05,1500000.0,19,OCR,2023,9,13
53107,100.0,11-15,995000.0,19,OCR,2019,10,9
53108,92.0,11-15,875000.0,19,OCR,2019,10,9
53109,100.0,06-10,955000.0,19,OCR,2020,2,10


In [None]:
import os
from dotenv import load_dotenv
import mlflow
from mlflow.client import MlflowClient
from urllib.parse import urlparse

load_dotenv()

MLFLOW_TRACKING_URI=os.getenv('MLFLOW_TRACKING_URI')
MLFLOW_TRACKING_USERNAME=os.getenv('MLFLOW_TRACKING_USERNAME')
MLFLOW_TRACKING_PASSWORD=os.getenv('MLFLOW_TRACKING_PASSWORD')

In [85]:
import xgboost as xgb
import numpy as np
import lightgbm as lgb

def train_model(df,model_path):
    df['market_segment'] = df['market_segment'].astype('category')
    df_train = df.loc[(df['year']<=2023)]
    df_test = df.loc[df['year']>2023]
    df_train.to_csv('../data/processed/train.csv',index=False)
    df_test.to_csv('../data/processed/test.csv',index=False)
    X_train = df_train.drop(columns=['price'])
    X_test = df_test.drop(columns=['price'])
    y_train = df_train['price']
    y_test = df_test['price']
    num_transformer_kbin = Pipeline(
        steps=[
            ("kbin_encoder", KBinsDiscretizer(n_bins=5, encode="ordinal", strategy='quantile'))
        ])

    cat_transformer_label = Pipeline(
        steps=[
            ("label_encoder", OrdinalEncoder()),
        ])


    preprocessor  = ColumnTransformer(transformers = [
                                            ('num_kbin', num_transformer_kbin, ['area']),
                                            ('cat_label', cat_transformer_label, ['floor_range','market_segment'])
                                            ],
                                       remainder='passthrough'     
                                            )

    reg = lgb.LGBMRegressor(n_estimators=1000)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor),('regressor',reg)])#,('sd',StandardScaler()),('pca',PCA(n_components=2,random_state=random_state)),('clf',KNeighborsClassifier())

    pipeline.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)
    mse = np.sqrt(mean_squared_error(y_test, y_pred))
    mape  = mean_absolute_percentage_error(y_test, y_pred)
    mae = mean_absolute_error(y_test,y_pred)
    joblib.dump(pipeline, model_path)
    return pipeline,mse,mape,mae

    

In [86]:
import logging
log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 
logging.basicConfig(format = log_format, level = logging.INFO) 


mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI) 
logging.info("Defined MLFlowClient and set tracking URI.")

mlflow.set_experiment("prophet_models")
#mlflow.autolog()
import os
import mlflow


def get_experiment_id(name):
    exp = mlflow.get_experiment_by_name(name)
    if exp is None:
      exp_id = mlflow.create_experiment(name)
      return exp_id
    return exp.experiment_id


exp_id = get_experiment_id("ec_price_prediction")
print(exp_id)




3


In [90]:
with mlflow.start_run(experiment_id=exp_id):
    mlflow.autolog()
    pipeline,rmse,mape,mae = train_model(df,'../models/model.pkl')
    print('rmse: {rmse}, mape: {mape}, mae: {mae}')
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":
        # Register the model
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        mlflow.sklearn.log_model(pipeline, artifact_path="model")#, registered_model_name=model_name
    else:
        mlflow.sklearn.log_model(pipeline, "model")        
        mlflow.log_metrics(
            {
                'rmse': rmse,
                'mean_abs_perc_error': mape,
                'mean_abs_error': mae,
            }
        )

2024/09/11 16:23:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.
2024/09/11 16:23:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2024/09/11 16:23:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/09/11 16:23:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 65
[LightGBM] [Info] Number of data points in the train set: 7877, number of used features: 7
[LightGBM] [Info] Start training from score 1185356.086962
rmse: {rmse}, mape: {mape}, mae: {mae}


In [91]:
df_test = pd.read_csv('../data/processed/test.csv')
df_test.head()

Unnamed: 0,area,floor_range,price,district,market_segment,year,month,year_after_lease_com
0,92.0,06-10,1400000.0,19,OCR,2024,7,14
1,126.0,01-05,1650000.0,19,OCR,2024,3,14
2,100.0,11-15,1600000.0,19,OCR,2024,3,14
3,77.0,16-20,1195000.0,19,OCR,2024,3,14
4,114.0,06-10,1715000.0,19,OCR,2024,4,14


In [100]:
data = df_test.drop(columns=['price']).iloc[0,:].to_dict()
data

{'area': 92.0,
 'floor_range': '06-10',
 'district': 19,
 'market_segment': 'OCR',
 'year': 2024,
 'month': 7,
 'year_after_lease_com': 14}

In [102]:
model = joblib.load('./models/model.pkl')  # Load model
y_pred = pipeline.predict(pd.DataFrame([data]))
y_pred

array([1338807.90396518])

In [None]:
y