In [None]:
import ray.data
import ray
import pandas as pd
from prophet import Prophet
import logging
import os
# for testing
import time

import warnings
from sklearn.preprocessing import KBinsDiscretizer,LabelEncoder
from dotenv import load_dotenv
import os
import psycopg2
import uuid
import pandas.io.sql as psql
import mlflow
from sklearn.metrics import root_mean_squared_error,mean_absolute_error,mean_squared_error,mean_absolute_percentage_error,median_absolute_error
import matplotlib.pyplot as plt
# Load environment variables from the .env file (if present)
load_dotenv()
warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
from prophet.serialize import model_to_json, model_from_json

def save_prophet_model(model,model_path:str):
    with open(model_path, 'w') as fout:
        fout.write(model_to_json(model))  # Save model
def load_prophet_model(model_path:str):
    with open(model_path, 'r') as fin:
        m = model_from_json(fin.read())  # Load model
    return m

In [None]:
model = load_prophet_model('../models/model_area2_year6.json')
ds = pd.date_range(start='2024-07-01',end='2024-09-01',freq='MS')
df_test = pd.DataFrame(ds,columns=['ds'])
model.predict(df_test).loc[:, 'yhat']

#### predict ec price based on the area and the year after lease commencing

In [None]:
def get_dataframe_from_sql(table_name: str, conn_params: dict):
    connection = psycopg2.connect(**conn_params)
    cursor = connection.cursor()
    df = psql.read_sql(f"Select * from {table_name}", connection)
    return df

def fetch_raw_data(raw_data_path: str ="../data/raw/transaction.csv"): 
    load_dotenv()

    database = os.getenv('database')
    dbuser = os.getenv('dbuser')
    dbpassword = os.getenv('dbpassword')

    conn_params = {
        "host":"localhost",
        "database":database,
        "user":dbuser,
        "password":dbpassword

    }

    table_name = "propertypricetable"
    df_fetch = get_dataframe_from_sql(table_name, conn_params)
    df_fetch.to_csv(raw_data_path,index=False)

In [None]:
def preprocess_ec_data(raw_data_path: str ="../data/raw/transaction.csv") -> pd.DataFrame:
    df_raw = pd.read_csv(raw_data_path,parse_dates=['contract_date'])
    df_ec = df_raw.loc[(df_raw['property_type']=='Executive Condominium')]
    df_ec['tenure_start'] = df_ec['tenure'].apply(lambda x: int(x.split(' ')[-1]))
    #print(df_ec['tenure_start'].unique())
    df_ec['year_after_lease_com'] = df_ec['contract_date'].dt.year - df_ec['tenure_start']
    df_ec['year_after_lease_com'] = df_ec['year_after_lease_com'].astype(int)
    feature_cols = ['contract_date','area','year_after_lease_com']
    target_cols = ['price']
    df_ec = df_ec[feature_cols+target_cols]
    n_bins = 5
    enc = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy='quantile')#'uniform', 'kmeans', 'quantile'
    df_ec['area'] = enc.fit_transform(df_ec['area'].values.reshape(-1,1)).reshape(-1,)
    df_mean = df_ec.groupby(feature_cols)['price'].mean()
    return df_mean.reset_index()

def prep_ec_data(
    df: pd.DataFrame, 
    area: int = 0, 
    year_after_lease_com: int = 1,
    ) -> pd.DataFrame:
    df_select = df[
        (df['area'] == area) &\
        (df['year_after_lease_com'] == year_after_lease_com)
    ].reset_index(drop=True)
    df_select['contract_date'] = pd.to_datetime(df_select['contract_date'])
    df_select.rename(columns= {'contract_date': 'ds', 'price': 'y'}, inplace=True)
    return df_select[['ds','y']].sort_values('ds', ascending=True) 

  

In [None]:
def train_predict(
    df: pd.DataFrame,
    train_fraction: float,
    seasonality: dict,
    model_path:str
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, int]:
    
    # grab split data
    train_index = int(train_fraction*df.shape[0])
    df_train = df.copy().iloc[0:train_index]
    df_test = df.copy().iloc[train_index:]

    #create Prophet model
    model=Prophet(
        yearly_seasonality=seasonality['yearly'],
        weekly_seasonality=seasonality['weekly'],
        daily_seasonality=seasonality['daily'],
        interval_width = 0.95
    )

    # train and predict

    model.fit(df_train)           
    #model = load_prophet_model(model_path)
    predicted = model.predict(df_test)
    y_pred = predicted.loc[:, 'yhat']
    rmse = root_mean_squared_error(df_test.y.values,y_pred)
    ame = mean_absolute_error(df_test.y.values,y_pred)
    save_prophet_model(model,model_path)


    return predicted, df_train, df_test, model, rmse,ame

In [None]:
def plot_prediction_result(df_select,df_test,predicted,fig_path):
    fig, ax = plt.subplots(1,1,figsize=(12,5))
    plt.plot(df_select.ds,df_select.y.values, label='Actual')
    plt.plot(df_test.ds,predicted.loc[:, 'yhat'], label='Predicted')
    plt.legend()
    plt.savefig(fig_path)
    mlflow.log_figure(fig, 'my_plot.png')

In [None]:

file_path = "../data/raw/transaction.csv"
if os.path.exists(file_path):
    logging.info('Dataset found, reading into pandas dataframe.')
    df = preprocess_ec_data(file_path)
else:
    logging.info('Dataset not found, downloading ...')
    fetch_raw_data(file_path)
    logging.info('Reading dataset into pandas dataframe.')
    df = preprocess_ec_data(file_path)

In [None]:
import os
from dotenv import load_dotenv
import mlflow
from mlflow.client import MlflowClient
from urllib.parse import urlparse

load_dotenv()

MLFLOW_TRACKING_URI=os.getenv('MLFLOW_TRACKING_URI')
MLFLOW_TRACKING_USERNAME=os.getenv('MLFLOW_TRACKING_USERNAME')
MLFLOW_TRACKING_PASSWORD=os.getenv('MLFLOW_TRACKING_PASSWORD')

In [None]:
import logging
log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 
logging.basicConfig(format = log_format, level = logging.INFO) 


mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI) 
logging.info("Defined MLFlowClient and set tracking URI.")

mlflow.set_experiment("prophet_models")
#mlflow.autolog()
import os
import mlflow

In [None]:
def get_experiment_id(name):
    exp = mlflow.get_experiment_by_name(name)
    if exp is None:
      exp_id = mlflow.create_experiment(name)
      return exp_id
    return exp.experiment_id


exp_id = get_experiment_id("prophet_models")
print(exp_id)

In [None]:
area_list = sorted(df['area'].unique())#[0:50] #for testing
year_after_lease_com_list = sorted(df['year_after_lease_com'].unique())

# Define the parameters for the Prophet model
seasonality = {
    'yearly': True,
    'weekly': False,
    'daily': False
}
start = time.time()
predictions = []
train_data = []
test_data = []
train_indices = []
metric_list = []
#mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
#experiment_id = mlflow.set_experiment("ec-price-experiment")

for area in area_list:
    area = int(area)
    for year_after_lease_com in year_after_lease_com_list[5:11]:
        model_name = f'model_area{area}_year{year_after_lease_com}'
        model_path = f'../models/model_area{area}_year{year_after_lease_com}.json'
        fig_path = f'../reports/figures/model_area{area}_year{year_after_lease_com}.png'
        df_select = prep_ec_data(df, area=area,year_after_lease_com=year_after_lease_com)#[['ds','y']]
        if len(df_select)>20:
            print('start to train')
            with mlflow.start_run(experiment_id=exp_id):
                mlflow.autolog()
        
                predicted, df_train, df_test, forecaster, rmse, mae = train_predict(
                    df = df_select,
                    train_fraction = 0.8,
                    seasonality=seasonality,
                    model_path=model_path
                )
                fig, ax = plt.subplots(1,1,figsize=(12,5))
                plt.plot(df_select.ds,df_select.y.values, label='Actual')
                plt.plot(df_test.ds,predicted.loc[:, 'yhat'], label='Predicted')
                plt.title(f'area: {area}, year_after_lease_com: {year_after_lease_com}')
                plt.legend()
                plt.savefig(fig_path)
                #mlflow.log_figure(fig, f'predict_area{area}_year{year_after_lease_com}.png')
                #mlflow.log_metrics({'rmse':rmse})
                metric_list.append({'area':area,'year_after_lease_com':year_after_lease_com,'rmse':rmse,'mae':mae})

                tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
                if tracking_url_type_store != "file":
                    # Register the model
                    # There are other ways to use the Model Registry, which depends on the use case,
                    # please refer to the doc for more information:
                    # https://mlflow.org/docs/latest/model-registry.html#api-workflow
                    mlflow.prophet.log_model(forecaster, artifact_path="model")#, registered_model_name=model_name
                else:
                    mlflow.prophet.log_model(forecaster, "model")        


                #mlflow.prophet.log_model(forecaster, artifact_path="model")
                #mlflow.prophet.autolog(registered_model_name="<model name>")
                logging.info("Logged model")
                
                mlflow.log_params(seasonality)
                mlflow.log_metrics(
                    {
                        'rmse': mean_squared_error(y_true=df_test['y'], y_pred=forecaster.predict(df_test)['yhat'], squared=False),
                        'mean_abs_perc_error': mean_absolute_percentage_error(y_true=df_test['y'], y_pred=forecaster.predict(df_test)['yhat']),
                        'mean_abs_error': mean_absolute_error(y_true=df_test['y'], y_pred=forecaster.predict(df_test)['yhat']),
                        'median_abs_error': median_absolute_error(y_true=df_test['y'], y_pred=forecaster.predict(df_test)['yhat'])
                    }
                )
    else:
        print('not enough data')




df_metric = pd.DataFrame(metric_list)
df_metric.to_csv('../reports/testing_metric.csv')

In [None]:
area

In [None]:
year_after_lease_com

In [None]:
len(df_select)

In [None]:
from pathlib import Path

# prefix components:
space =  '    '
branch = '│   '
# pointers:
tee =    '├── '
last =   '└── '


def tree(dir_path: Path, prefix: str=''):
    """A recursive generator, given a directory Path object
    will yield a visual tree structure line by line
    with each line prefixed by the same characters
    """    
    contents = list(dir_path.iterdir())
    # contents each get pointers that are ├── with a final └── :
    pointers = [tee] * (len(contents) - 1) + [last]
    for pointer, path in zip(pointers, contents):
        yield prefix + pointer + path.name
        if path.is_dir(): # extend the prefix and recurse:
            extension = branch if pointer == tee else space 
            # i.e. space because last, └── , above so no more |
            yield from tree(path, prefix=prefix+extension)

In [1]:
import os

def list_files(startpath):
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))

In [None]:
list_files('/Users/mac-zhou/Desktop/Yixin/ec-price-prediction')

In [4]:
from pathlib import Path

# prefix components:
space =  '    '
branch = '│   '
# pointers:
tee =    '├── '
last =   '└── '


def tree(dir_path: Path, prefix: str=''):
    """A recursive generator, given a directory Path object
    will yield a visual tree structure line by line
    with each line prefixed by the same characters
    """    
    contents = list(dir_path.iterdir())
    # contents each get pointers that are ├── with a final └── :
    pointers = [tee] * (len(contents) - 1) + [last]
    for pointer, path in zip(pointers, contents):
        yield prefix + pointer + path.name
        if path.is_dir(): # extend the prefix and recurse:
            extension = branch if pointer == tee else space 
            # i.e. space because last, └── , above so no more |
            yield from tree(path, prefix=prefix+extension)

In [5]:
folder_path = '/Users/mac-zhou/Desktop/Yixin/ec-price-prediction'
tree(folder_path)

<generator object tree at 0x7fc3d29e3220>

In [7]:
for line in tree(Path.home() / folder_path):
    print(line)

├── research
│   ├── .DS_Store
│   ├── 01-data-store.ipynb
│   ├── 04-model-training.ipynb
│   ├── train_forecasters_mlflow.ipynb
│   ├── 03-EDA.ipynb
│   └── 02-data-ingestion.ipynb
├── .DS_Store
├── app
│   ├── .DS_Store
│   ├── requirements.txt
│   ├── Dockerfile
│   ├── models
│   │   ├── model_area1_year6.json
│   │   ├── KBinsDiscretizer.save
│   │   ├── model_area2_year9.json
│   │   ├── model_area4_year10.json
│   │   ├── model_area1_year11.json
│   │   ├── model_area1_year10.json
│   │   ├── model_area4_year11.json
│   │   ├── model_area2_year8.json
│   │   ├── model_area1_year7.json
│   │   ├── model_area0_year9.json
│   │   ├── model_area2_year10.json
│   │   ├── model_area2_year11.json
│   │   ├── model_area0_year8.json
│   │   ├── model_area3_year8.json
│   │   ├── model_area0_year7.json
│   │   ├── model_area0_year10.json
│   │   ├── model_area0_year11.json
│   │   ├── model_area0_year6.json
│   │   ├── model_area3_year9.json
│   │   ├── model_area4_year9.json
│   │   ├──