In [1]:
import os
import tempfile
import mlflow
import click
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [68]:
# Load the January 2022 resale transactions.
data = pd.read_csv("data/resale-flat-prices-2022-jan.csv")

# Scratch directory and target paths for the train/validation/test splits.
tmpdir = tempfile.mkdtemp()
train_output_path = os.path.join(tmpdir, "train.csv")
validation_output_path = os.path.join(tmpdir, "validation.csv")
test_output_path = os.path.join(tmpdir, "test.csv")

# Target (`resale_price`) plus the features kept for modelling.
columns = [
    "resale_price",
    "town",
    "flat_type",
    "storey_range",
    "floor_area_sqm",
    "flat_model",
    "lease_commence_date",
    "remaining_lease",
]
# Explicit copy so the column assignments below never write into a view of
# the raw frame.
data = data[columns].copy()

# Collapse every maisonette variant into the single label "Maisonette".
# Restricted to `flat_model` (the only column that carries flat models) and
# without the stray "foo" pattern, which would have rewritten any value
# containing "foo" anywhere in the frame.
data["flat_model"] = data["flat_model"].replace(
    regex=r".*[mM]aisonette.*", value="Maisonette"
)

# Keep only the whole-year part of the remaining lease text.
# `expand=False` returns a Series rather than a one-column DataFrame.
# A value without a "<n> years" part becomes NaN and makes `astype` raise.
data["remaining_lease"] = (
    data["remaining_lease"]
    .str.extract(r"(\d+)(?= years)", expand=False)
    .astype("int16")
)

print("Label encoding categorical columns - flat_type")
flat_type_map = {
    "1 ROOM": 0,
    "2 ROOM": 1,
    "3 ROOM": 2,
    "4 ROOM": 3,
    "5 ROOM": 4,
    "MULTI-GENERATION": 5,
    "EXECUTIVE": 6,
}
# `Series.map` gives a numeric column directly. `DataFrame.replace` with a
# str -> int dict relies on implicit object downcasting and silently leaves
# unknown labels in place as strings.
flat_type_codes = data["flat_type"].map(flat_type_map)
unmapped_flat_types = data.loc[flat_type_codes.isna(), "flat_type"].unique()
if len(unmapped_flat_types) > 0:
    raise ValueError(
        f"flat_type values missing from flat_type_map: "
        f"{sorted(map(str, unmapped_flat_types))}"
    )
data["flat_type"] = flat_type_codes.astype("int64")
# TODO: save mappings as artifacts!!!

print("Label encoding categorical columns - storey_range")
# LabelEncoder assigns codes in sorted order of the distinct labels; the
# fitted encoder is kept so the mapping can be inspected or inverted later.
storey_range_le = LabelEncoder()
data["storey_range"] = storey_range_le.fit_transform(data["storey_range"])



Label encoding categorical columns - flat_type
Label encoding categorical columns - storey_range


In [69]:
def onehotencode(df, col):
    """One-hot encode a single column and return the widened frame.

    The encoder is fitted on the column's values with ``drop="first"``, so the
    first category gets no indicator column and is represented by all zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the column to encode.

    Returns
    -------
    df1 : pandas.DataFrame
        Copy of ``df`` without ``col`` and with one indicator column per
        retained category appended on the right.
    ohe_features : list of str
        Names of the appended indicator columns.
    categories : numpy.ndarray
        All categories seen during fit, including the dropped first one.
    """
    ohe = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
    encoded = ohe.fit_transform(df[col].values.reshape(-1, 1))

    # The encoder is fitted on a bare array, so the generated names carry the
    # generic "x0_" prefix. Strip it only as a prefix, so a category that
    # happens to contain "x0_" is not mangled.
    prefix = "x0_"
    ohe_features = [
        name[len(prefix):] if name.startswith(prefix) else name
        for name in ohe.get_feature_names_out()
    ]

    # Reuse the input index: concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would misalign (and introduce NaN rows) whenever
    # `df` has been filtered, shuffled, or otherwise re-indexed.
    ohe_df = pd.DataFrame(encoded, columns=ohe_features, index=df.index)
    categories = ohe.categories_[0]

    df1 = pd.concat([df.drop(columns=col), ohe_df], axis=1)
    return df1, ohe_features, categories

In [70]:
# Encode `town`: t1 is the widened frame, t2 the new indicator column names,
# t3 the array of categories reported by the fitted encoder.
t1, t2, t3 = onehotencode(data, 'town')

In [71]:
# NOTE(review): onehotencode drops the encoded column, yet the saved output
# below still lists `town` — the output looks stale; re-run to confirm.
t1

Unnamed: 0,resale_price,town,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,BEDOK,BISHAN,...,PASIR RIS,PUNGGOL,QUEENSTOWN,SEMBAWANG,SENGKANG,SERANGOON,TAMPINES,TOA PAYOH,WOODLANDS,YISHUN
0,245000.0,ANG MO KIO,1,2,44.0,Improved,1977,54,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,358000.0,ANG MO KIO,2,2,73.0,New Generation,1977,54,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,355000.0,ANG MO KIO,2,2,67.0,New Generation,1978,55,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,338000.0,ANG MO KIO,2,2,68.0,New Generation,1981,58,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,420000.0,ANG MO KIO,2,2,82.0,New Generation,1980,57,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2437,750000.0,YISHUN,6,0,142.0,Apartment,1988,65,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2438,709000.0,YISHUN,6,3,146.0,Maisonette,1988,65,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2439,830000.0,YISHUN,6,2,154.0,Maisonette,1988,65,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2440,800000.0,YISHUN,6,0,159.0,Apartment,1992,69,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [58]:
def _fit_onehot(values):
    """Fit a drop-first one-hot encoder on a Series.

    Returns the fitted encoder, a frame of indicator columns indexed like
    ``values``, and the list of indicator column names.
    """
    encoder = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)
    encoded = encoder.fit_transform(values.values.reshape(-1, 1))
    # Fitted on a bare array, so names come back as "x0_<category>"; strip the
    # marker only when it is a prefix so categories containing it stay intact.
    prefix = "x0_"
    features = [
        name[len(prefix):] if name.startswith(prefix) else name
        for name in encoder.get_feature_names_out()
    ]
    frame = pd.DataFrame(encoded, columns=features, index=values.index)
    return encoder, frame, features


# Same recipe for both nominal columns; the fitted encoders are kept so their
# feature names and categories can be inspected afterwards.
town_ohe, town_data, town_ohe_features = _fit_onehot(data["town"])
flat_model_ohe, flat_model_data, flat_model_ohe_features = _fit_onehot(data["flat_model"])
# The source columns are left in `data` for now:
# data.drop(["town", "flat_model"], axis=1, inplace=True)


In [59]:
# `town` and `flat_model` are still present because the drop in the encoding
# cell above is commented out.
data

Unnamed: 0,resale_price,town,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease
0,245000.0,ANG MO KIO,1,2,44.0,Improved,1977,54
1,358000.0,ANG MO KIO,2,2,73.0,New Generation,1977,54
2,355000.0,ANG MO KIO,2,2,67.0,New Generation,1978,55
3,338000.0,ANG MO KIO,2,2,68.0,New Generation,1981,58
4,420000.0,ANG MO KIO,2,2,82.0,New Generation,1980,57
...,...,...,...,...,...,...,...,...
2437,750000.0,YISHUN,6,0,142.0,Apartment,1988,65
2438,709000.0,YISHUN,6,3,146.0,Maisonette,1988,65
2439,830000.0,YISHUN,6,2,154.0,Maisonette,1988,65
2440,800000.0,YISHUN,6,0,159.0,Apartment,1992,69


In [19]:
# Indicator names from the fitted town encoder; they carry the generic `x0_`
# prefix because the encoder was fitted on a bare NumPy array.
town_ohe.get_feature_names_out()

array(['x0_BEDOK', 'x0_BISHAN', 'x0_BUKIT BATOK', 'x0_BUKIT MERAH',
       'x0_BUKIT PANJANG', 'x0_BUKIT TIMAH', 'x0_CENTRAL AREA',
       'x0_CHOA CHU KANG', 'x0_CLEMENTI', 'x0_GEYLANG', 'x0_HOUGANG',
       'x0_JURONG EAST', 'x0_JURONG WEST', 'x0_KALLANG/WHAMPOA',
       'x0_MARINE PARADE', 'x0_PASIR RIS', 'x0_PUNGGOL', 'x0_QUEENSTOWN',
       'x0_SEMBAWANG', 'x0_SENGKANG', 'x0_SERANGOON', 'x0_TAMPINES',
       'x0_TOA PAYOH', 'x0_WOODLANDS', 'x0_YISHUN'], dtype=object)

In [40]:
# All town categories seen during fit; the first one has no indicator column
# because the encoder was created with drop="first".
town_ohe.categories_

[array(['ANG MO KIO', 'BEDOK', 'BISHAN', 'BUKIT BATOK', 'BUKIT MERAH',
        'BUKIT PANJANG', 'BUKIT TIMAH', 'CENTRAL AREA', 'CHOA CHU KANG',
        'CLEMENTI', 'GEYLANG', 'HOUGANG', 'JURONG EAST', 'JURONG WEST',
        'KALLANG/WHAMPOA', 'MARINE PARADE', 'PASIR RIS', 'PUNGGOL',
        'QUEENSTOWN', 'SEMBAWANG', 'SENGKANG', 'SERANGOON', 'TAMPINES',
        'TOA PAYOH', 'WOODLANDS', 'YISHUN'], dtype=object)]

In [20]:
# Indicator names from the fitted flat_model encoder (same `x0_` prefix as the
# town encoder, since it was also fitted on a bare NumPy array).
flat_model_ohe.get_feature_names_out()

array(['x0_Apartment', 'x0_DBSS', 'x0_Improved', 'x0_Maisonette',
       'x0_Model A', 'x0_Model A2', 'x0_Multi Generation',
       'x0_New Generation', 'x0_Premium Apartment',
       'x0_Premium Apartment Loft', 'x0_Simplified', 'x0_Standard',
       'x0_Type S1', 'x0_Type S2'], dtype=object)

In [22]:
# All flat_model categories seen during fit; the first one has no indicator
# column because the encoder was created with drop="first".
flat_model_ohe.categories_

[array(['Adjoined flat', 'Apartment', 'DBSS', 'Improved', 'Maisonette',
        'Model A', 'Model A2', 'Multi Generation', 'New Generation',
        'Premium Apartment', 'Premium Apartment Loft', 'Simplified',
        'Standard', 'Type S1', 'Type S2'], dtype=object)]

In [13]:
# NOTE(review): exploratory introspection of the fitted encoder's attributes;
# looks like a debugging leftover — consider removing before sharing.
dir(town_ohe)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_X',
 '_check_feature_names',
 '_check_infrequent_enabled',
 '_check_n_features',
 '_compute_n_features_outs',
 '_compute_transformed_categories',
 '_drop_idx_after_grouping',
 '_fit',
 '_fit_infrequent_category_mapping',
 '_get_param_names',
 '_get_tags',
 '_identify_infrequent',
 '_infrequent_enabled',
 '_map_drop_idx_to_infrequent',
 '_map_infrequent_categories',
 '_more_tags',
 '_n_features_outs',
 '_parameter_constraints',
 '_remove_dropped_categories',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_set_drop_idx',
 '_sklearn_auto_wrap_output_

In [12]:
# NOTE(review): the saved output below shows integer column labels, whereas
# the encoding cell renames the columns to town names — this output appears
# to predate that renaming; re-run to confirm.
town_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
