# Data cleaning and feature creation
Often writes data to data/processed or data/interim

In [1]:
import polars as pl
import os

DATA_PATH = "../data/raw"

In [24]:
# Load data
file_names = [x for x in os.listdir(DATA_PATH) if ".csv" in x]  # List all CSV files in the raw data directory

df_list = []
for file in file_names:
    print("Loading file:", file)
    df = pl.scan_csv(os.path.join(DATA_PATH, file), infer_schema=False)
    df_list.append(df)
print("Loaded", len(df_list), "files.")

Loading file: ResaleFlatPricesBasedonRegistrationDateFromJan2015toDec2016.csv
Loading file: ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv
Loading file: ResaleFlatPricesBasedonRegistrationDateFromMar2012toDec2014.csv
Loaded 3 files.


In [36]:
# Combine lazyframes into a single lazyframe
df_combined = pl.concat(df_list, how="align_full")
df_combined_col = df_combined.collect()

In [37]:
# Convert datatype
df_combined_col.head()

month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
str,str,str,str,str,str,str,str,str,str,str
"""2012-03""","""ANG MO KIO""","""2 ROOM""","""172""","""ANG MO KIO AVE 4""","""06 TO 10""","""45""","""Improved""","""1986""",,"""250000"""
"""2012-03""","""ANG MO KIO""","""2 ROOM""","""510""","""ANG MO KIO AVE 8""","""01 TO 05""","""44""","""Improved""","""1980""",,"""265000"""
"""2012-03""","""ANG MO KIO""","""3 ROOM""","""103""","""ANG MO KIO AVE 3""","""06 TO 10""","""73""","""New Generation""","""1978""",,"""368000"""
"""2012-03""","""ANG MO KIO""","""3 ROOM""","""110""","""ANG MO KIO AVE 4""","""01 TO 05""","""67""","""New Generation""","""1978""",,"""323000"""
"""2012-03""","""ANG MO KIO""","""3 ROOM""","""114""","""ANG MO KIO AVE 4""","""01 TO 05""","""73""","""New Generation""","""1978""",,"""339000"""


In [38]:
# Convert variables to appropriate types
df_combined_col = df_combined_col.with_columns(
    pl.col("resale_price").cast(pl.Float64),
    pl.col("lease_commence_date").cast(pl.Int64),
    pl.col("floor_area_sqm").cast(pl.Float64),
    pl.col("month").str.to_date("%Y-%m")
)
df_combined_col.head()

month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
date,str,str,str,str,str,f64,str,i64,str,f64
2012-03-01,"""ANG MO KIO""","""2 ROOM""","""172""","""ANG MO KIO AVE 4""","""06 TO 10""",45.0,"""Improved""",1986,,250000.0
2012-03-01,"""ANG MO KIO""","""2 ROOM""","""510""","""ANG MO KIO AVE 8""","""01 TO 05""",44.0,"""Improved""",1980,,265000.0
2012-03-01,"""ANG MO KIO""","""3 ROOM""","""103""","""ANG MO KIO AVE 3""","""06 TO 10""",73.0,"""New Generation""",1978,,368000.0
2012-03-01,"""ANG MO KIO""","""3 ROOM""","""110""","""ANG MO KIO AVE 4""","""01 TO 05""",67.0,"""New Generation""",1978,,323000.0
2012-03-01,"""ANG MO KIO""","""3 ROOM""","""114""","""ANG MO KIO AVE 4""","""01 TO 05""",73.0,"""New Generation""",1978,,339000.0


In [52]:
# Check for nulls
null_counts = df_combined_col.null_count()
print("Null counts in each column:")
print(null_counts)

Null counts in each column:
shape: (1, 13)
┌───────┬──────┬───────────┬───────┬───┬──────────────┬──────────────┬──────────────┬──────────────┐
│ month ┆ town ┆ flat_type ┆ block ┆ … ┆ remaining_le ┆ resale_price ┆ remaining_le ┆ days_from_ea │
│ ---   ┆ ---  ┆ ---       ┆ ---   ┆   ┆ ase          ┆ ---          ┆ ase_yrs      ┆ rliest_data  │
│ u32   ┆ u32  ┆ u32       ┆ u32   ┆   ┆ ---          ┆ u32          ┆ ---          ┆ ---          │
│       ┆      ┆           ┆       ┆   ┆ u32          ┆              ┆ u32          ┆ u32          │
╞═══════╪══════╪═══════════╪═══════╪═══╪══════════════╪══════════════╪══════════════╪══════════════╡
│ 0     ┆ 0    ┆ 0         ┆ 0     ┆ … ┆ 265657       ┆ 0            ┆ 0            ┆ 0            │
└───────┴──────┴───────────┴───────┴───┴──────────────┴──────────────┴──────────────┴──────────────┘


In [51]:
# Calculate months from earliest month in dataset (Mar-2012)
df_combined_col = df_combined_col.with_columns(
    remaining_lease_yrs = (pl.col("month").dt.year() - pl.col("lease_commence_date")),
    days_from_earliest_data = (pl.col('month') - pl.date(2012, 3, 1)).dt.total_days()
)

In [53]:
# Remaining lease can be dropped and recalculated on the assumption that each lease lasts for 99 years.
df_combined_col = df_combined_col.drop("remaining_lease")

In [62]:
# Checking unique values of towns
pl.Config.set_tbl_rows(100)
df_combined_col.select(pl.col("town").unique())

town
str
"""HOUGANG"""
"""JURONG WEST"""
"""JURONG EAST"""
"""TOA PAYOH"""
"""SEMBAWANG"""
"""BEDOK"""
"""BUKIT BATOK"""
"""PUNGGOL"""
"""BUKIT TIMAH"""
"""KALLANG/WHAMPOA"""


In [65]:
# Check flat type
df_combined_col.select(pl.col("flat_type").unique())
df_combined_col.select(pl.col("block").unique())
df_combined_col.select(pl.col("flat_model").unique())

flat_model
str
"""Type S1"""
"""Model A"""
"""Multi Generation"""
"""Model A2"""
"""Premium Apartment Loft"""
"""Simplified"""
"""Premium Maisonette"""
"""Standard"""
"""2-room"""
"""Premium Apartment"""


In [None]:
# Check flat model futher

In [None]:
# Check flat type
df_combined_col.select(pl.col("flat_type").unique())

In [61]:
# Checking unique values of storey blocks
pl.Config.set_tbl_rows(100)
df_combined_col.select(pl.col("storey_range").unique())

storey_range
str
"""36 TO 40"""
"""16 TO 20"""
"""49 TO 51"""
"""31 TO 35"""
"""01 TO 03"""
"""19 TO 21"""
"""31 TO 33"""
"""21 TO 25"""
"""10 TO 12"""
"""01 TO 05"""


In [None]:
# Create min max value for stories, then group every 15 storeys since a part of the data groups every 5 floors instead of 3 floors



In [None]:
# Split into train (2017-2022), test(2023) and deploy sets (2024)
train = df_combined_col.filter(df_combined_col['date'].dt.year().is_between(2017, 2022)).write_parquet("../data/processed/train.parquet")
test = df_combined_col.filter(df_combined_col['date'].dt.year() == 2023).write_parquet("../data/processed/test.parquet")
deploy = df_combined_col.filter(df_combined_col['date'].dt.year() == 2024).write_parquet("../data/processed/deploy.parquet")