Data obtained from: https://data.gov.sg/datasets?agencies=NEA&page=1&query=rain&resultId=2279
Coverage: full calendar years from 2017 to 2024 (inclusive).

In [1]:
import polars as pl
import os

# Directory that holds the raw CSV files read by this notebook. The path is
# relative, so it is resolved against the notebook's working directory.
DATA_PATH = "../data/raw"

In [2]:
# Load data
# Collect every CSV file in the raw data directory. The match is on the file
# extension (a substring test would also accept names such as "x.csv.bak"),
# and the names are sorted because os.listdir returns entries in arbitrary
# order, which would make the row order of the combined frame vary per run.
file_names = sorted(x for x in os.listdir(DATA_PATH) if x.endswith(".csv"))
if not file_names:
    # Fail here with a clear message rather than later when concatenating
    # an empty list of frames.
    raise FileNotFoundError(f"No CSV files found in {DATA_PATH!r}")

# One LazyFrame per file; they are combined and collected in the next step.
df_list = [pl.scan_csv(os.path.join(DATA_PATH, file)) for file in file_names]

In [3]:
# Combine the per-file LazyFrames into a single LazyFrame by stacking them
# vertically (row-wise), then collect it into an eager DataFrame that the
# rest of the notebook works on.
df_combined = pl.concat(df_list, how="vertical")
df_combined_col = df_combined.collect()


In [4]:
# Profile: preview the first five rows to inspect the columns and their dtypes
df_combined_col.head(5)

date,timestamp,update_timestamp,station_id,station_name,station_device_id,location_longitude,location_latitude,reading_update_timestamp,reading_value,reading_type,reading_unit
str,str,str,str,str,str,f64,f64,str,f64,str,str
"""2020-01-01""","""2020-01-01T00:00:00+08:00""","""2020-01-05T17:47:38+08:00""","""S105""","""Admiralty Road West""","""S105""",103.79525,1.45817,"""2020-01-05T17:47:38+08:00""",0.0,"""TB1 Rainfall 5 Minute Total F""","""mm"""
"""2020-01-01""","""2020-01-01T00:00:00+08:00""","""2020-01-05T17:47:38+08:00""","""S77""","""Alexandra Road""","""S77""",103.8125,1.2937,"""2020-01-05T17:47:38+08:00""",0.0,"""TB1 Rainfall 5 Minute Total F""","""mm"""
"""2020-01-01""","""2020-01-01T00:00:00+08:00""","""2020-01-05T17:47:38+08:00""","""S109""","""Ang Mo Kio Avenue 5""","""S109""",103.8492,1.3764,"""2020-01-05T17:47:38+08:00""",0.0,"""TB1 Rainfall 5 Minute Total F""","""mm"""
"""2020-01-01""","""2020-01-01T00:00:00+08:00""","""2020-01-05T17:47:38+08:00""","""S117""","""Banyan Road""","""S117""",103.679,1.256,"""2020-01-05T17:47:38+08:00""",0.0,"""TB1 Rainfall 5 Minute Total F""","""mm"""
"""2020-01-01""","""2020-01-01T00:00:00+08:00""","""2020-01-05T17:47:38+08:00""","""S64""","""Bukit Panjang Road""","""S64""",103.7603,1.3824,"""2020-01-05T17:47:38+08:00""",0.0,"""TB1 Rainfall 5 Minute Total F""","""mm"""


In [5]:
# Distinct reading units, sorted - exactly one is expected
df_combined_col.get_column("reading_unit").unique().sort().to_list()

['mm']

In [6]:
# Distinct reading types, sorted - exactly one is expected
df_combined_col.get_column("reading_type").unique().sort().to_list()

['TB1 Rainfall 5 Minute Total F']

In [7]:
# Distinct station names (listed in no particular order)
df_combined_col.get_column("station_name").unique().to_list()

['Ang Mo Kio Avenue 10',
 'Clementi Park',
 'Hougang Avenue 1',
 'S222',
 'Scotts Road',
 'Choa Chu Kang Avenue 4',
 'S209',
 'Pulau Ubin',
 'Woodlands Avenue 9',
 'S208',
 'Upper Peirce Reservoir Park',
 'Buangkok Green',
 'Pasir Ris Drive 12',
 'S202',
 'Yio Chu Kang Road',
 'Semakau Landfill',
 'Ang Mo Kio Ave 10',
 'S226',
 'Yishun Ring Road',
 'Upper Serangoon Road',
 'Marina Gardens Drive',
 'Bukit Panjang Road',
 'Bishan Street 13',
 'Bukit Timah Road',
 'Bedok Road',
 'Clementi Road',
 'S227',
 'Poole Road',
 'Sime Road',
 'Woodlands Road',
 'Tuas Road',
 'S220',
 'Old Toh Tuck Road',
 'Henderson Road',
 'Jurong West Street 42',
 'S230',
 'Kim Chuan Road',
 'Holland Road',
 'Banyan Road',
 'S213',
 'Woodlands Centre',
 'Pasir Ris Street 51',
 'Towner Road',
 'Sentosa',
 'Dairy Farm Road',
 'Airport Boulevard',
 'Tuas South Avenue 3',
 'Alexandra Road',
 'West Coast Road',
 'East Coast Parkway',
 'Lim Chu Kang Road',
 'GEYLANG EAST CENTRAL',
 'S216',
 'Old Choa Chu Kang Road',
 

In [8]:
# Stations whose ID starts with "S2": number of rows for each combination of
# ID, name and coordinates, to see which IDs appear under more than one name.
(
    df_combined_col
    .filter(pl.col("station_id").str.starts_with("S2"))
    .group_by("station_id", "station_name", "location_longitude", "location_latitude")
    .agg(pl.count("date"))
    .sort("station_id")
)

station_id,station_name,location_longitude,location_latitude,date
str,str,f64,f64,u32
"""S201""","""Clementi Park""",103.76714,1.32311,47649
"""S201""","""S201""",103.76714,1.32311,422634
"""S202""","""S202""",103.7578,1.30968,276164
"""S203""","""S203""",103.7702,1.29164,417506
"""S203""","""Pasir Panjang""",103.7702,1.29164,42562
…,…,…,…,…
"""S230""","""S230""",103.76444,1.30167,421526
"""S230""","""West Coast Road""",103.76444,1.30167,47652
"""S24""","""Upper Changi Road North""",103.9826,1.3678,798983
"""S24B""","""Upper Changi Road North""",103.998,1.3678,32648


In [9]:
# Stations whose *name* starts with "S2": number of rows per (name, ID) pair
(
    df_combined_col
    .filter(pl.col("station_name").str.starts_with("S2"))
    .group_by("station_name", "station_id")
    .agg(pl.count("date"))
    .sort("station_name")
)

station_name,station_id,date
str,str,u32
"""S201""","""S201""",422634
"""S202""","""S202""",276164
"""S203""","""S203""",417506
"""S204""","""S204""",345437
"""S205""","""S205""",315754
…,…,…
"""S226""","""S226""",419807
"""S227""","""S227""",421660
"""S228""","""S228""",420329
"""S229""","""S229""",422397


In [62]:
# Station names are redundant with station IDs, so list the distinct IDs (sorted)
df_combined_col.get_column("station_id").unique().sort().to_list()

station_id
str
"""S06"""
"""S07"""
"""S08"""
"""S100"""
"""S101"""
…
"""S91"""
"""S92"""
"""S94"""
"""S96"""


In [65]:
# Number of rows with a date per station ID (the count column keeps the name "date")
(
    df_combined_col
    .group_by("station_id")
    .agg(pl.count("date"))
    .sort("station_id")
)

station_id,date
str,u32
"""S06""",40141
"""S07""",554703
"""S08""",787572
"""S100""",621234
"""S101""",169959
…,…
"""S91""",346468
"""S92""",395407
"""S94""",800299
"""S96""",30633


In [83]:
# Number of distinct dates per station. Complete coverage of 2017-2024 would
# be 2922 dates (8 x 365 days plus the leap days of 2020 and 2024).
# The aggregation is computed once and reused for both views below.
station_date_counts = (
    df_combined_col.group_by("station_id")
    .agg(pl.col("date").n_unique().alias("date_unique_count"))
    .sort("date_unique_count", descending=True)
)

# A notebook cell only renders its last expression, so the top-20 table has to
# be displayed explicitly or it is computed and then thrown away.
# `display` is provided by IPython/Jupyter.
display(station_date_counts.head(20))

# How many stations have at least 2500 distinct dates
station_date_counts.filter(pl.col("date_unique_count") >= 2500).count()  # 34


station_id,date_unique_count
u32,u32
34,34


Scoping begins here, along with feature engineering

In [94]:
# Scope: continue only with the station IDs that have at least 2500 distinct dates
dates_per_station = df_combined_col.group_by("station_id").agg(
    pl.col("date").n_unique().alias("date_unique_count")
)
selected_stations_list = (
    dates_per_station
    .sort("date_unique_count", descending=True)
    .filter(pl.col("date_unique_count") >= 2500)
    .get_column("station_id")
    .to_list()
)

In [101]:
# Daily minimum and maximum rainfall reading for each selected station
in_scope = pl.col("station_id").is_in(selected_stations_list)
df_agg = (
    df_combined_col
    .filter(in_scope)
    .group_by("station_id", "date")
    .agg(
        pl.col("reading_value").min().alias("min_rainfall"),
        pl.col("reading_value").max().alias("max_rainfall"),
    )
    .sort(["station_id", "date"])
)
df_agg.head()

station_id,date,min_rainfall,max_rainfall
str,str,f64,f64
"""S08""","""2017-01-01""",0.0,0.4
"""S08""","""2017-01-02""",0.0,0.6
"""S08""","""2017-01-03""",0.0,0.2
"""S08""","""2017-01-04""",0.0,0.4
"""S08""","""2017-01-05""",0.0,0.4


In [102]:
# First and last recorded date for each station
date_span_per_station = df_agg.group_by("station_id").agg(
    pl.col("date").min().alias("min_date"),
    pl.col("date").max().alias("max_date"),
)
# Raise the row limit so every station is printed
with pl.Config(tbl_rows=50):
    print(date_span_per_station)

shape: (34, 3)
┌────────────┬────────────┬────────────┐
│ station_id ┆ min_date   ┆ max_date   │
│ ---        ┆ ---        ┆ ---        │
│ str        ┆ str        ┆ str        │
╞════════════╪════════════╪════════════╡
│ S08        ┆ 2017-01-01 ┆ 2024-12-31 │
│ S104       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S107       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S109       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S111       ┆ 2017-08-31 ┆ 2024-12-31 │
│ S112       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S113       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S114       ┆ 2017-01-01 ┆ 2024-11-20 │
│ S115       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S116       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S119       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S120       ┆ 2017-01-01 ┆ 2024-05-02 │
│ S121       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S123       ┆ 2017-01-01 ┆ 2024-12-31 │
│ S24        ┆ 2017-01-01 ┆ 2024-12-31 │
│ S33        ┆ 2017-01-01 ┆ 2024-12-31 │
│ S35        ┆ 2017-01-01 ┆ 2024-12-31 │
│ S40        ┆ 2017-01-01 ┆ 2024-12-31 │
│ S43        ┆ 2017-01-01 ┆ 2024-12-31 │
│

In [103]:
# Keep only the stations whose records start on 2017-01-01 and end on 2024-12-31
starts_on_first_day = pl.col("min_date") == "2017-01-01"
ends_on_last_day = pl.col("max_date") == "2024-12-31"

df_min_max_date = (
    df_agg.group_by("station_id")
    .agg(
        pl.col("date").min().alias("min_date"),
        pl.col("date").max().alias("max_date"),
        pl.col("date").count().alias("date_count"),
    )
    .filter(starts_on_first_day & ends_on_last_day)
)

with pl.Config(tbl_rows=50):
    print(df_min_max_date)  # 30 left, some dates are missing!

shape: (30, 4)
┌────────────┬────────────┬────────────┬────────────┐
│ station_id ┆ min_date   ┆ max_date   ┆ date_count │
│ ---        ┆ ---        ┆ ---        ┆ ---        │
│ str        ┆ str        ┆ str        ┆ u32        │
╞════════════╪════════════╪════════════╪════════════╡
│ S08        ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2803       │
│ S104       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2728       │
│ S107       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2842       │
│ S109       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2701       │
│ S112       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2796       │
│ S113       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2840       │
│ S115       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2791       │
│ S116       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2787       │
│ S119       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2854       │
│ S121       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2793       │
│ S123       ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2767       │
│ S24        ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2835       │
│ S33        ┆ 2017-01-01 ┆ 2024-12-31 ┆ 2794       │
│ S35        

In [104]:
# Restrict the daily aggregates to the stations retained in df_min_max_date,
# i.e. those whose first and last dates are 2017-01-01 and 2024-12-31
full_span_station_ids = df_min_max_date.get_column("station_id").to_list()
df_agg = df_agg.filter(pl.col("station_id").is_in(full_span_station_ids))

In [105]:
# Need to fill in the missing dates for each station.
#
# Step 1: cast the string `date` column to a Date dtype and flag the column
# as sorted before upsampling.
# NOTE(review): with strict=False, values that fail to convert presumably
# become null instead of raising - confirm no dates are lost here.
# NOTE(review): set_sorted presumably only marks the column as sorted without
# reordering it; df_agg was sorted by station_id and date when it was built,
# so dates are ascending within each station but not across the whole frame.
df_agg = df_agg.with_columns(pl.col("date").cast(pl.Date, strict=False)).set_sorted("date")
# Step 2: upsample to a daily frequency within each station, then forward-fill
# every column so the rows added for missing days take the previous row's values.
# NOTE(review): the forward fill is applied over the whole frame, not per
# station, and missing days inherit the previous available day's min/max
# rainfall - confirm both are intended.
df_agg_filled = df_agg.upsample(
    time_column="date", every="1d", group_by="station_id", maintain_order=True
).select(pl.all().forward_fill())



In [106]:
# Binary label: y = 1 when the station's maximum reading for the day exceeds 0.01, else 0
rain_label = pl.when(pl.col("max_rainfall") > 0.01).then(1).otherwise(0)
df_agg_filled = df_agg_filled.with_columns(rain_label.alias("y"))
df_agg_filled.shape

(87660, 5)

In [107]:
# Share of rain days per station. The aggregation is computed once and reused
# for both views below.
rain_days_by_station = (
    df_agg_filled.group_by("station_id")
    .agg(
        pl.col("y").mean().alias("rain_days_pct"),
        pl.col("y").sum().alias("rain_days_count"),
        pl.col("y").count().alias("total_days_count"),
    )
    .sort("rain_days_pct", descending=True)
)

# A notebook cell only renders its last expression, so the per-station table
# has to be displayed explicitly or it is computed and then thrown away.
# `display` is provided by IPython/Jupyter.
display(rain_days_by_station.head(20))  # looks to be around 50%, pretty good

# Distribution of the rain-day share across stations
rain_days_by_station.select("station_id", "rain_days_pct").describe()  # Ranges from 46% to 58%. Quite nice

statistic,station_id,rain_days_pct
str,str,f64
"""count""","""30""",30.0
"""null_count""","""0""",0.0
"""mean""",,0.531736
"""std""",,0.028909
"""min""","""S08""",0.46167
"""25%""",,0.512663
"""50%""",,0.528063
"""75%""",,0.551677
"""max""","""S94""",0.585558


In [112]:
# Joining in the station names.
# Build a lookup with exactly one row per station_id, so the left join below
# cannot multiply rows when a station appears under several names or coordinates.
df_stations = (
    df_combined_col.select("station_id", "station_name", "location_longitude", "location_latitude")
    .unique()  # shrink to the distinct combinations before any further work
    # Drop placeholder names that are just an ID such as "S201". The pattern is
    # anchored at the start so only names beginning with "S<digit>" are removed.
    .filter(~pl.col("station_name").str.contains(r"^S\d", literal=False))
    # Sort on every column so that the row kept per station is deterministic.
    .sort("station_id", "station_name", "location_longitude", "location_latitude")
    .unique(subset="station_id", keep="first", maintain_order=True)
)

df_agg_filled_final = df_agg_filled.join(df_stations, on="station_id", how="left")

# A left join against a one-row-per-key lookup must preserve the row count.
assert df_agg_filled_final.height == df_agg_filled.height, "station join changed the number of rows"
df_agg_filled_final.select("station_name").null_count()  # Should be 0, no nulls in station names

station_name
u32
0


In [113]:
# Shapes before and after the join; the row counts are expected to be equal
for frame in (df_agg_filled, df_agg_filled_final):
    print(frame.shape)


(87660, 5)
(87660, 8)


In [114]:
# Check unique station names (sorted) in the final joined frame
df_agg_filled_final.get_column("station_name").unique().sort().to_list()

['Alexandra Road',
 'Ang Mo Kio Avenue 5',
 'Bukit Timah Road',
 'Clementi Road',
 'East Coast Parkway',
 'Jurong Pier Road',
 'Kent Ridge Road',
 'Kim Chuan Road',
 'Kranji Way',
 'Lim Chu Kang Road',
 'Mandai Lake Road',
 'Marine Parade Road',
 'Nicoll Highway',
 'Old Choa Chu Kang Road',
 'Old Toh Tuck Road',
 'Pasir Ris Street 51',
 'Poole Road',
 'Punggol Central',
 'Seletar Aerospace View',
 'Simei Avenue',
 'Somerset Road',
 'Toa Payoh North',
 'Towner Road',
 'Tuas Road',
 'Tuas South Avenue 3',
 'Upper Changi Road North',
 'Upper Peirce Reservoir Park',
 'Upper Thomson Road',
 'West Coast Highway',
 'Woodlands Avenue 9']