# Impute missing values in weather data

In [1]:
import polars as pl

### Test

In [2]:
# Load the raw daily weather table, skipping leading rows by position:
# [23:] makes the series start on 2017-07-24 (per the original note, [16:] is the
# offset to use instead when the 7-day lag variable is needed).
df = pl.read_csv("../../data/weather/weather.csv")[23:]
df = df.with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"))

# Count missing values per row (df.null_count() would give per-column totals instead).
# sum_horizontal adds the boolean null mask across columns, so the frame does not
# have to be transposed twice and stitched back together.
nulls = df.with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))

# Keep only the rows that contain at least one missing value.
nulls = nulls.filter(pl.col("null_count") != 0)
nulls

date,day_of_week,mean_press,mean_press_sea,min_press_sea,mean_hum,min_hum,mean_vapor,total_preci,hourly_max_preci,max_depth,total_snowfall,sun_hour,mean_temp,min_temp,max_temp,mean_wind_speed,most_direction,max_wind_speed,max_gust,most_direction_dummy,null_count
date,str,f64,f64,f64,i64,i64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,str,f64,f64,i64,u32
2017-10-22,"""日""",982.5,1007.5,991.7,,,,63.5,8.0,0,0,0.0,15.6,14.9,16.6,0.8,"""西南西""",2.4,3.6,11,3
2017-10-23,"""月""",971.3,996.2,977.8,,,,54.0,11.0,0,0,0.4,13.1,10.6,17.6,7.0,"""西""",12.5,21.2,12,3
2017-10-24,"""火""",997.1,1022.8,1017.5,,,,0.0,0.0,0,0,6.0,11.7,7.5,16.4,1.6,"""西""",5.5,8.1,12,3
2019-10-15,"""火""",998.5,1024.1,1020.2,79,58,11.9,,,0,0,2.8,13.0,7.8,16.2,3.0,"""西""",6.7,12.5,12,2
2020-02-13,"""木""",991.3,1017.6,1014.5,86,72,7.4,1.0,0.5,0,0,,4.6,0.5,9.5,1.5,"""北西""",5.2,8.5,14,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2020-06-05,"""金""",981.4,1005.6,1002.4,66,42,19.6,,,0,0,11.9,24.3,18.0,32.1,2.1,"""北""",6.6,10.7,16,2
2021-01-05,"""火""",993.5,1020.1,1015.5,96,88,6.3,5.0,1.0,14,2,0.2,0.9,0.1,2.5,,,,,,5
2022-02-11,"""金""",995.6,1022.3,1018.9,84,57,5.5,1.0,0.5,48,1,5.0,1.1,-1.8,5.4,,,,,,5
2022-10-02,"""日""",996.9,1021.9,1019.5,76,38,16.1,,,0,0,8.4,19.2,12.2,28.2,1.2,"""北北西""",2.8,4.7,15,2


In [7]:
# Tag every row with its calendar day ("MM-DD") so the same day can be pooled across years.
df = df.with_columns(month_day=pl.col("date").dt.strftime("%m-%d"))

# Mean humidity per calendar day; groups stay in first-appearance order.
df.group_by("month_day", maintain_order=True).agg(pl.col("mean_hum").mean())

month_day,mean_hum
str,f64
"""07-24""",80.375
"""07-25""",78.875
"""07-26""",73.625
"""07-27""",77.375
"""07-28""",82.25
…,…
"""07-20""",77.571429
"""07-21""",76.571429
"""07-22""",77.571429
"""07-23""",76.714286


In [8]:
# Preview the first rows; the month_day column added in the previous cell appears last.
df.head()

date,day_of_week,mean_press,mean_press_sea,min_press_sea,mean_hum,min_hum,mean_vapor,total_preci,hourly_max_preci,max_depth,total_snowfall,sun_hour,mean_temp,min_temp,max_temp,mean_wind_speed,most_direction,max_wind_speed,max_gust,most_direction_dummy,month_day
date,str,f64,f64,f64,i64,i64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,str,f64,f64,i64,str
2017-07-24,"""月""",978.5,1002.5,1000.6,99,93,29.1,50.5,8.0,0,0,0.0,23.7,23.1,25.1,0.9,"""北北西""",2.9,4.0,15,"""07-24"""
2017-07-25,"""火""",979.6,1003.6,1002.1,95,85,28.6,8.0,1.5,0,0,0.0,24.1,22.3,25.7,0.9,"""北北西""",2.2,3.2,15,"""07-25"""
2017-07-26,"""水""",983.1,1007.2,1004.9,73,51,24.0,0.0,0.0,0,0,10.2,25.9,22.0,31.5,2.1,"""東南東""",5.3,9.7,5,"""07-26"""
2017-07-27,"""木""",986.1,1010.2,1008.8,68,54,21.9,0.0,0.0,0,0,4.7,25.4,21.0,30.3,2.7,"""東""",5.0,7.9,4,"""07-27"""
2017-07-28,"""金""",984.8,1008.8,1006.4,84,57,28.0,39.5,27.5,0,0,2.5,26.1,22.6,32.1,1.5,"""北西""",6.4,10.0,14,"""07-28"""


---

## Write imputed weather data to a CSV file

In [9]:
# Reload the raw table so this cell does not depend on the exploratory cells above.
# [23:] skips the leading rows so that the series starts on 2017-07-24.
df = pl.read_csv("../../data/weather/weather.csv")[23:]
df = df.with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"))

# Columns that must NOT be mean-imputed: the date key, the helper grouping column,
# and the categorical fields (weekday, wind direction label and its integer code).
# The original list misspelled "most_direction_dummy", so the direction code was
# being averaged like a numeric measurement.
# NOTE(review): rows with missing wind observations now keep a null
# most_direction_dummy; impute it with a categorical strategy if downstream code
# requires a value there.
non_imputed = {"date", "month_day", "day_of_week", "most_direction", "most_direction_dummy"}
columns_to_fill = [col for col in df.columns if col not in non_imputed]

# Calendar day ("MM-DD") used to pool the same day across years.
df = df.with_columns(pl.col("date").dt.strftime("%m-%d").alias("month_day"))

# Replace each null with the mean of that column over all rows sharing the same
# calendar day. The window expression computes the per-day mean in place, so no
# per-column join and no temporary "<col>_mean" columns are needed.
df = df.with_columns([
    pl.when(pl.col(col).is_null())
    .then(pl.col(col).mean().over("month_day"))
    .otherwise(pl.col(col))
    .alias(col)
    for col in columns_to_fill
])

# Drop the helper column before persisting the result.
df = df.drop("month_day")
df.write_csv("../../data/weather/weather_imputed.csv")

In [None]:
# test
import polars as pl

# Build a small sample frame: the same three calendar days observed over five years,
# with nulls scattered through both measurement columns.
df = pl.DataFrame({
    "date": [
        "2017-01-01", "2017-01-02", "2017-01-03",
        "2018-01-01", "2018-01-02", "2018-01-03",
        "2019-01-01", "2019-01-02", "2019-01-03",
        "2020-01-01", "2020-01-02", "2020-01-03",
        "2021-01-01", "2021-01-02", "2021-01-03"
    ],
    "temp": [10.0, None, 12.0, None, 15.0, 14.0, 11.0, None, 16.0, 10.5, 14.5, None, 9.0, None, 17.0],
    "humidity": [50, None, 55, 48, None, 52, 49, 54, None, None, 50, 56, 53, None, 58]
})
df = df.with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"))

# Extract only the month and day into a new column.
df = df.with_columns(month_day=pl.col("date").dt.strftime("%m-%d"))

# Every column except the key and the helper column gets imputed.
columns_to_fill = [name for name in df.columns if name not in ("date", "month_day")]

# Per-calendar-day mean of each column to fill (one "<col>_mean" column each).
df_filled = df.group_by("month_day").agg(
    [pl.col(name).mean().alias(f"{name}_mean") for name in columns_to_fill]
)

# Attach all the means in one left join, then fill the nulls of every column from
# its matching "<col>_mean" column.
df = df.join(df_filled, on="month_day", how="left")
df = df.with_columns([
    pl.when(pl.col(name).is_null())
    .then(pl.col(f"{name}_mean"))
    .otherwise(pl.col(name))
    .alias(name)
    for name in columns_to_fill
])

# Remove the helper mean columns that are no longer needed.
df = df.drop([f"{name}_mean" for name in columns_to_fill])

# Show the result.
df

In [27]:
# Reload the imputed file (the same path the imputation cell writes to) for inspection.
df = pl.read_csv("../../data/weather/weather_imputed.csv")
# Uncomment to see how many nulls remain in each column:
# df.null_count()