# Gather all data into one file

In [1]:
import polars as pl

## Humidity

年月日, 曜日, 平均湿度(％), 最小相対湿度(％), 平均蒸気圧(hPa)

In [2]:
# Daily humidity export: Shift-JIS encoded, first 4 lines skipped on read.
file_name = "../../data/weather/humidity.csv"
hum_columns = ["date", "day_of_week", "mean_hum", "min_hum", "mean_vapor"]

df_hum = pl.read_csv(file_name, encoding="shift-jis", skip_rows=4)
df_hum.columns = hum_columns  # overwrite the names read from the file with short English ones
# Parse the "YYYY/MM/DD" strings into a Date column.
df_hum = df_hum.with_columns(
    pl.col("date").str.strptime(pl.Date, "%Y/%m/%d")
)
df_hum.head()

date,day_of_week,mean_hum,min_hum,mean_vapor
date,str,i64,i64,f64
2017-07-01,"""土""",88,63,23.8
2017-07-02,"""日""",91,63,25.9
2017-07-03,"""月""",89,56,26.3
2017-07-04,"""火""",97,87,25.6
2017-07-05,"""水""",82,62,23.7


In [3]:
# Null count per column.
display(df_hum.null_count())
# Rows that contain at least one null, with the per-row null count appended.
# sum_horizontal adds the boolean null masks row-wise, so the frame does not
# have to be transposed (and cast to a common dtype) twice.
(
    df_hum
    .with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
    .filter(pl.col("null_count") != 0)
)

date,day_of_week,mean_hum,min_hum,mean_vapor
u32,u32,u32,u32,u32
0,0,3,3,3


date,day_of_week,mean_hum,min_hum,mean_vapor,null_count
date,str,i64,i64,f64,u32
2017-10-22,"""日""",,,,3
2017-10-23,"""月""",,,,3
2017-10-24,"""火""",,,,3


## Barometric pressure

年月日, 曜日, 平均現地気圧(hPa), 平均海面気圧(hPa), 最低海面気圧(hPa), 最低海面気圧(hPa) info

In [4]:
# Daily barometric-pressure export: Shift-JIS encoded, first 5 lines skipped on read.
file_name = "../../data/weather/pressure.csv"
df_press = pl.read_csv(file_name, encoding="shift-jis", skip_rows=5)
df_press.columns = [
    "date", "day_of_week", "mean_press", "mean_press_sea", "min_press_sea",
    "min_press_sea_info",
]
# Only "min_press_sea_info" is dropped: no occurrence-time columns are assigned
# above, and naming a missing column in `drop` raises ColumnNotFoundError on
# Polars versions where `drop` is strict by default.
df_press = df_press.drop("min_press_sea_info")
# Parse the "YYYY/MM/DD" strings into a Date column.
df_press = df_press.with_columns(pl.col("date").str.strptime(pl.Date, "%Y/%m/%d"))
df_press.head()

date,day_of_week,mean_press,mean_press_sea,min_press_sea
date,str,f64,f64,f64
2017-07-01,"""土""",983.8,1008.1,1007.1
2017-07-02,"""日""",985.0,1009.3,1007.5
2017-07-03,"""月""",982.8,1006.9,1004.0
2017-07-04,"""火""",982.3,1006.6,1004.7
2017-07-05,"""水""",983.7,1007.9,1004.0


In [5]:
# Null count per column.
display(df_press.null_count())
# Rows that contain at least one null, with the per-row null count appended.
# sum_horizontal adds the boolean null masks row-wise, so the frame does not
# have to be transposed (and cast to a common dtype) twice.
(
    df_press
    .with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
    .filter(pl.col("null_count") != 0)
)

date,day_of_week,mean_press,mean_press_sea,min_press_sea
u32,u32,u32,u32,u32
0,0,0,0,0


date,day_of_week,mean_press,mean_press_sea,min_press_sea,null_count
date,str,f64,f64,f64,u32


## Rain

年月日, 曜日, 降水量の合計(mm), 降水量の合計(mm) info, 1時間降水量の最大(mm), 1時間降水量の最大(mm) info

In [6]:
# Daily precipitation export: Shift-JIS encoded, first 5 lines skipped on read.
file_name = "../../data/weather/rain.csv"
df_rain = pl.read_csv(file_name, encoding="shift-jis", skip_rows=5)
df_rain.columns = [
    "date", "day_of_week", "total_preci", "total_preci_info", "hourly_max_preci",
    "hourly_max_preci_info",
]
# Drop the two "*_info" columns that exist. No "hourly_max_preci_time_info"
# column is assigned above, and naming a missing column in `drop` raises
# ColumnNotFoundError on Polars versions where `drop` is strict by default.
df_rain = df_rain.drop(["total_preci_info", "hourly_max_preci_info"])
# Parse the "YYYY/MM/DD" strings into a Date column.
df_rain = df_rain.with_columns(pl.col("date").str.strptime(pl.Date, "%Y/%m/%d"))
df_rain.head()

date,day_of_week,total_preci,hourly_max_preci
date,str,f64,f64
2017-07-01,"""土""",11.5,4.0
2017-07-02,"""日""",34.0,14.0
2017-07-03,"""月""",52.0,10.0
2017-07-04,"""火""",37.0,11.0
2017-07-05,"""水""",0.0,0.0


In [7]:
# Null count per column.
display(df_rain.null_count())
# Rows that contain at least one null, with the per-row null count appended.
# sum_horizontal adds the boolean null masks row-wise, so the frame does not
# have to be transposed (and cast to a common dtype) twice.
(
    df_rain
    .with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
    .filter(pl.col("null_count") != 0)
)

date,day_of_week,total_preci,hourly_max_preci
u32,u32,u32,u32
0,0,6,6


date,day_of_week,total_preci,hourly_max_preci,null_count
date,str,f64,f64,u32
2019-10-15,"""火""",,,2
2020-06-02,"""火""",,,2
2020-06-03,"""水""",,,2
2020-06-04,"""木""",,,2
2020-06-05,"""金""",,,2
2022-10-02,"""日""",,,2


## Snow

年月日, 曜日, 最深積雪(cm), 最深積雪(cm) info, 降雪量合計(cm), 降雪量合計(cm) info

In [8]:
# Daily snow export: Shift-JIS encoded, first 5 lines skipped on read.
file_name = "../../data/weather/snow.csv"
df_snow = pl.read_csv(file_name, encoding="shift-jis", skip_rows=5)
df_snow.columns = [
    "date", "day_of_week", "max_depth", "max_depth_info",
    "total_snowfall", "total_snowfall_info",
]
# Drop the two "*_info" columns that exist. No "max_depth_time" or
# "max_depth_time_info" column is assigned above, and naming a missing column
# in `drop` raises ColumnNotFoundError on Polars versions where `drop` is
# strict by default.
df_snow = df_snow.drop(["max_depth_info", "total_snowfall_info"])
# Parse the "YYYY/MM/DD" strings into a Date column.
df_snow = df_snow.with_columns(pl.col("date").str.strptime(pl.Date, "%Y/%m/%d"))
df_snow.head()

date,day_of_week,max_depth,total_snowfall
date,str,i64,i64
2017-07-01,"""土""",0,0
2017-07-02,"""日""",0,0
2017-07-03,"""月""",0,0
2017-07-04,"""火""",0,0
2017-07-05,"""水""",0,0


In [9]:
# Null count per column.
display(df_snow.null_count())
# Rows that contain at least one null, with the per-row null count appended.
# sum_horizontal adds the boolean null masks row-wise, so the frame does not
# have to be transposed (and cast to a common dtype) twice.
(
    df_snow
    .with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
    .filter(pl.col("null_count") != 0)
)

date,day_of_week,max_depth,total_snowfall
u32,u32,u32,u32
0,0,0,0


date,day_of_week,max_depth,total_snowfall,null_count
date,str,i64,i64,u32


## Sun

年月日, 曜日, 日照時間(時間), 日照時間(時間) info

In [10]:
# Daily sunshine-duration export: Shift-JIS encoded, first 4 lines skipped on read.
file_name = "../../data/weather/sun.csv"
df_sun = pl.read_csv(file_name, encoding="shift-jis", skip_rows=4)
df_sun.columns = ["date", "day_of_week", "sun_hour", "sun_hour_info"]
# Only "sun_hour_info" is dropped: no "sun_sphere" column is assigned above,
# and naming a missing column in `drop` raises ColumnNotFoundError on Polars
# versions where `drop` is strict by default.
df_sun = df_sun.drop("sun_hour_info")
# Parse the "YYYY/MM/DD" strings into a Date column.
df_sun = df_sun.with_columns(pl.col("date").str.strptime(pl.Date, "%Y/%m/%d"))
df_sun.head()

date,day_of_week,sun_hour
date,str,f64
2017-07-01,"""土""",0.3
2017-07-02,"""日""",0.7
2017-07-03,"""月""",2.4
2017-07-04,"""火""",0.0
2017-07-05,"""水""",5.5


In [11]:
# Null count per column.
display(df_sun.null_count())
# Rows that contain at least one null, with the per-row null count appended.
# sum_horizontal adds the boolean null masks row-wise, so the frame does not
# have to be transposed (and cast to a common dtype) twice.
(
    df_sun
    .with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
    .filter(pl.col("null_count") != 0)
)

date,day_of_week,sun_hour
u32,u32,u32
0,0,1


date,day_of_week,sun_hour,null_count
date,str,f64,u32
2020-02-13,"""木""",,1


## Temperature

年月日, 曜日, 平均気温(℃), 最低気温(℃), 最高気温(℃)

In [12]:
# Daily temperature export: Shift-JIS encoded, first 4 lines skipped on read.
file_name = "../../data/weather/temperature.csv"
temp_columns = ["date", "day_of_week", "mean_temp", "min_temp", "max_temp"]

df_temp = pl.read_csv(file_name, encoding="shift-jis", skip_rows=4)
df_temp.columns = temp_columns  # overwrite the names read from the file with short English ones
# Parse the "YYYY/MM/DD" strings into a Date column.
df_temp = df_temp.with_columns(
    pl.col("date").str.strptime(pl.Date, "%Y/%m/%d")
)
df_temp.head()

date,day_of_week,mean_temp,min_temp,max_temp
date,str,f64,f64,f64
2017-07-01,"""土""",22.5,20.3,25.1
2017-07-02,"""日""",23.3,20.3,29.2
2017-07-03,"""月""",24.1,21.7,29.3
2017-07-04,"""火""",22.0,20.1,24.2
2017-07-05,"""水""",23.6,20.6,27.8


In [13]:
# Null count per column.
display(df_temp.null_count())
# Rows that contain at least one null, with the per-row null count appended.
# sum_horizontal adds the boolean null masks row-wise, so the frame does not
# have to be transposed (and cast to a common dtype) twice.
(
    df_temp
    .with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
    .filter(pl.col("null_count") != 0)
)

date,day_of_week,mean_temp,min_temp,max_temp
u32,u32,u32,u32,u32
0,0,0,0,0


date,day_of_week,mean_temp,min_temp,max_temp,null_count
date,str,f64,f64,f64,u32


## Wind

年月日, 曜日, 平均風速(m/s), 最多風向(16方位), 最大風速(m/s), 最大風速(m/s) direction, 最大瞬間風速(m/s), 最大瞬間風速(m/s) direction

In [14]:
# Daily wind export: Shift-JIS encoded, first 4 lines skipped on read.
file_name = "../../data/weather/wind.csv"
wind_columns = [
    "date", "day_of_week", "mean_wind_speed", "most_direction", "max_wind_speed",
    "max_wind_speed_direction", "max_gust", "max_gust_direction",
]

df_wind = pl.read_csv(file_name, encoding="shift-jis", skip_rows=4)
df_wind.columns = wind_columns  # overwrite the names read from the file with short English ones
df_wind = (
    df_wind
    # Parse the "YYYY/MM/DD" strings into a Date column.
    .with_columns(pl.col("date").str.strptime(pl.Date, "%Y/%m/%d"))
    # Keep only the most frequent direction; drop the directions of the maxima.
    .drop(["max_wind_speed_direction", "max_gust_direction"])
)
df_wind.head()

date,day_of_week,mean_wind_speed,most_direction,max_wind_speed,max_gust
date,str,f64,str,f64,f64
2017-07-01,"""土""",1.5,"""南南東""",3.8,6.2
2017-07-02,"""日""",1.8,"""北東""",5.7,9.6
2017-07-03,"""月""",2.2,"""北""",6.3,11.3
2017-07-04,"""火""",1.2,"""北""",3.6,5.3
2017-07-05,"""水""",2.8,"""北西""",6.7,9.8


In [15]:
# The 16 compass-point labels, listed from 北北東 round to 北.
compass_points = [
    "北北東", "北東", "東北東", "東", "東南東", "南東", "南南東", "南",
    "南南西", "南西", "西南西", "西", "西北西", "北西", "北北西", "北",
]
# Map each label to its 1-based position in the list above.
direction = {label: code for code, label in enumerate(compass_points, start=1)}
# Add an integer-coded copy of the most frequent wind direction.
# NOTE(review): newer Polars releases appear to deprecate `return_dtype` on
# `replace` in favour of `replace_strict` — confirm against the installed version.
df_wind = df_wind.with_columns(
    pl.col("most_direction")
    .replace(direction, return_dtype=pl.Int8)
    .alias("most_direction_dummy")
)

In [16]:
# Null count per column.
display(df_wind.null_count())
# Rows that contain at least one null, with the per-row null count appended.
# sum_horizontal adds the boolean null masks row-wise, so the frame does not
# have to be transposed (and cast to a common dtype) twice.
(
    df_wind
    .with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
    .filter(pl.col("null_count") != 0)
)

date,day_of_week,mean_wind_speed,most_direction,max_wind_speed,max_gust,most_direction_dummy
u32,u32,u32,u32,u32,u32,u32
0,0,4,4,4,4,4


date,day_of_week,mean_wind_speed,most_direction,max_wind_speed,max_gust,most_direction_dummy,null_count
date,str,f64,str,f64,f64,i8,u32
2020-03-29,"""日""",,,,,,5
2021-01-05,"""火""",,,,,,5
2022-02-11,"""金""",,,,,,5
2024-02-06,"""火""",,,,,,5


---
## Gather'em

Left-join every other dataframe onto the barometric pressure dataframe, which is used as the base because it has no null values.

In [17]:
# Left-join each weather frame onto the pressure frame, matching on both
# the date and the day-of-week columns.
join_keys = ["date", "day_of_week"]
df = df_press
for other in (df_hum, df_rain, df_snow, df_sun, df_temp, df_wind):
    df = df.join(other, on=join_keys, how="left")
df

date,day_of_week,mean_press,mean_press_sea,min_press_sea,mean_hum,min_hum,mean_vapor,total_preci,hourly_max_preci,max_depth,total_snowfall,sun_hour,mean_temp,min_temp,max_temp,mean_wind_speed,most_direction,max_wind_speed,max_gust,most_direction_dummy
date,str,f64,f64,f64,i64,i64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,str,f64,f64,i8
2017-07-01,"""土""",983.8,1008.1,1007.1,88,63,23.8,11.5,4.0,0,0,0.3,22.5,20.3,25.1,1.5,"""南南東""",3.8,6.2,7
2017-07-02,"""日""",985.0,1009.3,1007.5,91,63,25.9,34.0,14.0,0,0,0.7,23.3,20.3,29.2,1.8,"""北東""",5.7,9.6,2
2017-07-03,"""月""",982.8,1006.9,1004.0,89,56,26.3,52.0,10.0,0,0,2.4,24.1,21.7,29.3,2.2,"""北""",6.3,11.3,16
2017-07-04,"""火""",982.3,1006.6,1004.7,97,87,25.6,37.0,11.0,0,0,0.0,22.0,20.1,24.2,1.2,"""北""",3.6,5.3,16
2017-07-05,"""水""",983.7,1007.9,1004.0,82,62,23.7,0.0,0.0,0,0,5.5,23.6,20.6,27.8,2.8,"""北西""",6.7,9.8,14
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-07-27,"""土""",993.4,1017.5,1015.7,85,60,31.8,0.5,0.5,0,0,3.3,28.0,25.0,33.6,1.8,"""北""",5.4,9.2,16
2024-07-28,"""日""",988.5,1012.4,1009.7,84,69,30.9,0.5,0.5,0,0,1.0,27.5,25.4,31.3,2.0,"""北西""",5.0,8.2,14
2024-07-29,"""月""",981.8,1005.6,1002.4,78,57,28.5,1.0,1.0,0,0,1.4,27.4,24.6,32.5,2.5,"""北西""",6.0,10.1,14
2024-07-30,"""火""",978.2,1001.9,1000.7,80,64,29.8,1.5,1.0,0,0,6.3,27.8,24.4,31.9,3.5,"""西北西""",6.4,10.7,13


---
## Remove occurrence time columns

In [18]:
# Keep only the columns whose name does not contain "time".
use_col = [col for col in df.columns if "time" not in col]
df = df.select(use_col)

In [19]:
# Null count per column of the combined frame.
display(df.null_count())
# Rows that contain at least one null, with the per-row null count appended.
# sum_horizontal adds the boolean null masks row-wise, so the frame does not
# have to be transposed (and cast to a common dtype) twice.
(
    df
    .with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
    .filter(pl.col("null_count") != 0)
)

date,day_of_week,mean_press,mean_press_sea,min_press_sea,mean_hum,min_hum,mean_vapor,total_preci,hourly_max_preci,max_depth,total_snowfall,sun_hour,mean_temp,min_temp,max_temp,mean_wind_speed,most_direction,max_wind_speed,max_gust,most_direction_dummy
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,3,3,3,6,6,0,0,1,0,0,0,4,4,4,4,4


date,day_of_week,mean_press,mean_press_sea,min_press_sea,mean_hum,min_hum,mean_vapor,total_preci,hourly_max_preci,max_depth,total_snowfall,sun_hour,mean_temp,min_temp,max_temp,mean_wind_speed,most_direction,max_wind_speed,max_gust,most_direction_dummy,null_count
date,str,f64,f64,f64,i64,i64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,str,f64,f64,i8,u32
2017-10-22,"""日""",982.5,1007.5,991.7,,,,63.5,8.0,0,0,0.0,15.6,14.9,16.6,0.8,"""西南西""",2.4,3.6,11,3
2017-10-23,"""月""",971.3,996.2,977.8,,,,54.0,11.0,0,0,0.4,13.1,10.6,17.6,7.0,"""西""",12.5,21.2,12,3
2017-10-24,"""火""",997.1,1022.8,1017.5,,,,0.0,0.0,0,0,6.0,11.7,7.5,16.4,1.6,"""西""",5.5,8.1,12,3
2019-10-15,"""火""",998.5,1024.1,1020.2,79,58,11.9,,,0,0,2.8,13.0,7.8,16.2,3.0,"""西""",6.7,12.5,12,2
2020-02-13,"""木""",991.3,1017.6,1014.5,86,72,7.4,1.0,0.5,0,0,,4.6,0.5,9.5,1.5,"""北西""",5.2,8.5,14,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2020-06-05,"""金""",981.4,1005.6,1002.4,66,42,19.6,,,0,0,11.9,24.3,18.0,32.1,2.1,"""北""",6.6,10.7,16,2
2021-01-05,"""火""",993.5,1020.1,1015.5,96,88,6.3,5.0,1.0,14,2,0.2,0.9,0.1,2.5,,,,,,5
2022-02-11,"""金""",995.6,1022.3,1018.9,84,57,5.5,1.0,0.5,48,1,5.0,1.1,-1.8,5.4,,,,,,5
2022-10-02,"""日""",996.9,1021.9,1019.5,76,38,16.1,,,0,0,8.4,19.2,12.2,28.2,1.2,"""北北西""",2.8,4.7,15,2


In [20]:
# Show the combined frame (the rendered output elides the middle rows).
df

date,day_of_week,mean_press,mean_press_sea,min_press_sea,mean_hum,min_hum,mean_vapor,total_preci,hourly_max_preci,max_depth,total_snowfall,sun_hour,mean_temp,min_temp,max_temp,mean_wind_speed,most_direction,max_wind_speed,max_gust,most_direction_dummy
date,str,f64,f64,f64,i64,i64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,str,f64,f64,i8
2017-07-01,"""土""",983.8,1008.1,1007.1,88,63,23.8,11.5,4.0,0,0,0.3,22.5,20.3,25.1,1.5,"""南南東""",3.8,6.2,7
2017-07-02,"""日""",985.0,1009.3,1007.5,91,63,25.9,34.0,14.0,0,0,0.7,23.3,20.3,29.2,1.8,"""北東""",5.7,9.6,2
2017-07-03,"""月""",982.8,1006.9,1004.0,89,56,26.3,52.0,10.0,0,0,2.4,24.1,21.7,29.3,2.2,"""北""",6.3,11.3,16
2017-07-04,"""火""",982.3,1006.6,1004.7,97,87,25.6,37.0,11.0,0,0,0.0,22.0,20.1,24.2,1.2,"""北""",3.6,5.3,16
2017-07-05,"""水""",983.7,1007.9,1004.0,82,62,23.7,0.0,0.0,0,0,5.5,23.6,20.6,27.8,2.8,"""北西""",6.7,9.8,14
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-07-27,"""土""",993.4,1017.5,1015.7,85,60,31.8,0.5,0.5,0,0,3.3,28.0,25.0,33.6,1.8,"""北""",5.4,9.2,16
2024-07-28,"""日""",988.5,1012.4,1009.7,84,69,30.9,0.5,0.5,0,0,1.0,27.5,25.4,31.3,2.0,"""北西""",5.0,8.2,14
2024-07-29,"""月""",981.8,1005.6,1002.4,78,57,28.5,1.0,1.0,0,0,1.4,27.4,24.6,32.5,2.5,"""北西""",6.0,10.1,14
2024-07-30,"""火""",978.2,1001.9,1000.7,80,64,29.8,1.5,1.0,0,0,6.3,27.8,24.4,31.9,3.5,"""西北西""",6.4,10.7,13


In [20]:
# Persist the combined weather frame as CSV.
output_path = "../../data/weather/tmp.csv"
df.write_csv(output_path)