# Meteorological factors exploration template

## Import libraries

In [1]:
"""Basic ones"""
import polars as pl
from plotnine import *
import matplotlib.pyplot as plt
plt.show()  # author's note: called so that figures are displayed after ggplot output
# plt.rcParams['axes.grid'] = True
import numpy as np
import seaborn as sns
import pandas
import statsmodels.api as sm
from datetime import datetime

import warnings
# Ignore all warnings from here on; note this also hides deprecation notices.
warnings.simplefilter("ignore")

## Load meteorological data

In [None]:
# Location of the weather CSV export(s); change this to the actual csv file name.
file_path = "../data/weather/*.csv"

# The csv files include Japanese text, so they are decoded as Shift-JIS.
# The first three lines are skipped before the header row is read.
df = pl.read_csv(
    file_path,
    has_header=True,
    skip_rows=3,
    encoding="shift-jis",
)

# Every file has the fixed names "date" and "day_of_week" for its first and
# second columns; the trailing `...` is a template placeholder for the
# remaining column names.
df.columns = ["date", "day_of_week", ...]

# Parse the string columns into temporal dtypes: "date" becomes a Date and
# "time" becomes a Datetime.
df = df.with_columns(
    [
        pl.col("date").str.strptime(pl.Date, "%Y/%m/%d"),
        pl.col("time").str.strptime(pl.Datetime, "%Y/%m/%d %H:%M"),
    ]
)
df.head()

## Check nulls

Reference for counting nulls row-wise:
(https://stackoverflow.com/questions/76219628/how-to-find-the-no-of-nulls-in-every-column-in-a-polars-dataframe)


In [None]:
# Column-wise view: number of nulls in each column.
display(df.null_count())

# Row-wise view: transposing turns every original row into a column, so the
# per-column null sum is the null count of that row. Transposing back yields a
# single column ("column_0"), which is renamed and appended to the frame.
nulls = df.hstack(
    df.transpose()
    .select(pl.all().is_null().sum())
    .transpose()
    .rename({"column_0": "null_count"})
)

# Show only the rows that contain at least one null.
nulls.filter(pl.col("null_count") != 0)

## Encode day of week

In [None]:
# Japanese weekday abbreviations, Sunday first; each one is encoded as its
# position in this list (0-6).
days_of_week = ["日", "月", "火", "水", "木", "金", "土"]
day_to_label = dict(zip(days_of_week, range(len(days_of_week))))

# Replace the weekday strings with their integer labels in place.
# NOTE(review): newer polars releases rename `Expr.apply` to `map_elements` --
# confirm against the installed version before upgrading.
df = df.with_columns(
    pl.col("day_of_week")
    .apply(lambda weekday: day_to_label[weekday])
    .alias("day_of_week")
)

# Remove the helpers so they do not linger in the notebook namespace.
del days_of_week, day_to_label

## Visualize each column

In [None]:
# Overlay three line series against the "day" column on one wide figure.
# The empty y names are template placeholders to be replaced with real columns.
# NOTE(review): the load cell names the first column "date" -- confirm that a
# "day" column actually exists in df.
(
    ggplot(df, aes(x="day"))
    + geom_line(mapping=aes(y=""), color="green")
    + geom_line(mapping=aes(y=""), color="red")
    + geom_line(mapping=aes(y=""))
    + theme(figure_size=(12, 4))
)

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation (up to 1000 lags) and partial autocorrelation (up to 100
# lags) of one column; the empty column name is a template placeholder.
# Nulls are dropped before plotting, and the trailing semicolons keep the
# notebook from echoing the returned value.
plot_acf(x=df[""].drop_nulls(), lags=1000);
plot_pacf(x=df[""].drop_nulls(), lags=100);

### Features seen in visualization

- Distribution
- Distribution (value range)
- Check the features' meaning

## Process nulls

- If the weather data has null values, how should they be handled?
  - __The ECG data is probably more important, so the processed weather data will be aligned to the dates of the ECG data.__
---
- If both the weather data and the ECG data are null -> Delete both, which means all data for that date will be removed.
- If only the ECG data is null -> Delete the weather data, which means all data for that date will be removed. (This never happens, because the weather data is aligned to the dates of the ECG data.)
- __If only the weather data is null__ -> Perhaps a state-space model can handle it?

## Match the dates of the ECG data and the weather data