In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw table; the relative path is resolved against the current working directory.
raw_dataset = pd.read_csv("Tokyo.csv")

In [None]:
raw_dataset.head()

In [None]:
raw_dataset.describe()

In [None]:
# Work on a copy so that raw_dataset keeps the values exactly as they were loaded.
dataset = raw_dataset.copy()

In [None]:
dataset.isnull().sum()

In [None]:
dataset.info()

# Date

In [None]:
date_time = pd.to_datetime(dataset["Date"])

In [None]:
# Convert every parsed timestamp to POSIX seconds (float) so it can be fed to the
# sin/cos cyclic encodings computed further down.
timestamp_s = date_time.map(pd.Timestamp.timestamp)

In [None]:
timestamp_s

In [None]:
# Period lengths in seconds used for the cyclic time features.
# 365.2425 is presumably the mean Gregorian year length in days — confirm if it matters.
day = 60 * 60 * 24
year = day * 365.2425

In [None]:
# Map the timestamp onto the unit circle once per period, so that the end of a
# day/year sits next to the start of the following one.
day_phase = timestamp_s * (2 * np.pi / day)
year_phase = timestamp_s * (2 * np.pi / year)

dataset["Day sin"] = np.sin(day_phase)
dataset["Day cos"] = np.cos(day_phase)

dataset["Year sin"] = np.sin(year_phase)
dataset["Year cos"] = np.cos(year_phase)

In [None]:
plt.plot(dataset["Day sin"][:50])
plt.plot(dataset["Day cos"][:50])

In [None]:
# The raw date column is no longer needed once the sin/cos features exist.
dataset = dataset.drop(columns=["Date"])

# Air Pressure

In [None]:
plt.hist(dataset["air_pressure_ashore"])

In [None]:
# Readings below 800 are treated as invalid and replaced by a mean value.
# The mean is taken over the valid rows only: averaging the whole column would let the
# very outliers being replaced drag the imputed value down.
ashore_invalid = dataset["air_pressure_ashore"] < 800.0
dataset.loc[ashore_invalid, "air_pressure_ashore"] = dataset.loc[~ashore_invalid, "air_pressure_ashore"].mean()

In [None]:
plt.hist(dataset["air_pressure_ashore"])

In [None]:
plt.hist(dataset["air_pressure_afloat"])

In [None]:
# Readings below 800 are treated as invalid and replaced by a mean value.
# The mean is taken over the valid rows only: averaging the whole column would let the
# very outliers being replaced drag the imputed value down.
afloat_invalid = dataset["air_pressure_afloat"] < 800.0
dataset.loc[afloat_invalid, "air_pressure_afloat"] = dataset.loc[~afloat_invalid, "air_pressure_afloat"].mean()

In [None]:
plt.hist(dataset["air_pressure_afloat"])

In [None]:
# Row-wise pressure difference: ashore minus afloat.
dataset["diff_air_pressure"] = dataset["air_pressure_ashore"].sub(dataset["air_pressure_afloat"])

In [None]:
plt.hist(dataset["diff_air_pressure"])

In [None]:
# Differences above 5.0 are treated as outliers and replaced by a mean value.
# The mean is taken over the remaining rows only, so the outliers being replaced
# do not inflate the value that is written back.
diff_outlier = dataset["diff_air_pressure"] > 5.0
dataset.loc[diff_outlier, "diff_air_pressure"] = dataset.loc[~diff_outlier, "diff_air_pressure"].mean()

In [None]:
# Pass x and y by keyword: seaborn 0.12+ accepts only `data` positionally, so the
# two-positional-argument form raises TypeError there. This also matches the keyword
# style used by the final scatterplot in this notebook.
sns.scatterplot(x=dataset["diff_air_pressure"], y=dataset["temperature"])

In [None]:
# Split the pressure difference into two equal-width intervals.
dataset["diff_air_pressure_bucketized"] = pd.cut(dataset["diff_air_pressure"], bins=2)

In [None]:
dataset["diff_air_pressure_bucketized"].value_counts()

# Precipitation

In [None]:
# Replace missing precipitation with 0.0.
# NOTE(review): assumes a missing value means no rain was recorded — confirm against the data source.
dataset["precipitation"] = dataset["precipitation"].fillna(0.0)

In [None]:
plt.hist(dataset.loc[dataset["precipitation"] > 0.0, "precipitation"])

In [None]:
# Count rows per 10-unit precipitation band. pd.cut bins are right-closed by default
# ((0, 10], (10, 20], ...), so exactly-zero readings and anything above 50 fall outside
# every bin and are left out of these counts.
pd.cut(dataset["precipitation"], [0, 10, 20, 30, 40, 50]).value_counts()

In [None]:
dataset.loc[dataset["precipitation"] > 40.0]

In [None]:
plt.scatter(dataset["precipitation"], dataset["temperature"])

In [None]:
# Precipitation vs. temperature, restricted to rows with some recorded precipitation.
rainy_rows = dataset.loc[dataset["precipitation"] > 0]
plt.scatter(rainy_rows["precipitation"], rainy_rows["temperature"])

# Temperature

In [None]:
sns.displot(dataset["temperature"])

# Humidity

In [None]:
plt.hist(dataset["humidity"], bins=50)

In [None]:
plt.scatter(dataset["temperature"], dataset["humidity"])

# Wind

In [None]:
plt.hist(dataset["wind_velocity"])

In [None]:
dataset["wind_direction"].value_counts()

In [None]:
# Strip the " )" suffix that some direction labels carry.
# The vectorised .str accessor leaves missing values as NaN instead of raising
# AttributeError when the lambda hits a float, and regex=False makes ")" a literal
# character rather than an unbalanced regex group.
dataset["wind_direction"] = dataset["wind_direction"].str.replace(" )", "", regex=False)

In [None]:
dataset.loc[dataset["wind_direction"] == "×", "wind_velocity"].value_counts()

In [None]:
# Relabel the "×" placeholder as "静穏" so it goes through the direction lookup
# like every other label.
is_placeholder_direction = dataset["wind_direction"].eq("×")
dataset.loc[is_placeholder_direction, "wind_direction"] = "静穏"

In [None]:
dataset["wind_direction"].value_counts()

In [None]:
# Wind direction labels and their angles in degrees, paired position by position.
direction = [
    "北", "北北東", "北東", "東北東",
    "東", "東南東", "南東", "南南東",
    "南", "南南西", "南西", "西南西",
    "西", "西北西", "北西", "北北西",
    "静穏",
]
degree = [
    0.0, 22.5, 45.0, 67.5,
    90.0, 112.5, 135.0, 157.5,
    180.0, 202.5, 225.0, 247.5,
    270.0, 292.5, 315.0, 337.5,
    0,
]

direction_dict = {label: angle for label, angle in zip(direction, degree)}


def direction_to_degree(direction):
    """Return the angle in degrees for a direction label, or 0.0 for an unlisted label."""
    return direction_dict.get(direction, 0.0)

dataset["wind_direction"] = dataset["wind_direction"].map(direction_to_degree)

In [None]:
wind_direction_rad = dataset["wind_direction"] * np.pi / 180

In [None]:
# Decompose wind speed into two orthogonal components from the direction angle.
# NOTE(review): the direction table maps north to 0 degrees and east to 90, so cos()
# yields the north-south component and sin() the east-west one; the x/y names may be
# swapped relative to the usual "east = x" convention — confirm before reading the axes.
dataset["wind_vector_x"] = dataset["wind_velocity"] * np.cos(wind_direction_rad)
dataset["wind_vector_y"] = dataset["wind_velocity"] * np.sin(wind_direction_rad)

In [None]:
plt.scatter(dataset["wind_vector_x"], dataset["wind_vector_y"])

In [None]:
plt.scatter(dataset["wind_vector_x"], dataset["temperature"])

In [None]:
plt.scatter(dataset["wind_vector_y"], dataset["temperature"])

In [None]:
# Direction and speed are now represented by the two wind vector components.
dataset = dataset.drop(columns=["wind_direction", "wind_velocity"])

In [None]:
dataset.head()

# Daylight

In [None]:
# Flag rows that have an hours_of_daylight reading (1) versus rows where it is missing (0).
# This has to run while the NaNs are still present: once hours_of_daylight is filled with
# 0.0 in a later cell, re-running this cell would flag every row as 1.
dataset.loc[dataset["hours_of_daylight"].notnull(), "is_daytime"] = 1
dataset.loc[dataset["hours_of_daylight"].isnull(), "is_daytime"] = 0

In [None]:
dataset.head(10)

In [None]:
dataset["hours_of_daylight"] = dataset["hours_of_daylight"].fillna(0.0)

In [None]:
dataset["hours_of_daylight"].value_counts()

In [None]:
dataset["is_daytime"].value_counts()

In [None]:
plt.hist(dataset["hours_of_daylight"])

In [None]:
plt.scatter(dataset["temperature"], dataset["hours_of_daylight"])

In [None]:
dataset["global_solar_radiation"] = dataset["global_solar_radiation"].fillna(0.0)

In [None]:
sns.displot(dataset["global_solar_radiation"])

In [None]:
plt.scatter(dataset["global_solar_radiation"], dataset["temperature"])

# Weather

In [None]:
dataset["weather"] = dataset["weather"].fillna("--")
dataset["weather"].value_counts()

In [None]:
# Japanese weather labels and their English names, paired position by position.
# NOTE(review): the "rainny" spelling is kept as-is because the one-hot column names
# produced later derive from these strings.
weather_jp = [
    "快晴", "晴れ", "曇", "薄雲", "雨", "霧雨",
    "雪", "みぞれ", "氷あられ", "雷電", "--",
]
weather_en = [
    "cloudless day", "sunny", "cloudy", "thin cloud", "rainny", "drizzle",
    "snowy", "sleet", "hail", "thunder", "others",
]

weather_dict = {jp_label: en_label for jp_label, en_label in zip(weather_jp, weather_en)}


def weather_jp_to_en(weather_jp):
    """Return the English name for a Japanese weather label, or "others" for an unlisted label."""
    return weather_dict.get(weather_jp, "others")

dataset["weather"] = dataset["weather"].map(weather_jp_to_en)

In [None]:
dataset["cloud_cover"] = dataset["cloud_cover"].fillna("--")
dataset["cloud_cover"].value_counts()

In [None]:
dataset.isnull().sum()

In [None]:
dataset.info()

In [None]:
# "wind_velocity" was dropped from `dataset` once the wind vector components were built,
# so reading it from `dataset` here raises KeyError on a fresh top-to-bottom run.
# Bucketize the untouched column in raw_dataset instead: `dataset` started as a copy of
# raw_dataset and no step in this notebook drops or reorders rows, so the indexes line up.
dataset["wind_velocity_bucketized"] = pd.cut(raw_dataset["wind_velocity"], bins=5)

In [None]:
dataset["wind_velocity_bucketized"]

In [None]:
# One-hot encode the object/category columns; get_dummies leaves numeric columns unchanged.
dataset = pd.get_dummies(dataset)

In [None]:
dataset.info()

In [None]:
# Numeric columns to compare pairwise in the grid of scatter plots.
pairplot_columns = [
    "temperature",
    "air_pressure_ashore",
    "air_pressure_afloat",
    "precipitation",
    "humidity",
    "hours_of_daylight",
    "global_solar_radiation",
    "diff_air_pressure",
    "wind_vector_x",
    "wind_vector_y",
    "Day sin",
    "Day cos",
    "Year sin",
    "Year cos",
]
sns.pairplot(dataset, vars=pairplot_columns)

In [None]:
sns.scatterplot(data=dataset, x="hours_of_daylight", y="temperature", hue="is_daytime", alpha=0.1)