# Library import

In [1]:
from dateutil.relativedelta import relativedelta
import pandas as pd

# Load data

In [2]:
# Read the three input CSVs in a single statement; the files are read in the
# order listed and bound to the same names as before.
df_merge, df_weather, df_sample = (
    pd.read_csv("../data/merge_data_preprocessing.csv"),  # frame that later receives the weather lags
    pd.read_csv("../data/weather_v3.csv"),  # weather table
    pd.read_csv("../data/sample_submission.csv"),  # not referenced in the cells visible here
)

# Preprocessing

## Weather merge

In [3]:
def typhoon_fillna_by_groupby(df, agg):
    """Return a copy of ``df`` with ``area`` relabelled and typhoon gaps zero-filled.

    Despite the name, no group-by is performed and ``agg`` is never read; the
    parameter is only kept so that existing calls keep working.

    Args:
        df: Frame with ``area`` and ``typhoon_approach`` columns. It is not
            modified.
        agg: Unused.

    Returns:
        A new DataFrame in which ``area`` has been passed through the lookup
        table below (labels missing from the table become NaN) and missing
        ``typhoon_approach`` values are replaced by 0.
    """
    # Observation-point label -> area label used as the merge key later on
    # (the values look like prefecture names, e.g. 仙台 -> 宮城).
    area_lookup = {
        "仙台": "宮城",
        "佐賀": "佐賀",
        "前橋": "群馬",
        "千葉": "千葉",
        "名古屋": "愛知",
        "和歌山": "和歌山",
        "宇都宮": "栃木",
        "宮崎": "宮崎",
        "山形": "山形",
        "帯広": "北海道",
        "徳島": "徳島",
        "新潟": "新潟",
        "東京": "東京",
        "松山": "愛媛",
        "横浜": "神奈川",
        "水戸": "茨城",
        "浜松": "静岡",
        "熊本": "熊本",
        "熊谷": "埼玉",
        "甲府": "山梨",
        "盛岡": "岩手",
        "神戸": "兵庫",
        "福岡": "福岡",
        "福島": "福島",
        "秋田": "秋田",
        "那覇": "沖縄",
        "長崎": "長崎",
        "長野": "長野",
        "青森": "青森",
        "高松": "香川",
        "高知": "高知",
        "鹿児島": "鹿児島",
    }
    # assign() builds a new frame, so the caller's frame is left as it was.
    return df.assign(
        area=df["area"].map(area_lookup),
        typhoon_approach=df["typhoon_approach"].fillna(0),
    )

In [4]:
# Relabel the weather table's areas and zero-fill typhoon_approach.
# The "mean" argument is ignored by typhoon_fillna_by_groupby.
# NOTE: run this cell once per kernel session. It overwrites df_weather, and
# mapping already-converted labels a second time turns many of them into NaN
# (e.g. 宮城 is not a key of the lookup table).
df_weather = typhoon_fillna_by_groupby(df_weather, "mean")

In [5]:
def weather_area(area):
    """Return the area label whose weather rows should be used for ``area``.

    Two labels are redirected to a different area (大分 -> 熊本, 岐阜 -> 愛知);
    any other value is handed back unchanged.
    """
    # Presumably these two areas have no rows of their own in the weather
    # table and borrow another area's data — confirm against the data.
    substitutes = {"大分": "熊本", "岐阜": "愛知"}
    return substitutes.get(area, area)


def lag_weather_add(df, n_shift, weather=None):
    """Join the weather observed ``n_shift`` months earlier onto ``df``.

    Every weather column except ``year-month``, ``month`` and ``area`` gets the
    suffix ``_{n_shift}prev``; the weather rows are then matched to the rows of
    ``df`` on (``year-month`` shifted back by ``n_shift`` months, ``area``).

    Args:
        df: Frame with a ``year-month`` column (anything ``pd.to_datetime``
            can parse) and an ``area`` column. It is not modified.
        n_shift: Number of months to look back.
        weather: Weather table with ``year-month`` and ``area`` columns.
            Defaults to the notebook-level ``df_weather``.

    Returns:
        A new DataFrame holding the rows of ``df`` that found a weather match
        (inner join), with ``area`` passed through ``weather_area`` and the
        suffixed weather columns appended.
    """
    if weather is None:
        # Fall back to the frame prepared earlier in the notebook so that
        # existing two-argument calls behave as before.
        weather = df_weather

    unsuffixed = ("year-month", "month", "area")
    # NOTE(review): "month" is deliberately left without a lag suffix; if both
    # frames carry a "month" column, repeated calls will collide on it
    # (pandas adds _x/_y suffixes) — confirm this is intended.
    # Assumes weather["year-month"] holds "%Y-%m" strings, since that is the
    # format of the key built below — TODO confirm.
    lagged_weather = weather.rename(
        columns=lambda col: col if col in unsuffixed else f"{col}_{n_shift}prev"
    ).rename(columns={"year-month": "merge-year-month"})

    # Work on a copy so the caller's frame keeps its original "area" values
    # and does not gain the temporary join key.
    left = df.copy()
    left["area"] = left["area"].map(weather_area)
    # Vectorised month shift; DateOffset(months=...) rolls across year
    # boundaries the same way relativedelta does.
    left["merge-year-month"] = (
        pd.to_datetime(left["year-month"]) - pd.DateOffset(months=n_shift)
    ).dt.strftime("%Y-%m")

    # NOTE(review): how="inner" silently drops rows of df without matching
    # weather for the lagged month — confirm that losing those rows is wanted.
    merged = left.merge(lagged_weather, how="inner", on=["merge-year-month", "area"])
    return merged.drop(columns=["merge-year-month"])

In [None]:
# Attach weather features lagged by 1 through 12 months; each pass adds one set
# of "<column>_<n>prev" columns.
# lag_weather_add joins with how="inner", so rows lacking a weather match for
# any of the lags are dropped along the way.
# NOTE: df_merge is overwritten on every pass — re-running this cell without
# reloading df_merge merges the same lag columns a second time.
for i in range(1, 13):
    df_merge = lag_weather_add(df_merge, i)

In [None]:
# Write the frame with the lagged weather columns to disk as a pickle.
df_merge.to_pickle("../data/merge_data_preprocessing_v2.pickle")