In [2]:
import pandas as pd

# データの加工整形

In [3]:
# Notes on the raw observation data
#
# Precipitation:
#   Values are basically decimals. An integer 0 also appears and means no rain
#   fell; 0.0 appears to mean that a trace amount fell.
#
# Sunshine duration:
#   Basically a decimal between 0 and 1. An integer 0 means the sun was not up
#   at all.
#
# Wind speed / wind direction:
#   Wind direction is written using 16 compass directions.
#
# Global solar radiation:
#   Encoding unknown; the file cannot be opened.
#
# Weather:
#   1. Values are numbers assigned to the corresponding weather symbols.
#   2. Observed only every 3 hours, and never at midnight (00:00).
#
# Cloud cover:
#   Basically an integer from 0 to 10. The values "0+" and "10-" also exist.

# Per-element settings: the column name found in the raw CSV, the column name
# to use after renaming, and the target dtype.
status = {
    "temperature": {
        "before_colname": "気温(℃)",
        "after_colname": "temperature",
        "dtype": "float",
    },
    "precipitation": {
        "before_colname": "降水量(mm)",
        "after_colname": "precipitation",
        "dtype": "float",
    },
    "sunshine": {
        "before_colname": "日照時間(時間)",
        "after_colname": "sunshine",
        "dtype": "float",
    },
    # Wind is the special case: two value columns, keyed by 1 and 3.
    "wind": {
        "before_colname": {1: "風速(m/s)", 3: "風向"},
        "after_colname": {1: "wind_speed", 3: "wind_direction"},
        "dtype": {1: float, 3: object},
    },
    # Solar radiation is disabled (see the note above):
    # "solar": {"before_colname": "", "after_colname": "solar", "dtype": "float"},
    "pressure": {
        "before_colname": "現地気圧(hPa)",
        "after_colname": "pressure",
        "dtype": "float",
    },
    "humidity": {
        "before_colname": "相対湿度(％)",
        "after_colname": "humidity",
        "dtype": "float",
    },
    "weather": {
        "before_colname": "天気",
        "after_colname": "weather",
        "dtype": "int",
    },
    # The "+" and "-" suffixes must be handled before any type conversion.
    "cloud": {
        "before_colname": "雲量(10分比)",
        "after_colname": "cloud",
        "dtype": "int",
    },
}

In [4]:
def lead_and_preprocess_data(year, element, status, data_dir="./../data/csv"):
    '''Read one raw CSV file and reduce it to the timestamp and value columns.

    The file ``{data_dir}/data_{element}_{year}.csv`` is read as Shift-JIS.
    After ``reset_index()`` the leading fields show up as columns named
    ``level_0``, ``level_1``, ...; the first original column is discarded.

    Parameters
    ----------
    year : int or str
        Year used to build the file name.
    element : str
        Element name used to build the file name. ``"wind"`` is a special
        case: it keeps two value columns and has one more header row.
    status : dict
        Not used by this function; kept so existing callers keep working.
    data_dir : str, optional
        Directory containing the CSV files. The default reproduces the path
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0-based index. Columns are
        ``["年月日時", "風速(m/s)", "風向"]`` for wind, otherwise the two
        names found in the second row of the raw table. Values are left
        exactly as read; no type conversion happens here.
    '''

    raw = pd.read_csv(f"{data_dir}/data_{element}_{year}.csv", encoding="shift-jis")

    # Turn the index levels into ordinary columns and discard the first
    # original column (the one named by the file's first line).
    first_column = raw.columns[0]
    table = raw.reset_index().drop(columns=first_column)

    if element == "wind":
        # level_1 / level_3 match the keys 1 and 3 used for wind elsewhere.
        table = table[["level_0", "level_1", "level_3"]]
        colnames = ["年月日時", "風速(m/s)", "風向"]
        header_rows = [0, 1, 2, 3]
    else:
        table = table[["level_0", "level_1"]]
        # Row 1 of the raw table carries the column names.
        colnames = table.iloc[1].values.tolist()
        header_rows = [0, 1, 2]

    # drop() returns a new frame, so nothing below writes into a slice of
    # another frame (the old inplace calls caused SettingWithCopyWarning).
    table = table.drop(index=header_rows).reset_index(drop=True)

    # set_axis(..., inplace=True) no longer exists in pandas 2.x; assigning
    # the labels directly works on every pandas version.
    table.columns = colnames

    return table

In [5]:
def rename_column_and_trans_type(data, year, element, status):
    '''Rename the value column(s) of one element and convert their dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a "年月日時" column plus the raw value column(s) named
        by ``status[element]["before_colname"]``. It is not modified.
    year : int or str
        Not used by this function; kept so existing callers keep working.
    element : str
        Key into ``status``. ``"wind"`` has two value columns (keys 1 and 3).
        ``"weather"`` and ``"cloud"`` are renamed but NOT type-converted,
        because they contain NaN and are handled later.
    status : dict
        Per-element settings with "before_colname", "after_colname", "dtype".

    Returns
    -------
    pandas.DataFrame
        A new frame with "年月日時" followed by the renamed value column(s).
    '''

    spec = status[element]

    if element == "wind":
        before, after, dtypes = spec["before_colname"], spec["after_colname"], spec["dtype"]
        # Look the two columns up by their explicit keys instead of relying
        # on the iteration order of the dtype dict.
        rename_map = {before[1]: after[1], before[3]: after[3]}
        dtype_map = {after[1]: dtypes[1], after[3]: dtypes[3]}
    else:
        after_name = spec["after_colname"]
        rename_map = {spec["before_colname"]: after_name}
        # Weather and cloud cover contain NaN, so their conversion is deferred.
        if element != "weather" and element != "cloud":
            dtype_map = {after_name: spec["dtype"]}
        else:
            dtype_map = {}

    # rename() without inplace leaves the caller's frame untouched, and
    # astype() returns a new frame, so nothing is assigned into a slice.
    feature_names = ["年月日時", *rename_map.values()]
    result = data.rename(columns=rename_map)[feature_names]
    if dtype_map:
        result = result.astype(dtype_map)

    return result

In [6]:
def trans_datetime(data):
    '''Parse the "年月日時" column as datetimes and rename it to "datetime".

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a "年月日時" column that ``pd.to_datetime`` can parse.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column order, where "年月日時" is replaced
        by a parsed column named "datetime".
    '''

    parsed = pd.to_datetime(data["年月日時"])

    # Build a new frame instead of mutating the argument, so calling this
    # again on the same input (e.g. re-running a cell) does not fail on a
    # column that was already renamed.
    return data.assign(**{"年月日時": parsed}).rename(columns={"年月日時": "datetime"})

In [9]:
def sort_columns(data):
    '''Return the frame restricted to the known columns, in a fixed order.'''

    column_order = (
        'datetime',
        'temperature',
        'precipitation',
        'sunshine',
        'wind_speed',
        'wind_direction',
        'pressure',
        'humidity',
        'weather',
        'cloud',
    )
    # Selecting with a list both reorders the columns and drops any others.
    return data[list(column_order)]

In [10]:
# Build the combined table: for every year, load each element listed in
# `status`, merge them on the timestamp column, then stack all years.
now_year = 2022
cycle = 20

start_year = now_year - (cycle - 1)
data_year = pd.DataFrame()
yearly_frames = []

for i in range(cycle):
    year = start_year + i
    for num, element in enumerate(status.keys()):

        # Load and reshape the raw file
        data = lead_and_preprocess_data(year, element, status)

        # Rename columns and convert dtypes
        data = rename_column_and_trans_type(data, year, element, status)

        # Merge: the first element is the base; the others are left-joined
        # onto its timestamps.
        if num == 0:
            data_year = data
        else:
            data_year = pd.merge(data_year, data, on="年月日時", how="left")

    # Parse the timestamp column
    data_year = trans_datetime(data_year)
    # Put the columns in the canonical order
    data_year = sort_columns(data_year)
    # Skip the first 24 rows: per the original note, one day's worth of rows
    # is duplicated between consecutive files.
    data_year = data_year[24:]

    # Collect the yearly frame; everything is concatenated once after the
    # loop instead of re-copying the accumulated rows on every iteration.
    yearly_frames.append(data_year)

# Stack all years and renumber the index from 0.
if yearly_frames:
    weather_data = pd.concat(yearly_frames, axis=0).reset_index(drop=True)
else:
    weather_data = pd.DataFrame()

In [11]:
# Display the combined frame built in the previous cell.
weather_data

Unnamed: 0,datetime,temperature,precipitation,sunshine,wind_speed,wind_direction,pressure,humidity,weather,cloud
0,2002-11-29 01:00:00,9.0,0.0,,4.7,北西,1019.6,39.0,,
1,2002-11-29 02:00:00,8.4,0.0,,3.8,北北西,1020.0,40.0,,
2,2002-11-29 03:00:00,8.1,0.0,,4.0,北北西,1020.1,41.0,1,0+
3,2002-11-29 04:00:00,7.9,0.0,,2.6,北北西,1020.3,41.0,,
4,2002-11-29 05:00:00,6.9,0.0,,2.9,北北東,1020.8,47.0,,
...,...,...,...,...,...,...,...,...,...,...
175315,2022-11-28 20:00:00,13.2,0.0,0.0,1.4,西,1023.9,69.0,,
175316,2022-11-28 21:00:00,13.2,0.0,0.0,1.9,北西,1023.4,71.0,4,10
175317,2022-11-28 22:00:00,13.4,0.0,0.0,1.7,西北西,1023.4,70.0,,
175318,2022-11-28 23:00:00,13.3,0.0,0.0,2.0,北西,1023.0,72.0,,


In [12]:
# Save the combined data to CSV (index=False: the row index is not written).
weather_data.to_csv("./../data/csv/weather_data.csv",index=False)

In [3]:
# Reload the combined data from the CSV file.
# NOTE(review): no parse_dates is passed, so any date column comes back as
# plain strings rather than datetimes — confirm this is intended.
weather_data = pd.read_csv("./../data/csv/weather_data.csv")

# データの前処理

In [7]:
# Show the rows whose temperature value is missing.
condition = weather_data['temperature'].isna()
weather_data.loc[condition]

Unnamed: 0,year,month,day,hour,temperature,precipitation,sunshine,wind_speed,wind_direction,pressure,humidity,weather,cloud
149124,2019,12,3,13,,0.0,,,,,,,
149125,2019,12,3,14,,0.0,,,,,,,
149265,2019,12,9,10,,0.0,,,,,,,
157258,2020,11,6,11,,0.0,0.0,2.3,北西,1023.9,,,


In [13]:
# (Removed a leftover interactive help lookup for pd.DataFrame.reset_index;
# the `?` suffix is IPython-only syntax and adds nothing to the analysis.)