In [1]:
import os, sys
from pathlib import Path
import time

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon

import matplotlib.pyplot as plt
import cartopy.crs as ccrs



In [2]:
INPUT_PATH = Path("../../data/01-raw")
DEST_PATH = Path("../../data/02-processed")

### Read station data

In [3]:
df_list = []
for data_batch in [batch for batch in os.listdir(INPUT_PATH/'station_data') if 'A-' in batch]:
    files_list = [fn for fn in os.listdir(INPUT_PATH/'station_data'/data_batch) if 'Data' in fn]
    for fn in files_list:
        df = pd.read_csv(INPUT_PATH/'station_data'/ data_batch / fn)
        this_station = fn.split(' Daily')[0]
        print(df.columns)
        df = df.dropna(how='all',axis=0) 
        df['date'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']].astype(int)).dt.strftime('%Y-%m-%d')
        df.columns = [col.lower() for col in df.columns]
        df['station'] = this_station
        df = df.replace(999, np.NaN)
        print(f"{this_station}:{df.isnull().values.ravel().sum()} total missing data")
        df = df.rename(columns={'rr':'rainfall'})
        df['rainfall'] = df['rainfall'].replace(-1.0, 0)
        df = df[['station','date']+[col for col in df.columns if col not in ['year', 'month', 'day','date','station']]]
        df_list.append(df)

Index(['YEAR', 'MONTH', 'DAY', 'RAINFALL', 'TMAX', 'TMIN', 'TMEAN', 'RH',
       'WIND_SPEED', 'WIND_DIRECTION'],
      dtype='object')
Cabanatuan:0 total missing data
Index(['YEAR', 'MONTH', 'DAY', 'RAINFALL', 'TMAX', 'TMIN', 'TMEAN', 'RH',
       'WIND_SPEED', 'WIND_DIRECTION'],
      dtype='object')
Dagupan:0 total missing data
Index(['YEAR', 'MONTH', 'DAY', 'RAINFALL', 'TMAX', 'TMIN', 'TMEAN', 'RH',
       'WIND_SPEED', 'WIND_DIRECTION'],
      dtype='object')
Davao City:0 total missing data
Index(['YEAR', 'MONTH', 'DAY', 'RAINFALL', 'TMAX', 'TMIN'], dtype='object')
Dumangas:0 total missing data
Index(['YEAR', 'MONTH', 'DAY', 'RAINFALL', 'TMAX', 'TMIN', 'TMEAN', 'RH',
       'WIND_SPEED', 'WIND_DIRECTION'],
      dtype='object')
Legazpi:0 total missing data
Index(['YEAR', 'MONTH', 'DAY', 'RAINFALL', 'TMAX', 'TMIN', 'TMEAN', 'RH',
       'WIND_SPEED', 'WIND_DIRECTION'],
      dtype='object')
Lumbia-El Salvador:0 total missing data
Index(['YEAR', 'MONTH', 'DAY', 'RAINFALL', 'TMAX', '

In [4]:
alldf = pd.concat(df_list)
alldf

Unnamed: 0,station,date,rainfall,tmax,tmin,tmean,rh,wind_speed,wind_direction
0,Cabanatuan,2008-07-01,0.0,34.3,24.5,29.4,89.0,1.0,120.0
1,Cabanatuan,2008-07-02,0.0,33.1,25.5,29.3,90.0,1.0,180.0
2,Cabanatuan,2008-07-03,11.8,30.3,24.0,27.2,94.0,0.0,0.0
3,Cabanatuan,2008-07-04,1.4,33.8,24.5,29.1,89.0,1.0,180.0
4,Cabanatuan,2008-07-05,0.0,33.8,24.5,29.1,87.0,1.0,180.0
...,...,...,...,...,...,...,...,...,...
3132,Zamboanga,2021-12-27,3.0,35.0,24.0,29.5,79.0,2.0,270.0
3133,Zamboanga,2021-12-28,1.6,33.6,23.2,28.4,84.0,2.0,90.0
3134,Zamboanga,2021-12-29,0.0,32.7,22.7,27.7,85.0,1.0,270.0
3135,Zamboanga,2021-12-30,2.2,33.7,23.5,28.6,79.0,1.0,360.0


In [5]:
alldf.to_csv(DEST_PATH/'station_data.csv', index=False)