In [None]:
import pandas as pd
import os

In [None]:
# --- Input locations ---------------------------------------------------
# Folder that is scanned for the time-series CSV files.
time_series_folder = r'../example/data/time-series'
# CSV file describing the stations.
stations_csv_file = r'../example/data/Stations.csv'

# --- Thresholds --------------------------------------------------------
# 10-minute IR values at or above this threshold are treated as outliers.
max_10m_outlier_threshold = 40
# Minimum number of days requested for yearly stats calculations.
number_of_days_in_year_calcs = 250

In [None]:
df_rains_cols = ['DateTime', 'StationId', 'IR']
# Sorted so that the row order of df_rains does not depend on the (unspecified)
# order in which os.listdir returns the directory entries.
time_series_files = sorted(
    file for file in os.listdir(time_series_folder)
    if '.csv' in file and '~' not in file
)


def _load_station_series(csv_path):
    """Read one station CSV and return its IR values on a regular 10-minute grid.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file providing the columns 'DateTime', 'StationId' and 'IR'.

    Returns
    -------
    pandas.DataFrame or None
        Frame with the columns 'DateTime', 'StationId' and 'IR' (one row per
        10-minute bin), or None when the file contains no data rows.
    """
    raw = pd.read_csv(csv_path)
    if raw.empty:
        # Nothing to resample and no StationId to read: skip instead of failing.
        return None

    raw = raw.set_index('DateTime')
    raw.index = pd.to_datetime(raw.index)
    raw = raw[~raw.index.duplicated(keep='first')]  # Remove duplicates keeping first value

    # Only IR is summed (StationId is not an additive quantity). min_count=1
    # leaves a bin as NaN when it holds no non-NaN reading, so gaps in the
    # series are reported as NaN instead of 0.
    ir_10min = raw['IR'].resample('10min').sum(min_count=1)

    station = ir_10min.reset_index()
    # .iloc is explicit positional access; plain [0] on a datetime-indexed
    # Series relies on a deprecated positional fallback.
    station['StationId'] = raw['StationId'].iloc[0]
    return station[df_rains_cols]


# Collect the per-file frames and concatenate once: growing a DataFrame with
# pd.concat inside the loop copies all previous rows on every iteration.
station_frames = []
for rainsfile in time_series_files:
    station_frame = _load_station_series(os.path.join(time_series_folder, rainsfile))
    if station_frame is not None:
        station_frames.append(station_frame)

if station_frames:
    df_rains = pd.concat(station_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep an empty frame with the expected columns.
    df_rains = pd.DataFrame(columns=df_rains_cols)


In [None]:
# Outlier check: count the 10-minute values at or above the configured threshold.

outlier_flags = df_rains['IR'] >= max_10m_outlier_threshold
total_outliers = int(outlier_flags.sum())
print('There are {0} outliers in time-series.'.format(total_outliers))


In [None]:
# Persist the full 10-minute time series as a parquet file.

rains_parquet_path = 'precipitations_time_series.parquet'
df_rains.to_parquet(rains_parquet_path)


In [None]:
# Save the 10-minute time series with outliers replaced by NaN in a parquet file.

# Series.mask returns a new Series, which DataFrame.assign stores in a new frame
# (df_rains itself is left untouched). Calling mask(..., inplace=True) on
# df['IR'] is a chained in-place modification: with pandas Copy-on-Write it
# changes a temporary object and the frame would be written unfiltered.
df_rains_2 = df_rains.assign(
    IR=df_rains['IR'].mask(df_rains['IR'] >= max_10m_outlier_threshold)
)
df_rains_2.to_parquet('precipitations_time_series_filtered.parquet')


In [None]:
# Daily rains computed from the 10-minute values below max_10m_outlier_threshold.

ir_values = df_rains['IR']

# Rows with a reading below the threshold. The comparison is False for NaN, so
# gaps are excluded here together with the outliers.
df_rains_valid = df_rains[ir_values < max_10m_outlier_threshold]
df_daily_rains = (
    df_rains_valid
    .groupby([df_rains_valid['DateTime'].dt.date, 'StationId'])['IR']
    .sum()
    .reset_index()
)

# NaN-aware variant: only the outliers are removed and the gaps (NaN) are kept,
# so sum(skipna=False) yields NaN for a day/station with at least one missing
# 10-minute value. Filtering with `IR < threshold` here would drop the NaN rows
# before grouping and turn skipna=False into a no-op.
df_rains_no_outliers = df_rains[~(ir_values >= max_10m_outlier_threshold)]
df_daily_rains_na = (
    df_rains_no_outliers
    .groupby([df_rains_no_outliers['DateTime'].dt.date, 'StationId'])['IR']
    .agg(lambda day_ir: day_ir.sum(skipna=False))
    .reset_index()
)
df_daily_rains.to_parquet('precipitations_daily_time_series.parquet')


In [None]:
def _count_raining_days(daily_ir):
    """Count the daily IR values strictly greater than 0.2."""
    return daily_ir[daily_ir > 0.2].count()


# Group the daily totals by calendar year and station.
year_of_day = pd.to_datetime(df_daily_rains['DateTime']).dt.year
yearly_groups = df_daily_rains.groupby([year_of_day, 'StationId'])

# Per group: first/last date and number of dates, IR total and raining-day count.
yearly_agg = yearly_groups.agg({
    'DateTime': ['min', 'max', 'count'],
    'IR': ['sum', _count_raining_days],
})

# Flatten the two-level column header, then give every column its final label.
df_stats_year = yearly_agg.reset_index().droplevel(0, axis=1)
df_stats_year.columns = [
    'year',
    'StationID',
    'Min Date',
    'Max Date',
    'Number of days with data',
    'IR Sum',
    'Raining days',
]

In [None]:
# Load the stations table from the configured CSV file.
df_stations = pd.read_csv(filepath_or_buffer=stations_csv_file)

In [None]:
# Stations table indexed by its 'Id' column so it lines up with the per-station statistics.
stations_by_id = df_stations.set_index('Id')

# Per-station minimum and maximum of every remaining column of the daily table.
daily_extremes = df_daily_rains.groupby('StationId').agg(['min', 'max']).droplevel(0, axis=1)

# Only years with more than number_of_days_in_year_calcs days of data enter the yearly means.
has_enough_days = df_stats_year['Number of days with data'] > number_of_days_in_year_calcs
complete_years = df_stats_year[has_enough_days]
mean_yearly_rain = complete_years[['StationID', 'IR Sum']].groupby('StationID').mean()
mean_raining_days = complete_years[['StationID', 'Raining days']].groupby('StationID').mean()

# Join everything side by side on the station id and relabel the columns by position.
df_stations_with_stats = pd.concat(
    [stations_by_id, daily_extremes, mean_yearly_rain, mean_raining_days],
    axis=1,
).reset_index()
df_stations_with_stats.columns = [
    'StationId',
    'Name',
    'Latitude',
    'Longitude',
    'Elevation',
    'Series Start',
    'Series end',
    'Min day IR',
    'Max day IR',
    'Yearly rainfall',
    'Number of days with rainfall over 0.2 mm',
]

In [None]:
# Write the per-station statistics table to a parquet file.
stations_stats_parquet_path = 'stations_with_stats.parquet'
df_stations_with_stats.to_parquet(stations_stats_parquet_path)