# Example on how to add data to the dataframe (database)

Here we need to add new weather columns to the dataframe. Specifically:

```python
cols = [
    "precipitation",
    "wind_gusts_10m",
    "cloud_cover",
    'shortwave_radiation'
]
```

These data can be obtained from the same API, so the data collection is straightforward. 
First, we update `variables_standard` in the `OpenMeteo` class to add the new quantities. This ensures that they are downloaded in all future data updates. However, they also need to be added to the existing dataframe itself. This can be accomplished as follows. 

In [None]:
import pandas as pd
from datetime import datetime, timedelta
from glob import glob
from scipy.stats import fisk_gen

# Load the previous snapshot of the database; the new weather columns are
# added to this dataframe in the cells below.
df_original = pd.read_parquet('../database/prev_latest.parquet')

In [None]:
# Show the last rows of the loaded dataframe as a quick sanity check.
df_original.tail()

In [None]:
from data_modules.collect_data_openmeteo import (
    get_weather_data_from_api_forecast,
    get_weather_data_from_api,
    locations,
    OpenMeteo,
)

# Rows without a single missing value delimit the period already covered by
# the database: its first and last index labels are the date bounds.
complete_rows = df_original.dropna(how='any')
start_date = pd.Timestamp(complete_rows.first_valid_index())
end_date = pd.Timestamp(complete_rows.last_valid_index())
today = pd.Timestamp(datetime.today())

# Historical weather is requested from the first complete row up to 12 hours
# before now.
# NOTE(review): the 12-hour margin presumably avoids the not-yet-published
# tail of the historical API -- confirm.
df_om_hist = get_weather_data_from_api(start_date, today - timedelta(hours=12), locations)



In [None]:
# Download the forecast part and make sure it has the same columns (in the
# same order) as the historical part; a mismatch is only reported, not raised.
df_om_forecast = get_weather_data_from_api_forecast(locations=locations)
columns_match = df_om_forecast.columns.equals(df_om_hist.columns)
if not columns_match:
    print("! Error. Column mismatch between historical and forecasted weather!")

# Stack history and forecast (forecast reordered to the historical column
# layout), keep the last entry for every timestamp so that forecast rows win
# over overlapping historical rows, and index the result by 'date'.
df_om = (
    pd.concat(
        [df_om_hist, df_om_forecast[df_om_hist.columns]],
        ignore_index=True,
    )
    .drop_duplicates(subset='date', keep='last')
    # df_om = process_weather_quantities(df_om, locations)
    .set_index('date')
)

In [None]:
# Display the combined (historical + forecast) weather dataframe.
df_om

In [None]:
# Optional (disabled): store the raw weather table on its own.
# NOTE(review): '../database'+'db_openweather.parquet' has no path separator
# and would write '../databasedb_openweather.parquet' -- fix before enabling.
# df_om.to_parquet('../database'+'db_openweather.parquet',engine='pyarrow')
    

In [None]:
from data_modules.collect_data_openmeteo import OpenMeteo

# Remove every existing weather column from the database so that the freshly
# downloaded ones can be joined without name clashes. A column counts as a
# weather column when its name contains one of the OpenMeteo variable names.
stale_weather_cols = [
    col
    for col in df_original.columns
    if any(var in str(col) for var in OpenMeteo.variables_standard)
]
# Single in-place drop (keeps other references to df_original valid) instead
# of one drop per matching column.
df_original.drop(columns=stale_weather_cols, inplace=True)

In [None]:
# Inspect the dataframe after the old weather columns have been removed.
df_original

In [None]:
# Attach the freshly downloaded weather columns. DataFrame.join aligns on the
# index and, by default, keeps all rows of df_original (left join).
# Overlapping column names would make join raise, which is why the old
# weather columns were dropped beforehand.
df_original = df_original.join(df_om)

In [None]:
# Inspect the dataframe with the new weather columns attached.
df_original

# Add SMARD columns

In [None]:
import pandas as pd
from datetime import datetime, timedelta
# df_original = pd.read_parquet('../database/latest.parquet')

In [None]:
# Inspect the dataframe before the SMARD columns are added.
df_original

In [None]:
# The first and last rows without any missing value define the period for
# which SMARD data is requested.
complete_rows = df_original.dropna(how='any')
start_date = pd.Timestamp(complete_rows.first_valid_index())
end_date = pd.Timestamp(complete_rows.last_valid_index())
today = pd.Timestamp(datetime.today())

In [None]:
from data_modules.collect_data_smard import DataEnergySMARD
# Query SMARD for the period spanned by the complete rows of df_original.
o_smard = DataEnergySMARD(start_date=start_date, end_date=end_date)
# NOTE(review): presumably returns one row per timestamp with a 'date' column
# (it is used as the index in the next cell) -- confirm against DataEnergySMARD.
df_smard_flow = o_smard.get_international_flow()
# Display the downloaded table.
df_smard_flow

In [None]:
# Index the flow table by 'date' so it can be merged with df_original on the
# index. This mutates df_smard_flow in place, so re-running the cell fails
# because the 'date' column no longer exists.
df_smard_flow.set_index('date',inplace=True)

In [None]:

# Add every SMARD flow column that the database does not contain yet, then
# save the result. Both frames are matched on their index and the left merge
# keeps all rows of df_original.
existing_cols = set(df_original.columns)
new_cols = [col for col in df_smard_flow.columns if col not in existing_cols]
for col in new_cols:
    print(f"Adding...{col}")
if new_cols:
    # One merge for all missing columns instead of one merge per column.
    df_original = df_original.merge(
        df_smard_flow[new_cols], how='left', left_index=True, right_index=True
    )
df_original.to_parquet('../database/latest.parquet')

In [None]:
# NOTE(review): `d` is not defined anywhere in the visible notebook, so this
# cell raises NameError when run; it looks like a leftover of an unfinished
# cell -- confirm and remove.
d

# Add new SMARD data (after aggregation change) and split to history and forecast

In [None]:
df_hist = pd.read_parquet('../database/history.parquet')

# Only rows without any missing value delimit the period to download again.
complete_hist = df_hist.dropna(how='any')
begin_ts = pd.Timestamp(complete_hist.first_valid_index())
# The end of the range is one hour past the last complete row.
last_ts = pd.Timestamp(complete_hist.last_valid_index()) + timedelta(hours=1)
print(last_ts, begin_ts)
df_hist

In [None]:
from data_modules.collect_data_smard import DataEnergySMARD

# Download the three SMARD tables for the period covered by the history.
o_smard = DataEnergySMARD(start_date=begin_ts, end_date=last_ts)
df_smard_flow = o_smard.get_international_flow()
df_smard_gen_forecasted = o_smard.get_forecasted_generation()
df_smard_con_forecasted = o_smard.get_forecasted_consumption()

# Combine them on 'date'. Outer merges keep every timestamp that appears in
# at least one table; the merged table is then indexed by 'date'.
df_smard = (
    df_smard_flow
    .merge(df_smard_gen_forecasted, left_on='date', right_on='date', how='outer')
    .merge(df_smard_con_forecasted, left_on='date', right_on='date', how='outer')
    .set_index('date')
)

In [None]:
# Replace the SMARD part of the history with the freshly downloaded data.
#
# Columns are removed from df_hist when they
#   * are about to be re-added from df_smard (except 'DA_auction_price',
#     which is kept), or
#   * are old '_flow' columns that are no longer needed.
smard_cols = set(df_smard.columns)
obsolete_cols = [
    col
    for col in df_hist.columns
    if (col in smard_cols and col != 'DA_auction_price') or '_flow' in col
]
# Single in-place drop instead of one drop per column.
df_hist.drop(columns=obsolete_cols, inplace=True)

# Add the new columns. Assignment aligns on the index: timestamps missing in
# df_smard become NaN, timestamps present only in df_smard are ignored.
# NOTE(review): if df_smard ever contains 'DA_auction_price', this loop
# overwrites the column that was kept above -- confirm that it never does.
for col in df_smard.columns:
    df_hist[col] = df_smard[col]
df_hist

In [None]:
# Persist the updated history; this overwrites the file it was loaded from.
df_hist.to_parquet('../database/history.parquet')

# Add EPEXSPOT DATA FROM FILES TO DATAFRAME 

In [None]:
import pandas as pd
from glob import glob
from datetime import datetime, timedelta

In [None]:
df_hist = pd.read_parquet('../database/history.parquet')

# Bounds of the fully populated part of the history (rows without any NaN);
# the upper bound is shifted one hour past the last complete row.
complete_hist = df_hist.dropna(how='any')
begin_ts = pd.Timestamp(complete_hist.first_valid_index())
last_ts = pd.Timestamp(complete_hist.last_valid_index()) + timedelta(hours=1)
print(last_ts, begin_ts)
df_hist

In [None]:
# Build a day-ahead price table, indexed by UTC timestamp, from the raw
# EPEX SPOT CSV files.
raw_datadir = "../data/DE-LU/DayAhead_MRC/"
# Sorted so that the outcome does not depend on the file-system listing order.
files = sorted(glob(raw_datadir + '*.csv'))
# Fail before doing any work when there is nothing to load.
if not files:
    raise FileNotFoundError(f"File in {raw_datadir} does not exist")
# Read all files and concatenate once instead of growing a frame in a loop.
df_da_upd = pd.concat([pd.read_csv(file) for file in files])
df_da_upd['date'] = pd.to_datetime(df_da_upd['date'])
# A stable sort keeps rows with equal timestamps in file order, so
# keep='first' below picks a reproducible row.
df_da_upd.sort_values(by='date', kind='mergesort', inplace=True)
df_da_upd.drop_duplicates(subset='date', keep='first', inplace=True)
# for agreement with energy-charts
# NOTE(review): 'Etc/GMT-2' is a fixed UTC+02:00 offset (the Etc zones use an
# inverted sign) and ignores daylight saving time -- confirm this is intended
# for winter-time data as well.
df_da_upd['date'] = df_da_upd['date'].dt.tz_localize('Etc/GMT-2').dt.tz_convert('UTC') #
df_da_upd.rename(columns={'Price':'DA_auction_price'},inplace=True)
# we do not need other data for now
df_da_upd = df_da_upd[['date','DA_auction_price']].set_index('date')

In [None]:
# Fill gaps in the history with the EPEX values. fillna with a DataFrame
# aligns on index and column name, so only existing NaN cells of
# 'DA_auction_price' are filled; no rows or columns are added.
# NOTE(review): assumes the index of df_hist is UTC-aware like df_da_upd's
# index -- confirm, otherwise nothing will align.
df_hist = df_hist.fillna(df_da_upd)

In [None]:
# Inspect the history after the day-ahead prices have been filled in.
df_hist


In [None]:
# Persist the updated history; this overwrites the file it was loaded from.
df_hist.to_parquet('../database/history.parquet')