In [1]:
import io
from io import BytesIO
from zipfile import ZipFile
import pandas as pd
import requests
import pytz, datetime

from googletrans import Translator
translator = Translator()

In [2]:
# Location of the historical Ile-de-France energy CSV published on ai4impact.
url = "https://ai4impact.org/P003/historical/energy-ile-de-france.csv"

In [3]:
# Download the ai4impact CSV and load it as a two-column frame.
r = requests.get(url)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
r.raise_for_status()
data = r.content.decode('utf8')
# The file is read without a header row, so the column names are supplied here.
ac_df = pd.read_csv(io.StringIO(data), header = None, names=['datetime','wind'])

In [4]:
# Parse the timestamp strings of the ai4impact series into pandas datetimes.
ac_df = ac_df.assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))

In [5]:
# French column headers to keep when reading the RTE eCO2mix files.
required_columns = [
    # row metadata
    'Périmètre',
    'Nature',
    'Date',
    'Heures',
    # measurements
    'Consommation',
    'Thermique',
    'Eolien',
    'Solaire',
    'Hydraulique',
    'Bioénergies',
    'Ech. physiques',
]

In [6]:
# RTE eCO2mix archives for Ile-de-France: the 2017 and 2018 yearly files,
# followed by the "En-cours-Consolide" and "En-cours-TR" files.
url_list = [
    "https://eco2mix.rte-france.com/download/eco2mix/eCO2mix_RTE_Ile-de-France_" + suffix + ".zip"
    for suffix in (
        "Annuel-Definitif_2017",
        "Annuel-Definitif_2018",
        "En-cours-Consolide",
        "En-cours-TR",
    )
]

In [7]:
# Fixed French -> English header mapping. It reproduces the names the online
# translation produced for this notebook (including 'Dated' and 'Ech. physical',
# which later cells rely on) so that loading the data no longer depends on a
# live translation request.
column_translations = {
    'Périmètre': 'Perimeter',
    'Nature': 'Nature',
    'Date': 'Dated',
    'Heures': 'Hours',
    'Consommation': 'Consumption',
    'Thermique': 'Thermal',
    'Eolien': 'Wind',
    'Solaire': 'Solar',
    'Hydraulique': 'Hydraulic',
    'Bioénergies': 'Bioenergies',
    'Ech. physiques': 'Ech. physical',
}

df_list = []

# The loop variable is `zip_url` rather than `url`, so the ai4impact `url`
# defined in an earlier cell is not overwritten when this cell runs.
for zip_url in url_list:
    response = requests.get(zip_url)
    # Fail loudly on an HTTP error instead of handing an error page to ZipFile.
    response.raise_for_status()

    with ZipFile(BytesIO(response.content)) as zf:
        members = zf.namelist()
        for item in members:
            print("File in zip: "+  item)

        # Take the first archive member with ".xls" in its name; it is read
        # as tab-separated text by read_table below.
        matches = [s for s in members if ".xls" in s]
        if not matches:
            raise ValueError("No .xls member found in archive: " + zip_url)

        with zf.open(matches[0]) as member:
            # head(-1) drops the last row of every file.
            # NOTE(review): presumably a non-data footer line; confirm.
            tmp_df = pd.read_table(member, index_col=False, usecols = required_columns, encoding='ISO-8859-1').head(-1)

    df_list.append(tmp_df)

df = pd.concat(df_list).reset_index(drop=True)

# Headers without an entry in the mapping are kept unchanged.
translated_columns = [column_translations.get(name, name) for name in df.columns]
df.columns = translated_columns

# Alternative (disabled): treat every timestamp as French summer time, i.e. UTC+2.
#df['datetime'] = pd.to_datetime(df['Dated'] + " " + df['Hours']) - datetime.timedelta(hours=2)

File in zip: eCO2mix_RTE_Ile-de-France_Annuel-Definitif_2017.xls
File in zip: eCO2mix_RTE_Ile-de-France_Annuel-Definitif_2018.xls
File in zip: eCO2mix_RTE_Ile-de-France_En-cours-Consolide.xls
File in zip: eCO2mix_RTE_Ile-de-France_En-cours-TR.xls


In [8]:
# Interpret the RTE date + hour strings as Europe/Paris wall-clock time and
# convert them to timezone-aware UTC timestamps.
# Known to be problematic: every value is localized with is_dst=True.
# NOTE(review): this is presumably why some UTC timestamps appear twice around
# daylight-saving transitions; confirm against pytz's `localize` behaviour.

local = pytz.timezone ("Europe/Paris")


def _paris_to_utc(stamp):
    """Parse a 'YYYY-MM-DD HH:MM' string as Paris local time and return it in UTC."""
    naive = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M")
    return local.localize(naive, is_dst=True).astimezone(pytz.utc)


df['datetime'] = (df['Dated'] + " " + df['Hours']).apply(_paris_to_utc)

In [9]:
# Count rows per UTC timestamp; a count above 1 means two rows share a timestamp.
df['datetime'].value_counts()

2017-03-26 00:15:00+00:00    2
2018-03-25 00:45:00+00:00    2
2020-03-29 00:30:00+00:00    2
2020-03-29 00:45:00+00:00    2
2018-03-25 00:00:00+00:00    2
                            ..
2018-01-11 23:45:00+00:00    1
2017-07-11 07:30:00+00:00    1
2019-07-06 14:00:00+00:00    1
2017-10-14 14:30:00+00:00    1
2017-12-31 22:00:00+00:00    1
Name: datetime, Length: 124112, dtype: int64

In [10]:
# Keep only the first row for each UTC timestamp.
df = df[~df.duplicated(subset=['datetime'])]

In [11]:
# Drop the timezone while keeping the UTC clock value. Using the datetime
# accessor replaces the earlier string round-trip (cast to str, split on "+"),
# which depended on how the offset was rendered as text.
df['datetime'] = pd.to_datetime(df['datetime'], utc=True).dt.tz_convert(None)

In [12]:
# Make sure the 'datetime' column holds pandas datetimes.
df = df.assign(datetime=pd.to_datetime(df['datetime']))

In [13]:
#df = df[['datetime','Wind']]

In [14]:
# Discard rows whose consumption is the literal marker "ND".
df = df.loc[df['Consumption'].ne("ND")]

In [16]:
# Convert the measurement columns to numbers and scale them by 250.
# NOTE(review): 250 is presumably a unit conversion that lines the RTE readings
# up with the ai4impact series; confirm.
# Re-running this cell multiplies the values by 250 again.
measure_columns = ['Consumption', 'Thermal', 'Wind', 'Solar', 'Hydraulic', 'Bioenergies','Ech. physical']
df[measure_columns] = df[measure_columns].apply(pd.to_numeric) * 250

In [17]:
# Show the English column names now present on the frame.
df.columns

Index(['Perimeter', 'Nature', 'Dated', 'Hours', 'Consumption', 'Thermal',
       'Wind', 'Solar', 'Hydraulic', 'Bioenergies', 'Ech. physical',
       'datetime'],
      dtype='object')

In [18]:
# Display the cleaned RTE frame.
df

Unnamed: 0,Perimeter,Nature,Dated,Hours,Consumption,Thermal,Wind,Solar,Hydraulic,Bioenergies,Ech. physical,datetime
0,Ile-de-France,Données définitives,2017-01-01,00:00,2999750.0,133500.0,0.0,0.0,2000.0,39500.0,2824500.0,2016-12-31 23:00:00
1,Ile-de-France,Données définitives,2017-01-01,00:15,,,,,,,,2016-12-31 23:15:00
2,Ile-de-France,Données définitives,2017-01-01,00:30,2942500.0,134000.0,0.0,0.0,2000.0,39250.0,2767250.0,2016-12-31 23:30:00
3,Ile-de-France,Données définitives,2017-01-01,00:45,,,,,,,,2016-12-31 23:45:00
4,Ile-de-France,Données définitives,2017-01-01,01:00,2891750.0,134000.0,0.0,0.0,2000.0,40000.0,2715750.0,2017-01-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
124123,Ile-de-France,Données temps réel,2020-07-16,22:45,,,,,,,,2020-07-16 20:45:00
124124,Ile-de-France,Données temps réel,2020-07-16,23:00,,,,,,,,2020-07-16 21:00:00
124125,Ile-de-France,Données temps réel,2020-07-16,23:15,,,,,,,,2020-07-16 21:15:00
124126,Ile-de-France,Données temps réel,2020-07-16,23:30,,,,,,,,2020-07-16 21:30:00


In [77]:
# Hourly wind statistics: floor each timestamp to the hour and aggregate the
# readings that fall into it.
# Only 'Wind' is selected, as a one-column frame, so the result keeps two-level
# columns such as ('Wind', 'sum'). Aggregating the raw 'datetime' column as
# well is meaningless, and summing datetimes fails on pandas versions that no
# longer drop such columns silently.
# Note: 'sum' yields 0.0 for hours whose readings are all missing.
agg_df = df[['Wind']].groupby([df['datetime'].dt.floor('H')]).agg(['mean','sum','std'])

In [78]:
# Flatten the two-level (column, statistic) header into names like 'Wind_sum'.
agg_df.columns = ['_'.join(levels) for levels in agg_df.columns]

In [79]:
# Move the hourly timestamp index back into an ordinary column.
agg_df = agg_df.reset_index(drop=False)

In [80]:
# Display the hourly wind aggregates.
agg_df

Unnamed: 0,datetime,Wind_mean,Wind_sum,Wind_std
0,2016-12-31 23:00:00,0.0,0.0,0.0
1,2017-01-01 00:00:00,0.0,0.0,0.0
2,2017-01-01 01:00:00,0.0,0.0,0.0
3,2017-01-01 02:00:00,0.0,0.0,0.0
4,2017-01-01 03:00:00,0.0,0.0,0.0
...,...,...,...,...
31013,2020-07-16 17:00:00,,0.0,
31014,2020-07-16 18:00:00,,0.0,
31015,2020-07-16 19:00:00,,0.0,
31016,2020-07-16 20:00:00,,0.0,


In [85]:
# Left-join the ai4impact series onto the hourly RTE aggregates, then keep only
# the timestamp and the two wind totals for a side-by-side comparison.
check_df = pd.merge(agg_df, ac_df, how='left', on='datetime')
check_df = check_df[['datetime','Wind_sum','wind']]

In [86]:
# Label each wind total with the source it came from.
check_df = check_df.rename(columns={'Wind_sum': 'wind_from_RTE', 'wind': 'wind_from_ai4impact'})

In [87]:
# Last 25 hours for which ai4impact provides a value.
check_df.dropna(subset=['wind_from_ai4impact']).tail(25)

Unnamed: 0,datetime,wind_from_RTE,wind_from_ai4impact
30981,2020-07-15 09:00:00,6500.0,6500.0
30982,2020-07-15 10:00:00,7000.0,7000.0
30983,2020-07-15 11:00:00,9500.0,9500.0
30984,2020-07-15 12:00:00,11000.0,11000.0
30985,2020-07-15 13:00:00,10250.0,10250.0
30986,2020-07-15 14:00:00,12250.0,12250.0
30987,2020-07-15 15:00:00,16750.0,16750.0
30988,2020-07-15 16:00:00,14250.0,14250.0
30989,2020-07-15 17:00:00,15500.0,15500.0
30990,2020-07-15 18:00:00,12250.0,12250.0


In [19]:
# Forecast file names: eight base names, followed by the same eight with a "-b" suffix.
_base_names = [
    "guitrancourt",
    "lieusaint",
    "lvs-pussay",
    "parc-du-gatinais",
    "arville",
    "boissy-la-riviere",
    "angerville-1",
    "angerville-2",
]
wf_list = _base_names + [name + "-b" for name in _base_names]

In [20]:
def _fetch_forecast_csv(csv_url):
    """Download one ai4impact forecast file and return its table as a DataFrame.

    The downloaded text is split on the marker "UTC" followed by a newline, and
    the segment right after the first marker is parsed as CSV.

    Raises:
        requests.HTTPError: if the download returns an HTTP error status.
        ValueError: if the marker is not present in the file.
    """
    response = requests.get(csv_url)
    response.raise_for_status()
    parts = response.content.decode('utf8').split("UTC\n")
    if len(parts) < 2:
        raise ValueError('No "UTC" header marker found in ' + csv_url)
    return pd.read_csv(io.StringIO(parts[1]))


forecast_df_list = []

for forecast in wf_list:

    # Each forecast has a historical file and a current file with the same layout.
    hist_tmp_df = _fetch_forecast_csv("https://ai4impact.org/P003/historical/" + forecast +".csv")
    current_tmp_df = _fetch_forecast_csv("https://ai4impact.org/P003/" + forecast +".csv")

    # Stack both files and prefix the value columns with the forecast name so
    # that every forecast contributes its own pair of columns.
    # NOTE(review): if the historical and current files overlap in time, the
    # left merge below will duplicate rows; confirm, and de-duplicate on
    # 'datetime' if needed.
    tmp_df = pd.concat([hist_tmp_df,current_tmp_df]).reset_index(drop=True).rename(columns={'Speed(m/s)':forecast + '_Speed(m/s)', 'Direction (deg N)':forecast + '_Direction (deg N)'})

    # 'Time' carries a literal "UTC" tag; strip it before parsing.
    tmp_df['datetime'] = pd.to_datetime(tmp_df['Time'].str.replace("UTC", ""))

    tmp_df = tmp_df.drop(columns=['Time'])

    forecast_df_list.append(tmp_df)

# Work on a copy so the RTE frame itself stays unchanged.
main_df = df.copy()

for forecast_df in forecast_df_list:
    main_df = main_df.merge(forecast_df, how='left', on='datetime')

In [21]:
# Display the RTE data joined with the speed/direction columns of every forecast.
main_df

Unnamed: 0,Perimeter,Nature,Dated,Hours,Consumption,Thermal,Wind,Solar,Hydraulic,Bioenergies,...,parc-du-gatinais-b_Speed(m/s),parc-du-gatinais-b_Direction (deg N),arville-b_Speed(m/s),arville-b_Direction (deg N),boissy-la-riviere-b_Speed(m/s),boissy-la-riviere-b_Direction (deg N),angerville-1-b_Speed(m/s),angerville-1-b_Direction (deg N),angerville-2-b_Speed(m/s),angerville-2-b_Direction (deg N)
0,Ile-de-France,Données définitives,2017-01-01,00:00,2999750.0,133500.0,0.0,0.0,2000.0,39500.0,...,,,,,,,,,,
1,Ile-de-France,Données définitives,2017-01-01,00:15,,,,,,,...,,,,,,,,,,
2,Ile-de-France,Données définitives,2017-01-01,00:30,2942500.0,134000.0,0.0,0.0,2000.0,39250.0,...,,,,,,,,,,
3,Ile-de-France,Données définitives,2017-01-01,00:45,,,,,,,...,,,,,,,,,,
4,Ile-de-France,Données définitives,2017-01-01,01:00,2891750.0,134000.0,0.0,0.0,2000.0,40000.0,...,2.51,225.0,2.52,225.0,2.73,226.0,2.74,228.0,2.74,228.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124060,Ile-de-France,Données temps réel,2020-07-16,22:45,,,,,,,...,,,,,,,,,,
124061,Ile-de-France,Données temps réel,2020-07-16,23:00,,,,,,,...,3.17,354.0,3.17,354.0,3.25,351.0,3.25,353.0,3.25,353.0
124062,Ile-de-France,Données temps réel,2020-07-16,23:15,,,,,,,...,,,,,,,,,,
124063,Ile-de-France,Données temps réel,2020-07-16,23:30,,,,,,,...,,,,,,,,,,
