In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta

In [2]:
#The API errors when the whole range is requested in one go, so the download
#is split into 2 week periods; this is the overall range those periods cover
start_date, end_date = datetime(2024, 3, 25), datetime(2024, 5, 24)

#Accumulators that the download loop appends to
intensity_data, generation_data = [], []


In [3]:
#Loop in 14-day chunks and pull carbon intensity and generation data, which
#are requested with the same from/to URL layout
API_ROOT = 'https://api.carbonintensity.org.uk'
REQUEST_TIMEOUT = 30  #seconds; without a timeout requests.get can wait forever

#Start from empty lists so that re-running this cell does not append the same
#records a second time
intensity_data.clear()
generation_data.clear()

current_date = start_date

while current_date < end_date:
    #Pulls 14 days or until end date is reached
    next_date = min(current_date + timedelta(days=14), end_date)

    #Format into ISO 8601
    start_str = current_date.strftime('%Y-%m-%dT%H:%MZ')
    end_str = next_date.strftime('%Y-%m-%dT%H:%MZ')

    #Pull carbon intensity
    url_intensity = f'{API_ROOT}/intensity/{start_str}/{end_str}'
    r_intensity = requests.get(url_intensity, timeout=REQUEST_TIMEOUT)

    #Error handling: report the status and carry on with the next chunk
    if r_intensity.status_code == 200:
        intensity_data.extend(r_intensity.json()['data'])
    else:
        print('Error getting intensity:', r_intensity.status_code)

    #Pull generation data
    url_gen = f'{API_ROOT}/generation/{start_str}/{end_str}'
    r_gen = requests.get(url_gen, timeout=REQUEST_TIMEOUT)

    #Error handling
    if r_gen.status_code == 200:
        for entry in r_gen.json()['data']:
            #One flat row per period: its start time plus one column per fuel
            #holding that fuel's percentage of the generation mix
            row = {'from': entry['from']}
            row.update({fuel['fuel']: fuel['perc'] for fuel in entry['generationmix']})
            generation_data.append(row)
    else:
        print('Error getting generation mix:', r_gen.status_code)

    current_date = next_date  #Loop the next 14 days

In [4]:
#Need to convert to dataframes, each indexed by the start of its period
intensity = pd.json_normalize(intensity_data)
intensity['from'] = pd.to_datetime(intensity['from'])
intensity = intensity.set_index('from')

generation = pd.DataFrame(generation_data)
generation['from'] = pd.to_datetime(generation['from'])
generation = generation.set_index('from')

#The records were downloaded in consecutive windows where one window's end is
#the next window's start, so the same period may have been returned twice.
#Keep one row per timestamp; with repeated labels on both sides the join below
#would pair every copy with every copy and multiply those rows.
intensity = intensity[~intensity.index.duplicated(keep='first')]
generation = generation[~generation.index.duplicated(keep='first')]

# Merge both based on the datetime index, keeping only periods present in both
combined = intensity.join(generation, how='inner')

combined

Unnamed: 0_level_0,to,intensity.forecast,intensity.actual,intensity.index,biomass,coal,imports,gas,nuclear,other,hydro,solar,wind
from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2024-03-24 23:30:00+00:00,2024-03-25T00:00Z,172,163,moderate,6.5,2.1,27.7,29.8,18.9,0,2.0,0.0,12.9
2024-03-25 00:00:00+00:00,2024-03-25T00:30Z,159,163,moderate,6.2,1.8,26.6,30.1,18.2,0,1.9,1.1,14.0
2024-03-25 00:30:00+00:00,2024-03-25T01:00Z,156,151,moderate,6.1,1.3,25.6,30.4,18.4,0,1.9,0.0,16.4
2024-03-25 01:00:00+00:00,2024-03-25T01:30Z,155,138,moderate,5.1,1.0,26.0,28.4,18.6,0,1.9,0.0,19.0
2024-03-25 01:30:00+00:00,2024-03-25T02:00Z,143,133,moderate,4.9,1.0,26.6,25.9,18.3,0,1.8,0.0,21.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-23 21:30:00+00:00,2024-05-23T22:00Z,148,136,moderate,6.0,0.0,13.9,34.1,18.7,0,1.5,0.0,25.9
2024-05-23 22:00:00+00:00,2024-05-23T22:30Z,143,136,moderate,6.2,0.0,14.4,33.5,19.4,0,1.1,0.0,25.4
2024-05-23 22:30:00+00:00,2024-05-23T23:00Z,155,139,moderate,6.3,0.0,15.1,34.0,19.7,0,1.0,0.0,23.8
2024-05-23 23:00:00+00:00,2024-05-23T23:30Z,138,142,moderate,6.4,0.0,15.4,34.9,20.1,0,0.7,0.1,22.3


In [5]:
#Rows are identified by the start of their time frame (the index), so the
#column holding the end of the time frame is not needed
combined = combined.drop(columns='to')

In [6]:
#Plan: group the fuels into fossil-like and 'clean sources'
#(biomass counts as fossil-like for intensity purposes, nuclear as clean)
#Before that, check how large imports and other are, as would like to drop them
for label, column in (('Imports', 'imports'), ('Other', 'other')):
    share = combined[column]
    print(f'{label} mean: ', share.mean(), ' max: ', share.max(),
          ' std: ', share.std())


Imports mean:  14.243968199101273  max:  53.0  std:  4.992652528335805
Other mean:  0.0  max:  0  std:  0.0


In [7]:
#'other' can go with no concern; imports are significant so they stay as
#their own column
FOSSIL_LIKE = ['biomass', 'coal', 'gas']            #fossil fuels + biomass
LOW_CARBON = ['wind', 'solar', 'hydro', 'nuclear']  #renewables + nuclear

#Add the two group totals, then remove the columns they were built from
combined = (
    combined
    .assign(Fossil_like_share=combined[FOSSIL_LIKE].sum(axis=1),
            Low_carbon_share=combined[LOW_CARBON].sum(axis=1))
    .drop(columns=FOSSIL_LIKE + LOW_CARBON + ['other'])
)

combined

Unnamed: 0_level_0,intensity.forecast,intensity.actual,intensity.index,imports,Fossil_like_share,Low_carbon_share
from,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-03-24 23:30:00+00:00,172,163,moderate,27.7,38.4,33.8
2024-03-25 00:00:00+00:00,159,163,moderate,26.6,38.1,35.2
2024-03-25 00:30:00+00:00,156,151,moderate,25.6,37.8,36.7
2024-03-25 01:00:00+00:00,155,138,moderate,26.0,34.5,39.5
2024-03-25 01:30:00+00:00,143,133,moderate,26.6,31.8,41.7
...,...,...,...,...,...,...
2024-05-23 21:30:00+00:00,148,136,moderate,13.9,40.1,46.1
2024-05-23 22:00:00+00:00,143,136,moderate,14.4,39.7,45.9
2024-05-23 22:30:00+00:00,155,139,moderate,15.1,40.3,44.5
2024-05-23 23:00:00+00:00,138,142,moderate,15.4,41.3,43.2


In [8]:
#Need to pull data for weather, justification's in medium
#The place itself matters less than having a latitude and longitude to query
#Each site is (name, latitude, longitude, variable to request there)
_SITES = (
    ('inverness', 57.4778, -4.2247, 'windspeed_10m'),
    ('hull', 53.7443, -0.3324, 'windspeed_10m'),
    ('aberystwyth', 52.4153, -4.0829, 'windspeed_10m'),
    ('london', 51.5074, -0.1278, 'temperature_2m'),
    ('manchester', 53.4808, -2.2426, 'temperature_2m'),
    ('edinburgh', 55.9533, -3.1883, 'temperature_2m'),
    ('brighton', 50.8225, -0.1372, 'cloudcover'),
    ('plymouth', 50.3755, -4.1427, 'cloudcover'),
    ('cambridge', 52.2053, 0.1218, 'cloudcover'),
)

#Expand into the list of dictionaries used when requesting the weather
locations = [
    {'name': name, 'lat': lat, 'lon': lon, 'vars': [var]}
    for name, lat, lon, var in _SITES
]

In [9]:
start = '2024-03-25'
end= '2024-05-24'

#Create empty dicts before looping
wind_data = {}
temp_data = {}
cloud_data = {}

#Hourly timestamps shared by all locations, taken from the first response
times = None

for loc in locations:
    #Build the query string from this location's coordinates and variables
    var_string = ','.join(loc['vars'])
    url = (
        f'https://archive-api.open-meteo.com/v1/archive?'
        f"latitude={loc['lat']}&longitude={loc['lon']}&"
        f'start_date={start}&end_date={end}&'
        f'hourly={var_string}'
    )

    #A timeout stops a stalled connection from hanging the notebook
    resp = requests.get(url, timeout=30).json()

    #A rejected request comes back as a body with 'error' and 'reason' and no
    #'hourly' section, so stop here with the reason instead of failing on the
    #missing key below
    if 'error' in resp:
        raise RuntimeError(f"API error for {loc['name']}: {resp.get('reason')}")

    #Every column is later paired with this single list of timestamps, so all
    #locations must report the same hours
    if times is None:
        times = resp['hourly']['time']
    elif resp['hourly']['time'] != times:
        raise RuntimeError(f"Timestamps for {loc['name']} differ from the first location")

    #File each returned series under '<location>_<variable>' in the dict for
    #its kind of measurement
    for var in loc['vars']:
        key = f"{loc['name']}_{var}"
        if var == 'windspeed_10m':
            wind_data[key] = resp['hourly'][var]
        elif var == 'temperature_2m':
            temp_data[key] = resp['hourly'][var]
        elif var == 'cloudcover':
            cloud_data[key] = resp['hourly'][var]

In [10]:
def _indexed_by_time(columns, timestamps):
    """Return a DataFrame of `columns` whose index is `timestamps` parsed as datetimes and named 'time'."""
    frame = pd.DataFrame(columns)
    frame['time'] = pd.to_datetime(timestamps)
    return frame.set_index('time')


#One table each for wind, temperature and cloud, with time as the index so
#that rows can be aggregated across locations
wind = _indexed_by_time(wind_data, times)
temp = _indexed_by_time(temp_data, times)
cloud = _indexed_by_time(cloud_data, times)

In [11]:
#For windspeed want the average across locations, and a count of how many
#locations are above 25 m/s, as a wind turbine is not expected to work there
#NOTE(review): the weather request sets no wind speed unit, so the readings
#should be in Open-Meteo's default of km/h - the limit is converted to km/h
#before comparing; confirm if the request is ever changed to ask for m/s
CUT_OUT_MS = 25
CUT_OUT_KMH = CUT_OUT_MS * 3.6

#Only the per-location readings. Selecting them by name keeps avg_wind and
#wind_overflow out of the average and the count, also when the cell is re-run
wind_cols = [col for col in wind.columns if col.endswith('_windspeed_10m')]

wind['avg_wind'] = wind[wind_cols].mean(axis=1)
wind['wind_overflow'] = (wind[wind_cols] > CUT_OUT_KMH).sum(axis=1)

In [12]:
#Row-wise mean across locations for temperature and for cloud cover
temp = temp.assign(avg_temp=temp.mean(axis=1))
cloud = cloud.assign(avg_cover=cloud.mean(axis=1))

In [13]:
#Inspect the wind table: one column per location plus avg_wind and wind_overflow
wind


Unnamed: 0_level_0,inverness_windspeed_10m,hull_windspeed_10m,aberystwyth_windspeed_10m,avg_wind,wind_overflow
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-03-25 00:00:00,6.1,7.0,23.6,12.233333,0
2024-03-25 01:00:00,5.0,8.9,20.4,11.433333,0
2024-03-25 02:00:00,5.5,10.2,21.6,12.433333,0
2024-03-25 03:00:00,6.5,10.5,21.5,12.833333,0
2024-03-25 04:00:00,6.3,11.8,19.7,12.600000,0
...,...,...,...,...,...
2024-05-24 19:00:00,5.2,7.4,4.3,5.633333,0
2024-05-24 20:00:00,4.2,5.2,4.2,4.533333,0
2024-05-24 21:00:00,3.0,7.6,3.7,4.766667,0
2024-05-24 22:00:00,3.6,7.5,2.4,4.500000,0


In [14]:
#Select specific columns I need: only the aggregates, not the per-location readings
wind_sel = wind[['avg_wind','wind_overflow']]
temp_sel = temp[['avg_temp']]
cloud_sel = cloud[['avg_cover']]

#All three tables are indexed by 'time', so join them on that index and keep
#only the timestamps present in every table. The result is still indexed by
#'time'.
weather = wind_sel.join(temp_sel, how='inner').join(cloud_sel, how='inner')


In [15]:
#Move the index back into an ordinary column
weather = weather.reset_index()

In [16]:
#Rename the index from 'from' to 'time' and remove the +00:00 offset so the
#timestamps are timezone-naive like the weather ones
combined = combined.copy()
combined.index = combined.index.tz_convert(None).rename('time')


In [17]:
#Check the column types of the grid data
combined.dtypes

intensity.forecast      int64
intensity.actual        int64
intensity.index        object
imports               float64
Fossil_like_share     float64
Low_carbon_share      float64
dtype: object

In [18]:
#Check the column types of the weather data, in particular that time is a datetime
weather.dtypes

time             datetime64[ns]
avg_wind                float64
wind_overflow             int64
avg_temp                float64
avg_cover               float64
dtype: object

In [19]:
#Weather is every hour but the grid data is every half hour, so reindex the
#weather onto a 30 minute grid and fill in the new half-hour rows
weather = weather.set_index('time', drop=True)
new_index = pd.date_range(start=weather.index.min(),
                          end=weather.index.max(),
                          freq='30min')  #'30T' is a deprecated spelling of the same frequency

weather_30min = weather.reindex(new_index)
weather_30min.index.name = 'time'

#Hourly count of locations over the limit, kept aside before interpolating
overflow_hourly = weather_30min['wind_overflow']

#Measured quantities: interpolate linearly in time between the hourly values
weather_30min = weather_30min.interpolate(method='time')

#wind_overflow is a whole number of locations, so a half-way value like 0.5
#has no meaning; carry the previous hour's count forward instead
weather_30min['wind_overflow'] = overflow_hourly.ffill().astype(int)

In [20]:
#Inspect the weather after it has been put on the 30 minute grid
weather_30min

Unnamed: 0_level_0,avg_wind,wind_overflow,avg_temp,avg_cover
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-03-25 00:00:00,12.233333,0.0,2.966667,79.000000
2024-03-25 00:30:00,11.833333,0.0,3.466667,89.500000
2024-03-25 01:00:00,11.433333,0.0,3.966667,100.000000
2024-03-25 01:30:00,11.933333,0.0,4.316667,100.000000
2024-03-25 02:00:00,12.433333,0.0,4.666667,100.000000
...,...,...,...,...
2024-05-24 21:00:00,4.766667,0.0,12.100000,34.666667
2024-05-24 21:30:00,4.633333,0.0,11.650000,27.333333
2024-05-24 22:00:00,4.500000,0.0,11.200000,20.000000
2024-05-24 22:30:00,4.833333,0.0,10.933333,14.333333


In [21]:
#Line up grid data and weather on their shared 'time' index, keeping only the
#half hours present in both
forecast_data = pd.merge(combined, weather_30min, how='inner',
                         left_index=True, right_index=True)
forecast_data

Unnamed: 0_level_0,intensity.forecast,intensity.actual,intensity.index,imports,Fossil_like_share,Low_carbon_share,avg_wind,wind_overflow,avg_temp,avg_cover
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-03-25 00:00:00,159,163,moderate,26.6,38.1,35.2,12.233333,0.0,2.966667,79.000000
2024-03-25 00:30:00,156,151,moderate,25.6,37.8,36.7,11.833333,0.0,3.466667,89.500000
2024-03-25 01:00:00,155,138,moderate,26.0,34.5,39.5,11.433333,0.0,3.966667,100.000000
2024-03-25 01:30:00,143,133,moderate,26.6,31.8,41.7,11.933333,0.0,4.316667,100.000000
2024-03-25 02:00:00,139,128,moderate,26.7,30.0,43.2,12.433333,0.0,4.666667,100.000000
...,...,...,...,...,...,...,...,...,...,...
2024-05-23 21:30:00,148,136,moderate,13.9,40.1,46.1,13.733333,0.0,11.233333,24.166667
2024-05-23 22:00:00,143,136,moderate,14.4,39.7,45.9,13.300000,0.0,11.233333,15.333333
2024-05-23 22:30:00,155,139,moderate,15.1,40.3,44.5,12.983333,0.0,11.100000,19.000000
2024-05-23 23:00:00,138,142,moderate,15.4,41.3,43.2,12.666667,0.0,10.966667,22.666667


In [22]:
#Bank holiday dates used for the is_bank_holiday flag
uk_holidays = pd.to_datetime([
    '2024-03-29','2024-04-01','2024-05-06',
    '2024-05-27'
])

#Add day of the week, weekend, hour and bank holiday flags from the time index
#NOTE(review): the index had its +00:00 offset stripped earlier, so these look
#like UTC values rather than UK local time - confirm that is what is wanted
stamps = forecast_data.index
forecast_data = forecast_data.assign(
    day_of_week=stamps.weekday,
    is_weekend=lambda frame: frame['day_of_week'] >= 5,  #weekday 5 and 6 are Saturday and Sunday
    hour=stamps.hour,
    is_bank_holiday=stamps.normalize().isin(uk_holidays),
)

In [23]:
#Inspect the table with the calendar columns added
forecast_data

Unnamed: 0_level_0,intensity.forecast,intensity.actual,intensity.index,imports,Fossil_like_share,Low_carbon_share,avg_wind,wind_overflow,avg_temp,avg_cover,day_of_week,is_weekend,hour,is_bank_holiday
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-03-25 00:00:00,159,163,moderate,26.6,38.1,35.2,12.233333,0.0,2.966667,79.000000,0,False,0,False
2024-03-25 00:30:00,156,151,moderate,25.6,37.8,36.7,11.833333,0.0,3.466667,89.500000,0,False,0,False
2024-03-25 01:00:00,155,138,moderate,26.0,34.5,39.5,11.433333,0.0,3.966667,100.000000,0,False,1,False
2024-03-25 01:30:00,143,133,moderate,26.6,31.8,41.7,11.933333,0.0,4.316667,100.000000,0,False,1,False
2024-03-25 02:00:00,139,128,moderate,26.7,30.0,43.2,12.433333,0.0,4.666667,100.000000,0,False,2,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-05-23 21:30:00,148,136,moderate,13.9,40.1,46.1,13.733333,0.0,11.233333,24.166667,3,False,21,False
2024-05-23 22:00:00,143,136,moderate,14.4,39.7,45.9,13.300000,0.0,11.233333,15.333333,3,False,22,False
2024-05-23 22:30:00,155,139,moderate,15.1,40.3,44.5,12.983333,0.0,11.100000,19.000000,3,False,22,False
2024-05-23 23:00:00,138,142,moderate,15.4,41.3,43.2,12.666667,0.0,10.966667,22.666667,3,False,23,False


In [24]:
#Add lagging features. Each one is the mean of 4 consecutive readings of the
#actual intensity (4 half-hour rows, the '2hr_rolling' in the column names),
#which limits the effect of a single anomaly in the lagged value
#NOTE(review): rows are half-hourly, so a shift of 8 rows is 4 hours while the
#first column is named 'from_2hrs_ago' - confirm which lag is intended
actual = forecast_data['intensity.actual']
lag_rows = {'2hr_rolling_from_2hrs_ago': 8, '2hr_rolling_from_1d_ago': 48}

forecast_data = forecast_data.assign(
    **{column: actual.shift(rows).rolling(4).mean() for column, rows in lag_rows.items()}
).dropna()  #the first rows have no full lag window yet, so they are removed

forecast_data.head()

Unnamed: 0_level_0,intensity.forecast,intensity.actual,intensity.index,imports,Fossil_like_share,Low_carbon_share,avg_wind,wind_overflow,avg_temp,avg_cover,day_of_week,is_weekend,hour,is_bank_holiday,2hr_rolling_from_2hrs_ago,2hr_rolling_from_1d_ago
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2024-03-26 01:30:00,89,92,low,22.7,24.3,53.1,12.066667,0.0,5.433333,99.666667,1,False,1,False,153.5,146.25
2024-03-26 02:00:00,92,94,low,22.9,24.7,52.4,12.166667,0.0,5.233333,99.333333,1,False,2,False,138.75,137.5
2024-03-26 02:30:00,97,102,low,22.7,26.0,51.3,12.55,0.0,5.15,99.666667,1,False,2,False,124.25,131.75
2024-03-26 03:00:00,93,103,low,22.7,26.5,50.8,12.933333,0.0,5.066667,100.0,1,False,3,False,111.0,128.0
2024-03-26 03:30:00,94,99,low,20.9,28.2,51.0,12.65,0.0,5.133333,100.0,1,False,3,False,98.75,125.0


In [25]:
#Save the finished table; the time index is written as the first column
forecast_data.to_csv('carbon_weather_data.csv')