# Notebook to simulate data and an SCI score for England over two years

In [None]:
import pandas as pd
import json
import numpy as np

In [None]:
%load_ext autoreload
%autoreload 2
import simulate_sci_data as sci

In [None]:
# Call the imported module's entry point on the three input files with mode "both".
# The result is not assigned, so the notebook only displays it.
sci.simulate_sci_data("../data/carbonintensitydata.json", "../data/devices_generated.json",  "../data/timeseries.json", "both")

In [None]:
def process_dict_df(df, feature):
    """Expand a column of nested dicts into a time-indexed matrix.

    Each entry of ``df[feature]`` that is a non-empty dict contributes the
    first value of that dict as one row; any other entry (empty dict, None,
    non-dict) contributes an empty row.

    Args:
        df: frame holding a ``"time"`` column and the ``feature`` column.
        feature: name of the column to expand.

    Returns:
        DataFrame indexed by ``pd.to_datetime(df["time"])``, with rows sorted
        by time and columns sorted by label. An empty input yields an empty
        DataFrame.
    """
    records = [
        next(iter(entry.values())) if isinstance(entry, dict) and entry else {}
        for entry in df[feature]
    ]
    # Build the frame in one go from the list of records instead of creating
    # one pd.Series per row; this also keeps the result a DataFrame when
    # there are no rows at all.
    df_matrix = pd.DataFrame(records, index=pd.to_datetime(df["time"]))
    return df_matrix.sort_index().sort_index(axis=1)

def load_sci_raw_data(path_carbon_intensity, path_devices_details, path_timeseries_data):
    """Read the three JSON inputs and derive the usage and PUE matrices.

    Args:
        path_carbon_intensity: JSON file read with ``pd.read_json`` and transposed.
        path_devices_details: JSON file whose ``"devices"`` entry is flattened
            with ``pd.json_normalize``.
        path_timeseries_data: JSON file providing the ``"hardware_usages"`` and
            ``"pue_values"`` columns expanded by ``process_dict_df``.

    Returns:
        Tuple ``(carbon intensity, devices, raw time series, hardware usage, PUE)``.
    """
    carbon_intensity = pd.read_json(path_carbon_intensity).T
    devices = pd.json_normalize(pd.read_json(path_devices_details)["devices"])
    timeseries = pd.read_json(path_timeseries_data)

    # Both matrices come from the same raw time-series frame.
    hardware_usage, pue = (
        process_dict_df(timeseries, column)
        for column in ("hardware_usages", "pue_values")
    )
    return carbon_intensity, devices, timeseries, hardware_usage, pue

def compute_o(df_carbon_intensity, df_devices, df_time_seriesV2, df_hardware_usage, df_pue,
              interval_hours=0.5):
    """Compute the O term for every device and every timestamp.

    For each row of ``df_devices`` the value is::

        energy = (power_variable * usage + power_idle) * interval_hours
        O      = pue * carbon * energy / 1000

    where ``pue`` and ``usage`` are the device's columns in ``df_pue`` and
    ``df_hardware_usage`` and ``carbon`` is the column of
    ``df_carbon_intensity`` named after the device's ``location``. The three
    inputs are aligned on their index before the arithmetic.

    Args:
        df_carbon_intensity: frame with one column per location.
        df_devices: frame with ``device_id``, ``location``, ``power_idle`` and
            ``power_variable`` columns.
        df_time_seriesV2: unused; kept so the signature matches ``compute_m``.
        df_hardware_usage: frame with one usage column per device id.
        df_pue: frame with one PUE column per device id.
        interval_hours: multiplier applied to the power draw.
            NOTE(review): presumably the length of one time step in hours
            (half-hourly data) - confirm.

    Returns:
        DataFrame with one column per device id and an unnamed index; an empty
        DataFrame when ``df_devices`` has no rows.
    """
    # Read every device attribute once instead of re-filtering df_devices
    # with a boolean mask for each field.
    device_rows = zip(
        df_devices["device_id"],
        df_devices["location"],
        df_devices["power_idle"].to_numpy(),
        df_devices["power_variable"].to_numpy(),
    )

    o_columns = []
    for device_id, location, power_idle, power_var in device_rows:
        # Align the three inputs on their (timestamp) index.
        df_o = pd.concat(
            [
                df_pue[device_id].rename("pue"),
                df_carbon_intensity[location].rename("carbon"),
                df_hardware_usage[device_id].rename("usage"),
            ],
            axis=1,
        )
        energy_consumption = (df_o["usage"] * power_var + power_idle) * interval_hours
        o_values = (df_o["pue"] * df_o["carbon"] * energy_consumption) / 1000
        o_columns.append(o_values.rename(device_id))

    if not o_columns:
        return pd.DataFrame()
    # Join all devices in a single concat rather than growing a frame inside
    # the loop; drop the index name so downstream reset_index() is unaffected.
    return pd.concat(o_columns, axis=1).rename_axis(None)

def compute_m(df_carbon_intensity, df_devices, df_time_seriesV2, df_hardware_usage, df_pue,
              interval_hours=0.5):
    """Compute the M term for every device and every timestamp.

    For each row of ``df_devices`` the value is::

        M = embodied_carbon * (interval_hours / lifetime) * usage

    where ``usage`` is the device's column in ``df_hardware_usage``.

    Args:
        df_carbon_intensity: unused; kept so the signature matches ``compute_o``.
        df_devices: frame with ``device_id``, ``embodied_carbon`` and
            ``lifetime`` columns.
        df_time_seriesV2: unused.
        df_hardware_usage: frame with one usage column per device id.
        df_pue: unused.
        interval_hours: numerator of the lifetime share.
            NOTE(review): presumably the length of one time step, in the same
            unit as ``lifetime`` - confirm.

    Returns:
        DataFrame with one column per device id and an unnamed index; an empty
        DataFrame when ``df_devices`` has no rows.
    """
    # Read every device attribute once instead of re-filtering df_devices
    # with a boolean mask for each field.
    device_rows = zip(
        df_devices["device_id"],
        df_devices["embodied_carbon"].to_numpy(),
        df_devices["lifetime"].to_numpy(),
    )

    m_columns = []
    for device_id, embodied_carbon, lifetime in device_rows:
        carbon_per_step = embodied_carbon * (interval_hours / lifetime)
        m_columns.append((df_hardware_usage[device_id] * carbon_per_step).rename(device_id))

    if not m_columns:
        return pd.DataFrame()
    # Single concat instead of growing a frame inside the loop; rename_axis
    # returns a new object, so the caller's index keeps its name.
    return pd.concat(m_columns, axis=1).rename_axis(None)

def compute_sci(df_final_o, df_final_m, df_time_seriesV2):
    """Combine per-device O and M with the functional units into one SCI table.

    O and M are summed across devices (columns) for each timestamp, joined
    with ``df_time_seriesV2["functional_units"]`` on a UTC timestamp index,
    and the score is computed as ``(O + M) / (functional_units * 1000)``,
    the same formula ``simulate_sci_forecast`` applies.

    Args:
        df_final_o: per-device O values indexed by timestamp.
        df_final_m: per-device M values indexed by timestamp.
        df_time_seriesV2: frame with ``"time"`` and ``"functional_units"``
            columns; it is not modified.

    Returns:
        DataFrame with columns ``datetime``, ``O``, ``M``,
        ``functional_units`` and ``SCI`` on a default integer index.
    """
    df_sci = pd.concat(
        [df_final_o.sum(axis=1).rename("O"), df_final_m.sum(axis=1).rename("M")],
        axis=1,
    )
    # utc=True converts aware timestamps to UTC and also accepts naive or
    # mixed-offset inputs, which a bare tz_convert("UTC") rejects.
    df_sci.index = pd.to_datetime(df_sci.index, utc=True)

    # Attach the time index to a new Series rather than reassigning .index on
    # the column object taken from the caller's frame.
    time_index = pd.to_datetime(pd.Index(df_time_seriesV2["time"]), utc=True)
    functional_units = df_time_seriesV2["functional_units"].set_axis(time_index)

    df_sci = pd.concat([df_sci, functional_units], axis=1)
    df_sci["SCI"] = (df_sci["O"] + df_sci["M"]) / (df_sci["functional_units"] * 1000)

    # Name the index explicitly so the timestamp column is always "datetime",
    # whatever name the joined index ended up with.
    return df_sci.rename_axis("datetime").reset_index()

def simulate_sci_forecast(df_sci, noise_std=0.05, seed=None):
    """Derive a pseudo-forecast from past SCI data.

    The input is copied, every timestamp is moved one calendar year ahead,
    ``O``, ``M`` and ``functional_units`` are each multiplied by
    ``1 + N(0, noise_std)`` noise, ``functional_units`` is rounded back to an
    integer, and ``SCI`` is recomputed as
    ``(O + M) / (functional_units * 1000)``.

    Args:
        df_sci: frame with ``datetime``, ``O``, ``M`` and ``functional_units``
            columns; it is not modified.
        noise_std: standard deviation of the multiplicative noise.
        seed: when given, noise is drawn from ``np.random.default_rng(seed)``
            so the result is reproducible; when None, the global ``np.random``
            state is used, as before.

    Returns:
        A new DataFrame with the shifted timestamps, noisy values and ``SCI``.
    """
    df_shifted = df_sci.copy()
    df_shifted["datetime"] = pd.to_datetime(df_shifted["datetime"]) + pd.DateOffset(years=1)

    rng = np.random if seed is None else np.random.default_rng(seed)
    n_rows = len(df_shifted)
    # Draw order (O, M, functional_units) is kept so that runs relying on the
    # global NumPy seed produce the same numbers as before.
    for column in ("O", "M", "functional_units"):
        df_shifted[column] = df_shifted[column] * (1 + rng.normal(0, noise_std, n_rows))

    df_shifted["functional_units"] = df_shifted["functional_units"].round().astype(int)
    df_shifted["SCI"] = (df_shifted["O"] + df_shifted["M"]) / (df_shifted["functional_units"] * 1000)

    return df_shifted


def simulate_sci_data(path_carbon_intensity, path_devices_details, path_timeseries_data, type_to_simulate):
    """Run the full pipeline: load the inputs, compute O and M, build the SCI table.

    Args:
        path_carbon_intensity: path handed to ``load_sci_raw_data``.
        path_devices_details: path handed to ``load_sci_raw_data``.
        path_timeseries_data: path handed to ``load_sci_raw_data``.
        type_to_simulate: ``"past"`` returns the computed table, ``"future"``
            returns the simulated forecast, and any other value returns both.

    Returns:
        A DataFrame for ``"past"`` or ``"future"``; otherwise the tuple
        ``(past, forecast)``.
    """
    raw_inputs = load_sci_raw_data(path_carbon_intensity,
                                   path_devices_details,
                                   path_timeseries_data)
    operational = compute_o(*raw_inputs)
    embodied = compute_m(*raw_inputs)
    # raw_inputs[2] is the raw time-series frame (third item of the loader's tuple).
    past = compute_sci(operational, embodied, raw_inputs[2])

    if type_to_simulate == "past":
        return past

    forecast = simulate_sci_forecast(past)
    if type_to_simulate == "future":
        return forecast
    return past, forecast



In [None]:
# Same inputs, this time through the simulate_sci_data defined in this notebook.
# "both" is neither "past" nor "future", so the call returns (past, forecast).
simulate_sci_data("../data/carbonintensitydata.json", "../data/devices_generated.json",  "../data/timeseries.json", "both")

## Data loading

We load the carbon intensity data

In [None]:
# Read the carbon-intensity JSON and transpose it; after the transpose the
# columns are the ones later selected by device location in the O computation.
df_carbon_intensity = pd.read_json("../data/carbonintensitydata.json").T
df_carbon_intensity

the details about the devices

In [None]:
# Load the devices file, then flatten the records stored under its "devices"
# entry into a table with one row per record.
df_devices = pd.read_json("../data/devices_generated.json")
df_devices = pd.json_normalize(df_devices["devices"])

In [None]:
df_devices

and the "consumption details" about usage and PUE of the devices

In [None]:
# Raw time series; later cells read its "time", "hardware_usages",
# "pue_values" and "functional_units" columns.
df_time_seriesV2 = pd.read_json("../data/timeseries.json") 

In [None]:
df_time_seriesV2

From that we extract the hardware usage into a dataframe

In [None]:
# 1) Each "hardware_usages" entry is a dict: keep the first value of a
#    non-empty dict and fall back to an empty dict for anything else
inner = df_time_seriesV2["hardware_usages"].map(
    lambda d: next(iter(d.values())) if isinstance(d, dict) and d else {}
)

# 2) Expand to columns (device IDs) and index by time
df_matrix = inner.apply(pd.Series)
df_matrix.index = pd.to_datetime(df_time_seriesV2["time"])  # the O cell joins PUE, carbon and usage on this index
# Sort rows by time, then columns by label
df_hardware_usage = df_matrix.sort_index().sort_index(axis=1)

In [None]:
df_hardware_usage

and we do the same for the PUE

In [None]:
# 1) Each "pue_values" entry is a dict: keep the first value of a non-empty
#    dict and fall back to an empty dict for anything else
inner = df_time_seriesV2["pue_values"].map(
    lambda d: next(iter(d.values())) if isinstance(d, dict) and d else {}
)

# 2) Expand to columns (device IDs) and index by time
df_matrix = inner.apply(pd.Series)
df_matrix.index = pd.to_datetime(df_time_seriesV2["time"])  # the O cell joins PUE, carbon and usage on this index
# Sort rows by time, then columns by label
df_pue = df_matrix.sort_index().sort_index(axis=1)

In [None]:
df_pue

## SCI computation

### Compute O

First, we compute O for all the devices

In [None]:
# One O column per device, collected in a list and joined once at the end
# (instead of growing df_final_o with pd.concat on every iteration).
o_columns = []

# Read every device attribute once rather than re-filtering df_devices per field.
device_rows = zip(
    df_devices["device_id"],
    df_devices["location"],
    df_devices["power_idle"].to_numpy(),
    df_devices["power_variable"].to_numpy(),
)

for device_id, location, power_idle, power_var in device_rows:
    #get the inputs and combine them into a DF aligned on the timestamps
    df_o = pd.concat(
        [
            df_pue[device_id].rename("pue"),
            df_carbon_intensity[location].rename("carbon"),
            df_hardware_usage[device_id].rename("usage"),
        ],
        axis=1,
    )

    #make the computations
    # NOTE(review): 0.5 is presumably the half-hour step length in hours - confirm
    energy_consumption = (df_o["usage"] * power_var + power_idle) * 0.5
    o_values = (df_o["pue"] * df_o["carbon"] * energy_consumption) / 1000

    #save everything
    o_columns.append(o_values.rename(device_id))

# Unnamed index, so the later reset_index() still yields an "index" column.
df_final_o = pd.concat(o_columns, axis=1).rename_axis(None) if o_columns else pd.DataFrame()



In [None]:
df_final_o

### Compute M

Second, we compute M for all devices

In [None]:
# One M column per device, collected in a list and joined once at the end
# (instead of growing df_final_m with pd.concat on every iteration).
m_columns = []

# Read every device attribute once rather than re-filtering df_devices per field.
device_rows = zip(
    df_devices["device_id"],
    df_devices["embodied_carbon"].to_numpy(),
    df_devices["lifetime"].to_numpy(),
)

for device_id, embodied_carbon, lifetime in device_rows:
    #make the computations
    # NOTE(review): 0.5 / lifetime is presumably the share of the device
    # lifetime covered by one half-hour step - confirm the unit of "lifetime"
    carbon_per_step = embodied_carbon * (0.5 / lifetime)

    #save everything
    m_columns.append((df_hardware_usage[device_id] * carbon_per_step).rename(device_id))

# Unnamed index, so the later reset_index() still yields an "index" column.
df_final_m = pd.concat(m_columns, axis=1).rename_axis(None) if m_columns else pd.DataFrame()

In [None]:
df_final_m

### Compute SCI

First, we collect O and M

In [None]:
# Sum O and M across devices (columns) so each timestamp has one total of each
df_sci = pd.concat([df_final_o.sum(axis=1).rename("O"), df_final_m.sum(axis=1).rename("M")], axis=1)

We add the number of requests, i.e. the functional_units

In [None]:
# add functional units 
df_functinal_units = df_time_seriesV2["functional_units"]
df_functinal_units.index = df_time_seriesV2["time"]
df_sci .index = pd.to_datetime(df_sci .index).tz_convert("UTC")
df_functinal_units.index = pd.to_datetime(df_functinal_units.index).tz_convert("UTC")

df_sci  = pd.concat([df_sci , df_functinal_units], axis=1)

and we compute the SCI

In [None]:
# SCI per timestamp: (O + M) divided by (functional units * 1000)
# NOTE(review): the factor 1000 looks like a unit conversion - confirm
df_sci["SCI"] = (df_sci["O"] + df_sci["M"]) / (df_sci["functional_units"] * 1000)

In [None]:
df_sci

In [None]:
# Turn the timestamp index into a regular column; reset_index() calls it
# "index" when the index is unnamed, hence the rename to "datetime".
df_sci.reset_index(inplace=True)
df_sci.rename({"index":"datetime"}, axis=1, inplace=True)

In [None]:
df_sci

In [None]:
# Save the computed SCI table as CSV, using ";" as the field separator
df_sci.to_csv("../data/simulated_sci.csv", sep=";")

## Let's pretend that we can forecast stuff

The goal is to shift the data by one year and to perturb it slightly so that it does not look exactly the same.

First, we shift the data by one year

In [None]:
# Work on a copy so df_sci stays untouched, then move every timestamp one
# calendar year ahead.
df_shifted = df_sci.copy()
df_shifted.datetime = pd.to_datetime(df_shifted.datetime) + pd.DateOffset(years=1)

In [None]:
df_shifted

Then, we add some Gaussian noise to O, M and the functional units, and we compute the SCI again.

In [None]:
# Multiply each series by (1 + noise), with noise drawn independently per row
# from a normal distribution with mean 0 and standard deviation 0.05.
# No seed is set in this cell, so the values change from run to run.
df_shifted["O"] = df_shifted["O"] * (1 + np.random.normal(0, 0.05, len(df_shifted)))
df_shifted["M"] = df_shifted["M"] * (1 + np.random.normal(0, 0.05, len(df_shifted)))
df_shifted["functional_units"] = df_shifted["functional_units"] * (1 + np.random.normal(0, 0.05, len(df_shifted)))
# Functional units go back to whole numbers after the noise is applied
df_shifted["functional_units"] = df_shifted["functional_units"].round().astype(int)
# Recompute SCI from the perturbed values, with the same formula as for the past data
df_shifted["SCI"] = (df_shifted["O"] + df_shifted["M"]) / (df_shifted["functional_units"] * 1000)

In [None]:
df_shifted

In [None]:
# Save the simulated forecast as CSV, using ";" as the field separator
df_shifted.to_csv("../data/simulated_sci_forecasted.csv", sep=";")