In [2]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from matplotlib.backends.backend_pdf import PdfPages
import ipywidgets as widgets
from IPython.display import display
from bs4 import BeautifulSoup
import json
import numpy as np
from sklearn.metrics import mean_squared_error

# ------------------ EDIT ONLY HERE ------------------------
branch = 'main'              # Branch name substituted into the GitHub URLs
start_date = '2024-08-07'    # First day to process, YYYY-MM-DD (inclusive)
end_date = '2024-08-11'      # Last day to process, YYYY-MM-DD (inclusive)
make_pdf = True              # True writes a PDF report, False displays the graphs
# ----------------------------------------------------------

# GitHub tree page for one day's validation results; the {branch} and {date}
# placeholders are filled in with str.format().
base_url = (
    'https://github.com/sustainable-computing-io/kepler-metal-ci'
    '/tree/{branch}/docs/validation/{date}/'
)

# Metric file downloaded for each side of the comparison, keyed by data type.
# NOTE(review): 'vm' points at the *node* metric file and 'metal' at the *vm*
# metric file -- confirm this cross-mapping is intended.
json_files = dict(
    vm='kepler_node_package_joules_total--dynamic.json',
    metal='kepler_vm_package_joules_total--dynamic.json',
)

# Element-wise relative error, the building block for MAPE
def percentage_error(actual, predicted):
    """Return the signed relative error of *predicted* against *actual*.

    Where ``actual`` is non-zero the result is
    ``(actual - predicted) / actual``.  Where ``actual`` is zero that ratio is
    undefined, so the prediction is scaled by the mean of all ``actual``
    values instead: ``predicted / mean(actual)``.

    Args:
        actual: Array-like of reference values (any shape).
        predicted: Array-like of predicted values with the same shape.

    Returns:
        A float ``numpy.ndarray`` of element-wise errors.  If ``actual``
        contains a zero and its mean is also zero, the affected entries are
        ``inf``/``nan`` as produced by NumPy float division.
    """
    # Coerce to float so lists are accepted and unsigned integer inputs
    # cannot wrap around in the subtraction below.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    nonzero = actual != 0
    if nonzero.all():
        # Also covers empty input, for which np.mean would warn and yield nan.
        return (actual - predicted) / actual

    # Only needed (and only computed) when at least one actual value is zero.
    fallback = np.mean(actual)
    numerator = np.where(nonzero, actual - predicted, predicted)
    denominator = np.where(nonzero, actual, fallback)
    return numerator / denominator

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of *y_pred*, in percent.

    Both inputs are converted with ``np.asarray`` and passed to
    ``percentage_error``; the absolute values of the result are averaged and
    multiplied by 100.
    """
    reference = np.asarray(y_true)
    estimate = np.asarray(y_pred)
    relative_errors = percentage_error(reference, estimate)
    return np.mean(np.abs(relative_errors)) * 100

# Seconds to wait on a single HTTP request before treating it as failed, so
# an unresponsive server cannot stall the whole run.
_REQUEST_TIMEOUT = 30


def _fetch(url):
    """Return the HTTP response for *url*, or None if it could not be fetched.

    A non-200 status, a timeout and any other ``requests`` error are all
    reported as None so the caller can print its own context-specific message.
    """
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException:
        return None
    return response if response.status_code == 200 else None


def _list_folders(page_html):
    """Return the directory names listed in a GitHub tree page.

    The names are read from the JSON embedded in the page's
    ``react-app.embeddedData`` script tag.  Returns None when that tag is
    missing or has no text content.
    """
    soup = BeautifulSoup(page_html, 'html.parser')
    script_tag = soup.find('script', {'type': "application/json", 'data-target': "react-app.embeddedData"})
    if script_tag is None or not script_tag.string:
        return None

    json_data = json.loads(script_tag.string)
    items = json_data.get('payload', {}).get('tree', {}).get('items', [])
    return [item['name'] for item in items if item.get('contentType') == 'directory']


def _load_series(json_url):
    """Download one metric file and return it as a Timestamp/Watts DataFrame.

    Returns None when the file cannot be fetched.  If the JSON has no
    'timestamps' key, the sample index (0, 1, 2, ...) is used instead.
    """
    response = _fetch(json_url)
    if response is None:
        return None

    data = response.json()
    values = data['values']
    if 'timestamps' in data:
        # NOTE(review): fromtimestamp() yields naive local-time datetimes.
        timestamps = [datetime.datetime.fromtimestamp(ts) for ts in data['timestamps']]
    else:
        timestamps = list(range(len(values)))
    return pd.DataFrame({'Timestamp': timestamps, 'Watts': values})


def _plot_comparison(df_metal, df_vm, mse, mape, title):
    """Plot metal and VM series on twin y-axes and return the figure."""
    fig, ax1 = plt.subplots(figsize=(14, 7))
    ax1.plot(df_metal['Timestamp'], df_metal['Watts'], marker='x', color='#024abf', label='metal data')
    ax1.set_xlabel('Timestamp')
    ax1.set_ylabel('Metal [Watts]', color='#024abf')
    ax1.tick_params(axis='y')
    # Rotate the x tick labels on ax1: twinx() hides the twin's x-axis, so a
    # pyplot-level xticks(rotation=...) issued after twinx() is not visible.
    ax1.tick_params(axis='x', labelrotation=45)

    ax2 = ax1.twinx()
    ax2.plot(df_vm['Timestamp'], df_vm['Watts'], marker='o', color='#ff742e', label='vm data')
    ax2.set_ylabel('VM [Watts]', color='#ff742e')
    ax2.tick_params(axis='y')

    # pyplot calls act on the current axes.
    plt.title(title)
    plt.grid(True)

    # Single legend covering the lines of both axes.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='upper left')

    textstr = f'MSE: {mse:.2f}\nMAPE: {mape:.2f}%'
    ax1.text(0.98, 0.97, textstr, transform=ax1.transAxes, fontsize=14,
             verticalalignment='top', horizontalalignment='right',
             bbox=dict(facecolor='white', alpha=0.5))
    return fig


# Function to process each date
def process_date(date, save_to_pdf=False, pdf=None):
    """Fetch, compare and plot the metal and VM series for one day.

    The day's GitHub tree page is scraped for result folders.  For each
    folder, the files named in ``json_files`` are downloaded, the two series
    are inner-joined on their timestamps to compute MSE and MAPE, and both
    full series are plotted on twin y-axes.  Problems are reported with
    ``print`` and the affected date or folder is skipped.

    Args:
        date: ``datetime`` (or ``date``) of the day to process.
        save_to_pdf: When True and ``pdf`` is given, add each figure to
            ``pdf`` and close it instead of displaying it.
        pdf: Open ``PdfPages`` object to save figures into, or None.
    """
    date_str = date.strftime('%Y-%m-%d')

    page = _fetch(base_url.format(branch=branch, date=date_str))
    if page is None:
        print(f"Failed to fetch data for date {date_str}")
        return

    folder_names = _list_folders(page.text)
    if folder_names is None:
        print(f"No folder listing found in page for date {date_str}")
        return

    for folder_name in folder_names:
        folder_url = f'https://raw.githubusercontent.com/sustainable-computing-io/kepler-metal-ci/{branch}/docs/validation/{date_str}/{folder_name}/'

        frames = {}
        for data_type, json_file in json_files.items():
            frames[data_type] = _load_series(folder_url + json_file)
            if frames[data_type] is None:
                print(f"Failed to fetch data for {json_file} in folder {folder_name}")

        df_metal = frames.get('metal')
        df_vm = frames.get('vm')
        if df_metal is None or df_vm is None:
            print(f"Data for metal or VM not found in folder {folder_name}")
            continue

        # Metrics are only defined on samples that both series share.
        df_merged = pd.merge(df_metal, df_vm, on='Timestamp', how='inner', suffixes=('_metal', '_vm'))
        if df_merged.empty:
            print(f"No overlapping data for metal and VM in folder {folder_name}")
            continue

        mse = mean_squared_error(df_merged['Watts_metal'], df_merged['Watts_vm'])
        mape = mean_absolute_percentage_error(df_merged['Watts_metal'], df_merged['Watts_vm'])

        title = f'Kepler Metal & VM / {folder_name} / Branch: {branch} / Date: {date_str}'
        fig = _plot_comparison(df_metal, df_vm, mse, mape, title)

        if save_to_pdf and pdf is not None:
            pdf.savefig(fig)
            plt.close(fig)
        else:
            plt.show()

# Parse the configured range once; both ends are included in the run.
start_date_obj = datetime.datetime.strptime(start_date, '%Y-%m-%d')
end_date_obj = datetime.datetime.strptime(end_date, '%Y-%m-%d')


def _process_range(first_day, last_day, **process_kwargs):
    """Call process_date for every day from first_day to last_day inclusive."""
    day = first_day
    step = datetime.timedelta(days=1)
    while day <= last_day:
        process_date(day, **process_kwargs)
        day += step


if make_pdf:
    # Collect every figure of the whole range into a single PDF file.
    output_file = "kepler_report.pdf"
    with PdfPages(output_file) as pdf:
        _process_range(start_date_obj, end_date_obj, save_to_pdf=True, pdf=pdf)
    print(f"PDF report saved as {output_file}")
else:
    # No PDF requested: process_date displays each figure instead.
    _process_range(start_date_obj, end_date_obj)


ModuleNotFoundError: No module named 'requests'