In [1]:
# THIS NOTEBOOK TAKES RAW DATA FROM EXPERIMENTS AND OUTPUTS A PROCESSED DATASET TO BE USED AS INPUT FOR THE STREAMLIT APPLICATION

In [167]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [168]:
# Load the raw experiment log. `raw_df` is never modified afterwards, so this
# cell can be re-run on its own and always produces the same `df`.
raw_df = pd.read_csv("data_raw/yearly_log copy.csv")

# Keep only the columns used in the prototype.
prototype_columns = ['timestamp', 'project_name', 'run_id', 'duration', 'emissions', 'emissions_rate',
                     'cpu_power', 'ram_power', 'cpu_energy', 'ram_energy', 'energy_consumed',
                     'country_name', 'country_iso_code', 'region', 'os', 'python_version',
                     'cpu_count', 'cpu_model', 'longitude', 'latitude', 'ram_total_size']
runs_df = raw_df[prototype_columns].copy()


# FAKE DATE COLUMN: the n-th logged row of every project is treated as the
# computation of day n, counting from 2022-01-01.
# cumcount() numbers the rows of each project in their current (file) order, so
# no pre-sort is required. The earlier single-column sort on 'project_name' used
# pandas' default quicksort, which is not stable and could shuffle the runs of a
# project before the dates were assigned (and therefore change which rows
# survive the 365-row cut below).
run_number = runs_df.groupby('project_name').cumcount()
# Insert the "date" column at the second position (index 1).
runs_df.insert(1, 'date', pd.to_timedelta(run_number, unit='D') + pd.Timestamp(2022, 1, 1))


# KEEP AT MOST 365 RUNS PER PROJECT (one year of daily runs), ordered by project
# and date, then drop the columns that are no longer needed and normalize the
# project names (lower case, no spaces).
# NOTE(review): names are normalized *after* grouping, as before; raw names that
# differ only by case or spaces would be dated and capped as separate projects.
df = (
    runs_df
    .sort_values(by=['project_name', 'date'])
    .groupby('project_name')
    .head(365)
    .reset_index(drop=True)
    .drop(columns=['timestamp', 'run_id'])
    .assign(project_name=lambda frame: frame['project_name'].str.lower().str.replace(' ', ''))
)


# Disabled production-only step, kept as a bare string literal (being the last
# expression of the cell, it is also what the cell displays as its output).
'''

# THIS STEP IS NEEDED NOT NOW BUT FOR SURE IN PRODUCTION PHASE, TO DELETE ROWS THAT HAV A DATE PREVIOUS 1 YEAR AGO

# Define the cutoff date (today - 1 year)
cutoff_date = datetime.now() - timedelta(days=365)
# Filter rows based on the condition
df = df[df['date'] >= cutoff_date]

'''

"\n\n# THIS STEP IS NEEDED NOT NOW BUT FOR SURE IN PRODUCTION PHASE, TO DELETE ROWS THAT HAV A DATE PREVIOUS 1 YEAR AGO\n\n# Define the cutoff date (today - 1 year)\ncutoff_date = datetime.now() - timedelta(days=365)\n# Filter rows based on the condition\ndf = df[df['date'] >= cutoff_date]\n\n"

In [169]:
# Build the per-project summary ("wildcard") figures shown on the dashboard:
# one row per project, with the numeric metrics summed over all of its runs and
# the descriptive fields taken from the project's first row.
wildcard_summed = ['duration', 'emissions', 'emissions_rate', 'cpu_power', 'ram_power',
                   'cpu_energy', 'ram_energy', 'energy_consumed']
wildcard_first = ['country_name', 'region', 'os', 'python_version',
                  'cpu_count', 'cpu_model', 'ram_total_size']
wildcard_spec = {**dict.fromkeys(wildcard_summed, 'sum'),
                 **dict.fromkeys(wildcard_first, 'first')}

per_project_totals = df.groupby('project_name').agg(wildcard_spec)
grouped_wildcard_df = per_project_totals.reset_index()

# Expose every project's summary as a notebook variable named
# "<project>_wildcard_df" and write it to its own CSV file.
for project, project_summary in grouped_wildcard_df.groupby('project_name'):
    print(project)
    globals()[f"{project}_wildcard_df"] = project_summary.reset_index(drop=True)
    project_summary.to_csv(f"data_processed/single_project/{project}_wildcard_df.csv", index=False)

logisticregressionmodel
projectexcecuterdataload
xgboostmodel


In [170]:
# Build one time-series frame per project with the columns needed for the graphs
# (line charts, etc.). Each frame is a row/column subset of `df`; nothing is
# aggregated here.
series_columns = ['project_name', 'date', 'duration', 'emissions', 'emissions_rate',
                  'cpu_power', 'ram_power', 'cpu_energy', 'ram_energy', 'energy_consumed']


def _project_series(project):
    """Return the rows of `df` for `project`, restricted to the chart columns."""
    return df.loc[df['project_name'] == project, series_columns]


logisticregressionmodel_series_df = _project_series('logisticregressionmodel')
projectexcecuterdataload_series_df = _project_series('projectexcecuterdataload')
xgboostmodel_series_df = _project_series('xgboostmodel')

# The row index is written to the CSV as well (no index=False), unlike the
# other exports in this notebook.
logisticregressionmodel_series_df.to_csv("data_processed/single_project/logisticregressionmodel_series_df.csv")
projectexcecuterdataload_series_df.to_csv("data_processed/single_project/projectexcecuterdataload_series_df.csv")
xgboostmodel_series_df.to_csv("data_processed/single_project/xgboostmodel_series_df.csv")

In [171]:
# Preview the first rows of the projectexcecuterdataload series frame.
projectexcecuterdataload_series_df.head()

Unnamed: 0,project_name,date,duration,emissions,emissions_rate,cpu_power,ram_power,cpu_energy,ram_energy,energy_consumed
365,projectexcecuterdataload,2022-01-01,0.00222,1.46019e-10,6.576975e-08,5.0,6.0,2.958377e-09,3.271898e-09,6.230275e-09
366,projectexcecuterdataload,2022-01-02,0.002079,1.323893e-10,6.367903e-08,5.0,6.0,2.69711e-09,2.951622e-09,5.648732e-09
367,projectexcecuterdataload,2022-01-03,0.002253,1.487492e-10,6.602111e-08,5.0,6.0,3.009703e-09,3.337065e-09,6.346769e-09
368,projectexcecuterdataload,2022-01-04,0.003031,2.012126e-10,6.638455e-08,5.0,6.0,4.081925e-09,4.50333e-09,8.585254e-09
369,projectexcecuterdataload,2022-01-05,0.002252,1.447648e-10,6.428667e-08,5.0,6.0,2.963675e-09,3.213088e-09,6.176763e-09


In [174]:
# Build the daily series for the main dashboard (all projects together):
# for every date, sum each metric over all rows of `df` that share that date.
daily_metrics = ['duration', 'emissions', 'emissions_rate', 'cpu_power', 'ram_power',
                 'cpu_energy', 'ram_energy', 'energy_consumed']

daily_totals = df.groupby('date').agg({metric: 'sum' for metric in daily_metrics})

# Turn 'date' back into a regular column before exporting.
grouped_all_series_df = daily_totals.reset_index()
grouped_all_series_df.to_csv("data_processed/whole_project/grouped_all_series_df.csv", index=False)

In [181]:
# Build the single summary row for the main dashboard: every metric summed over
# all rows of `df`, i.e. over all projects together.
overall_metrics = ['duration', 'emissions', 'emissions_rate', 'cpu_power', 'ram_power',
                   'cpu_energy', 'ram_energy', 'energy_consumed']
overall_totals = df[overall_metrics].sum()

# Wrap the totals in a one-row frame (one column per metric) and export it.
grouped_all_wildcard_df = pd.DataFrame([overall_totals])
grouped_all_wildcard_df.to_csv("data_processed/whole_project/grouped_all_wildcard_df.csv", index=False)