# Preliminaries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Constants

Conversion constants used to convert CPU from nanocores to cores and memory from bytes to MiB.

In [None]:
# Nanocores per core: dividing a nanocore reading by this yields cores.
CPU_CONSTANT = 1_000_000_000.0
# Bytes per MiB (1024 * 1024): dividing a byte count by this yields MiB.
MEM_CONSTANT = 1024.0 ** 2

### Variables

In [None]:
# Date stamp of the export to analyse (the value reads as DD-MM-YYYY). It is
# interpolated into the raw CSV paths that are loaded and into the names of
# the recommendation reports that are written.
source_data_date = "25-04-2024"

# Load Data

## CPU & Memory

In [None]:
# Read the raw CPU and memory samples exported for the selected date.
df_main_cpu = pd.read_csv(f'./data/raw/cpu/cpu-{source_data_date}.csv')
df_main_memory = pd.read_csv(f'./data/raw/memory/memory-{source_data_date}.csv')

# Normalise both frames the same way:
#   * TimeGenerated is parsed into pandas datetimes,
#   * Component is the Name with its last three '-'-separated segments removed,
#   * the usage counter is rescaled (nanocores -> cores, bytes -> MiB).
df_main_cpu = df_main_cpu.assign(
    TimeGenerated=lambda frame: pd.to_datetime(frame['TimeGenerated']),
    Component=lambda frame: frame['Name'].str.split('-').str[:-3].str.join('-'),
    CpuCounter=lambda frame: frame['CpuCounter'] / CPU_CONSTANT,
)

df_main_memory = df_main_memory.assign(
    TimeGenerated=lambda frame: pd.to_datetime(frame['TimeGenerated']),
    Component=lambda frame: frame['Name'].str.split('-').str[:-3].str.join('-'),
    MemoryCounter=lambda frame: frame['MemoryCounter'] / MEM_CONSTANT,
)

## Instances

In [None]:
# Raw instances export for the selected date. The cleaning step below pivots
# it using its 'Name', 'CounterName' and 'Value' columns.
df_instances = pd.read_csv(f'./data/raw/instances/instances-{source_data_date}.csv')

def clean_data(df_instances):
    """
    Cleans and transforms the instances data.

    This function performs the following steps:
    1. Reshapes the data with a pivot table so that every Name is one row and
       every CounterName is one column. Repeated (Name, CounterName) pairs are
       aggregated by pivot_table's default aggregation, and combinations that
       do not occur are filled with 0.
    2. Turns 'Name' back into a regular column.
    3. Drops duplicate rows.
    4. Converts CPU values from nanocores to cores and memory values from
       bytes to MiB.
    5. Renames the converted columns so the names carry the new units.

    Note: later cells of this notebook define other functions under the same
    name `clean_data`; this definition is the one for the instances export.

    Args:
    df_instances: The DataFrame containing the instances data, with the
        columns 'Name', 'CounterName' and 'Value'.

    Returns:
    A new DataFrame with one row per Name and the columns 'cpuLimitCores',
    'cpuRequestCores', 'memoryLimitMiB' and 'memoryRequestMiB' (plus any other
    counters present in the input).

    Raises:
    KeyError: If one of the four expected counters is absent from the
        'CounterName' column.
    """
    # Raw counter column -> (readable column name, divisor for the unit change).
    unit_conversions = {
        'cpuLimitNanoCores': ('cpuLimitCores', CPU_CONSTANT),
        'cpuRequestNanoCores': ('cpuRequestCores', CPU_CONSTANT),
        'memoryLimitBytes': ('memoryLimitMiB', MEM_CONSTANT),
        'memoryRequestBytes': ('memoryRequestMiB', MEM_CONSTANT),
    }

    # Names become rows and CounterNames become columns; missing combinations
    # are filled with 0 to avoid errors in later calculations.
    wide = df_instances.pivot_table(index=['Name'], columns='CounterName', values='Value', fill_value=0)
    wide = wide.reset_index()

    # drop_duplicates returns a new frame, so the result has to be kept; the
    # original call discarded it. The pivot already yields one row per Name,
    # so this acts as a safeguard rather than changing typical output.
    wide = wide.drop_duplicates()

    for raw_column, (_, divisor) in unit_conversions.items():
        wide[raw_column] = wide[raw_column] / divisor

    readable_names = {raw_column: new_name for raw_column, (new_name, _) in unit_conversions.items()}
    return wide.rename(columns=readable_names)

# Clean a copy of the raw export and rebind the name to the cleaned result.
df_instances = clean_data(df_instances.copy())
# Bare expression: shown as the cell output when run in a notebook.
df_instances

## HPA Data

In [None]:
# Raw Horizontal Pod Autoscaler export for the selected date (note that the
# file is read from the instances/ directory).
df_hpa = pd.read_csv(f'./data/raw/instances/hpa-{source_data_date}.csv')

def clean_data(df_hpa):
    """
    Cleans and transforms the Horizontal Pod Autoscaling data.

    This function performs the following steps:
        1. Keeps only rows whose 'ns' value starts with "staging" (rows with a
           missing 'ns' are discarded).
        2. Drops the export columns that are not used afterwards.
        3. Drops duplicate rows (evaluated after the unused columns are gone).
        4. Turns 'last_scale_time' into a 0/1 flag: 0 when the value is missing
           or equal to 0, 1 otherwise. This denotes which components underwent
           scaling in that period.
        5. Shortens 'deployment_hpa' by removing its last '-'-separated segment
           and 'ns' by removing its first two and last two segments.
        6. Renames columns for clarity.

    Note: other cells of this notebook define different functions under the
    same name `clean_data`; this definition is the one for the HPA export.

    Args:
        df_hpa: The DataFrame containing the HPA data. It is not modified.

    Returns:
        A new DataFrame with the columns 'Namespace', 'Component',
        'LastScaled', 'DesiredReplicas', 'MinReplicas' and 'MaxReplicas'
        (plus any input columns that are neither dropped nor renamed).

    Raises:
        KeyError: If 'ns' or any of the columns to drop is missing.
    """
    # Export columns that carry nothing needed downstream.
    unused_columns = [
        'Cluster', 'deployment_hpa2', 'scale_out_percentage', 'Computer', 'Origin', 'Namespace', 'Name',
        'AgentId', 'Tags', 'TimeGenerated [Sri Jayawardenepura]', 'Type', '_ResourceId', 'pTags', 'Val',
    ]

    # Filter rows based on column: 'ns'
    staging = df_hpa[df_hpa['ns'].str.startswith("staging", na=False)]
    # Remove the unused columns first so that rows differing only in those
    # columns collapse into one when duplicates are dropped.
    staging = staging.drop(columns=unused_columns).drop_duplicates()

    # A component counts as scaled when a scale time is present and non-zero.
    last_scale = staging['last_scale_time']
    was_scaled = last_scale.notna() & last_scale.ne(0)

    # assign() builds a fresh frame instead of writing into a filtered slice.
    staging = staging.assign(
        last_scale_time=was_scaled.astype('int64'),
        deployment_hpa=staging['deployment_hpa'].str.split('-').str[:-1].str.join('-'),
        ns=staging['ns'].str.split('-').str[2:-2].str.join('-'),
    )

    return staging.rename(columns={
        'last_scale_time': 'LastScaled',
        'ns': 'Namespace',
        'deployment_hpa': 'Component',
        'desired_reps': 'DesiredReplicas',
        'min_reps': 'MinReplicas',
        'max_reps': 'MaxReplicas',
    })

# Clean a copy of the raw HPA export and rebind the name to the cleaned result.
df_hpa = clean_data(df_hpa.copy())
# Bare expression: previews the first rows as the cell output in a notebook.
df_hpa.head()

# Calculate Pod Duration

In [None]:
def pod_duration(df_main_cpu):
    """
    Summarise, for every pod Name, the time span covered by its samples.

    Args:
        df_main_cpu: DataFrame with the columns 'Name', 'TimeGenerated' and
            'Component'.

    Returns:
        A DataFrame with one row per Name and the columns 'Name',
        'TimeStamp_min' (earliest TimeGenerated), 'TimeStamp_max' (latest
        TimeGenerated), 'Component' (first Component seen for that Name) and
        'TimeDifference' (TimeStamp_max minus TimeStamp_min).
    """
    per_pod = (
        df_main_cpu.groupby('Name')
        .agg(
            TimeStamp_min=('TimeGenerated', 'min'),
            TimeStamp_max=('TimeGenerated', 'max'),
            Component=('Component', 'first'),
        )
        .reset_index()
    )
    per_pod['TimeDifference'] = per_pod['TimeStamp_max'] - per_pod['TimeStamp_min']
    return per_pod

# Per-pod observation span, computed from a copy of the CPU samples.
df_pod_duration = pod_duration(df_main_cpu.copy())
# Bare expression: previews the first rows as the cell output in a notebook.
df_pod_duration.head()

### Check Unique Components

In [None]:
# # make it into a dataframe
# unique_components = pd.DataFrame(df_pod_duration['Component'].unique(), columns=['Component'])
# #make this single column into a single row
# unique_components = unique_components.transpose()

# Grouped Data

In [None]:
# Enrich every instance with its observation span (joined on Name) and then
# with its HPA settings (joined on Component). Both joins are left joins, so
# instance rows are kept even without a match; rows that end up without a
# TimeDifference or a Component are discarded afterwards.
df_merged = (
    df_instances
    .merge(df_pod_duration, on='Name', how='left')
    .merge(df_hpa, on='Component', how='left')
    .dropna(subset=['TimeDifference', 'Component'])
)

# Inner Join CPU and Memory Data

In [None]:
# Pair each CPU sample with the memory sample of the same pod and timestamp,
# then attach the per-instance data by pod Name. Columns that clash receive
# the suffix '_memory' (first join) or '_instances' (second join); the CPU
# side keeps its column names unchanged.
# Author's note: inner and outer joins gave the same results on this data.
df_main_inner = (
    df_main_cpu
    .merge(df_main_memory, on=['TimeGenerated', 'Name'], how='inner', suffixes=('', '_memory'))
    .merge(df_merged, on='Name', how='inner', suffixes=('', '_instances'))
)

def clean_data(df_main_inner):
    """
    Remove the duplicated columns produced by joining CPU, memory and instances.

    Args:
        df_main_inner: The joined DataFrame. It is not modified.

    Returns:
        A new DataFrame without the five '_memory' and the two '_instances'
        suffixed columns listed below.

    Raises:
        KeyError: If any of the listed columns is absent.
    """
    redundant_columns = [
        'Component_memory',
        'PodStartTime_memory',
        'Namespace_memory',
        'PodCreationTimeStamp_memory',
        'InstanceName_memory',
        'Namespace_instances',
        'Component_instances',
    ]
    return df_main_inner.drop(columns=redundant_columns)

# Strip the redundant suffixed columns from a copy of the joined frame.
df_main_inner = clean_data(df_main_inner.copy())

# Recommendations

This is the optimized version of the recommendation algorithm.

In [None]:
def optimized_recommender(data, instances,  window='7d', offset='0d', algorithmCPU='Quantile', algorithmRAM='Quantile', qCPU=0.9999, qRAM=0.95,  targetCPUUtilization=0.7, targetRAMUtilization=0.8, minRecCpuCores=0.1, minRecRamMiB=20, cpu_limit_factor=2, mem_limit_factor=2):
    """This function takes resource utilization data (`data`) and deployment configuration data (`instances`) and generates recommendations for resource requests and limits for each deployment in `instances`.

    Neither input DataFrame is modified.

    Args:
      data: A pandas DataFrame containing resource utilization data. This DataFrame is expected to have the columns 'Name', 'Component', 'TimeGenerated', 'CpuCounter' and 'MemoryCounter'.
      instances: A pandas DataFrame containing deployment configuration data. This DataFrame is expected to have columns named 'Component', 'Name', 'Namespace', 'MinReplicas', 'MaxReplicas', 'cpuRequestCores', 'cpuLimitCores', 'memoryRequestMiB', 'memoryLimitMiB', and 'TimeDifference'.
      window: A string representing the time window to consider for calculating resource utilization. It is also the minimum 'TimeDifference' an instance needs for its component to be considered. Defaults to '7d' (7 days).
      offset: A string representing the offset to apply to the most recent time in the data before considering the window. Defaults to '0d' (no offset). This can be used for shifting the time window.
      algorithmCPU: A string indicating the algorithm to use for calculating CPU utilization. 'Max' selects the maximum; any other value selects the quantile. Defaults to 'Quantile'.
      algorithmRAM: A string indicating the algorithm to use for calculating RAM utilization. 'Max' selects the maximum; any other value selects the quantile. Defaults to 'Quantile'.
      qCPU: A float between 0 and 1 representing the quantile to use for calculating CPU utilization when algorithmCPU is not 'Max'. Defaults to 0.9999 (99.99th percentile).
      qRAM: A float between 0 and 1 representing the quantile to use for calculating RAM utilization when algorithmRAM is not 'Max'. Defaults to 0.95 (95th percentile).
      targetCPUUtilization: A float between 0 and 1 representing the target CPU utilization for deployments; the measured usage is divided by it, which adds a buffer. Defaults to 0.7 (70%).
      targetRAMUtilization: A float between 0 and 1 representing the target RAM utilization for deployments; the measured usage is divided by it, which adds a buffer. Defaults to 0.8 (80%).
      minRecCpuCores: A float representing the minimum recommended CPU request cores. Defaults to 0.1.
      minRecRamMiB: An integer representing the minimum recommended RAM request in MiB. Defaults to 20.
      cpu_limit_factor: A float representing the factor to apply to the recommended CPU request to calculate the recommended CPU limit. Defaults to 2.
      mem_limit_factor: A float representing the factor to apply to the recommended RAM request to calculate the recommended RAM limit. Defaults to 2.

    Returns:
      A pandas DataFrame with one row per Component holding the current configuration ('Namespace', requests, limits, replica bounds) and the recommended CPU and RAM requests and limits. Components for which no recommendation could be computed are left out; if that is true for every component, an empty DataFrame with the same columns is returned.
    """
    recommendation_columns = ['RecommendedCpuRequestCores', 'RecommendedCpuLimitCores', 'RecommendedMemoryRequestMiB', 'RecommendedMemoryLimitMiB']
    output_columns = ['Component', 'Namespace', 'cpuRequestCores', 'cpuLimitCores', 'memoryRequestMiB', 'memoryLimitMiB', 'MinReplicas', 'MaxReplicas'] + recommendation_columns

    # Parse the durations once; they do not change between components.
    window_delta = pd.Timedelta(window)
    offset_delta = pd.Timedelta(offset)

    # Work on converted copies so the caller's frames keep their own dtypes:
    # the timestamps live in a detached Series, the instances in a new frame.
    timestamps = pd.to_datetime(data['TimeGenerated'])
    pods = instances.assign(TimeDifference=pd.to_timedelta(instances['TimeDifference']))

    # Only components with at least one fully described instance that has been
    # observed for the whole window get a recommendation.
    eligible = pods.dropna(subset=['TimeDifference', 'Component', 'MinReplicas'])
    eligible = eligible[eligible['TimeDifference'] >= window_delta]

    # Statistic applied to each pod's samples, chosen by the algorithm arguments.
    def cpu_statistic(samples):
        return samples.max() if algorithmCPU == 'Max' else samples.quantile(qCPU)

    def ram_statistic(samples):
        return samples.max() if algorithmRAM == 'Max' else samples.quantile(qRAM)

    updates = []
    for component in eligible['Component'].unique():
        of_component = data['Component'] == component
        # The window ends `offset` before the component's most recent sample.
        end_time = timestamps[of_component].max() - offset_delta
        in_window = of_component & (timestamps >= end_time - window_delta) & (timestamps <= end_time)
        main_data = data[in_window]
        if main_data.empty:
            # No usage samples for this component in the window.
            continue

        # One base value per pod Name of the component.
        per_pod = main_data.groupby('Name').agg(
            base_cpu_recommendation=('CpuCounter', cpu_statistic),
            base_mem_recommendation=('MemoryCounter', ram_statistic)
        ).reset_index()

        # Request = usage scaled up by the target utilisation, rounded, and
        # never below the configured minimum; limit = request times the factor.
        per_pod['RecommendedCpuRequestCores'] = per_pod['base_cpu_recommendation'].apply(lambda usage: max(round(usage / targetCPUUtilization, 3), minRecCpuCores))
        per_pod['RecommendedMemoryRequestMiB'] = per_pod['base_mem_recommendation'].apply(lambda usage: max(round(usage / targetRAMUtilization, 0), minRecRamMiB))
        per_pod['RecommendedCpuLimitCores'] = per_pod['RecommendedCpuRequestCores'].apply(lambda request: round(request * cpu_limit_factor, 3))
        per_pod['RecommendedMemoryLimitMiB'] = per_pod['RecommendedMemoryRequestMiB'].apply(lambda request: round(request * mem_limit_factor, 0))

        updates.append(per_pod[['Name', 'RecommendedCpuRequestCores', 'RecommendedMemoryRequestMiB', 'RecommendedCpuLimitCores', 'RecommendedMemoryLimitMiB']])

    if not updates:
        # pd.concat raises on an empty list; report "no recommendations" as an
        # empty frame with the usual columns instead.
        return pd.DataFrame(columns=output_columns)

    updates_df = pd.concat(updates, ignore_index=True)
    # Attach the per-pod recommendations to every instance row by pod Name.
    instances_data = pd.merge(pods, updates_df, on='Name', how='left')

    # Collapse to one row per component, keeping the largest value of every
    # numeric field across the component's pods.
    instances_data = instances_data.groupby('Component').agg(
        Namespace=('Namespace', 'first'),
        cpuRequestCores=('cpuRequestCores', 'max'),
        cpuLimitCores=('cpuLimitCores', 'max'),
        memoryRequestMiB=('memoryRequestMiB', 'max'),
        memoryLimitMiB=('memoryLimitMiB', 'max'),
        MinReplicas=('MinReplicas', 'max'),
        MaxReplicas=('MaxReplicas', 'max'),
        RecommendedCpuRequestCores=('RecommendedCpuRequestCores', 'max'),
        RecommendedCpuLimitCores=('RecommendedCpuLimitCores', 'max'),
        RecommendedMemoryRequestMiB=('RecommendedMemoryRequestMiB', 'max'),
        RecommendedMemoryLimitMiB=('RecommendedMemoryLimitMiB', 'max')
    ).reset_index()
    # Components that never received a recommendation are not reported.
    instances_data = instances_data.dropna(subset=recommendation_columns)
    return instances_data

# Calculate Recommendations

This cell calculates recommendations for several combinations of window and offset. The '7d' window with the '0d' offset is the combination that is mostly used.

In [None]:
# Look-back windows and end-of-window offsets to evaluate.
windows = ['3d', '7d']
offsets = ['0d', '1d', '2d']

# One recommendation report is written per (window, offset) combination,
# windows varying slowest. CPU uses the maximum and RAM the 95% quantile;
# qCPU is passed along but is not consulted while algorithmCPU is 'Max'.
for window, offset in [(w, o) for w in windows for o in offsets]:
    df_recommended = optimized_recommender(
        df_main_inner,
        df_merged,
        window=window,
        offset=offset,
        algorithmCPU='Max',
        algorithmRAM='Quantile',
        qCPU=0.9999,
        qRAM=0.95,
        targetCPUUtilization=0.7,
        targetRAMUtilization=0.8,
        minRecCpuCores=0.1,
        minRecRamMiB=20,
        cpu_limit_factor=2,
        mem_limit_factor=2,
    )
    df_recommended.to_csv(f'./reports/Recommendations-{source_data_date}-{window}-{offset}Offset.csv', index=False)