# Overview
This workbook does the following:


1.   Imports the raw data
2.   Creates an `all_func_grouped` DataFrame (invocation counts for all functions combined)
3.   Creates a dictionary of DataFrames, one per function



In [None]:
# Google Colab only: mount Google Drive so the trace files under
# /content/drive/MyDrive are readable by the cells below.

from google.colab import drive
drive.mount('/content/drive')

# One-time conversion (uncomment only if the .csv copy does not exist yet):
# copies the raw .txt trace verbatim into a file with a .csv extension.
# with open('/content/drive/MyDrive/AzureFunctionsInvocationTraceForTwoWeeksJan2021.txt', 'r') as source:
#     with open('/content/drive/MyDrive/AzureFunctionsInvocationTraceForTwoWeeksJan2021.csv', 'w') as target:
#         target.write(source.read())


In [None]:
# Necessary Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
import datetime
# Default (width, height) for matplotlib figures; plots that pass their own
# figsize (as several cells below do) override it.
plt.rcParams['figure.figsize'] = (50, 20)

# Data Preparation
Now we have to load the data, build a combined dataset and a per-function dataset,
and then split each of these into training, validation and test subsets.

In [None]:
# Load the raw invocation trace. The columns used by this notebook are
# 'func', 'end_timestamp' and 'duration'.
# (Alternative input: a smaller sample file, kept for quick experiments.)
# df = pd.read_csv("/content/drive/MyDrive/small_data_no_head.csv")
df = pd.read_csv("/content/drive/MyDrive/AzureFunctionsInvocationTraceForTwoWeeksJan2021.csv")

# 'end_timestamp' is treated as seconds elapsed since 2021-01-31 00:00.
# Convert the whole column in one vectorized step instead of building one
# datetime object per row in a Python loop, which is slow on a large trace.
df['time'] = pd.Timestamp(2021, 1, 31) + pd.to_timedelta(df['end_timestamp'], unit='s')

# Dataframe summarized. The count is taken over the 'func' column, so it is
# labelled as functions.
print(f"Number of Unique Functions: {df['func'].nunique()}")
df.info()
df.head()


In [None]:
# Plotting Raw Data: the duration of every invocation against its end time.

plt.plot(df['time'], df['duration'], 'o-')
ax = plt.gca()

# Series.min()/max() are vectorized and skip NaN. The builtin min()/max()
# iterate the column element by element in Python and can return NaN, which
# is not a valid axis limit.
ax.set_xlim([df['time'].min(), df['time'].max()])
ax.set_ylim([df['duration'].min(), df['duration'].max()])

plt.title('Duration Over Time (ALL APPLICATIONS)',  fontsize=40)
plt.xlabel('End Timestamp', fontsize=20)
plt.ylabel('Function Runtime Duration', fontsize=20)
plt.show()

## Preparing the all_func_grouped
Now we need to convert the raw invocation records into a format usable as a time series.

In [None]:
# Bin every invocation into a 10-minute interval and count the rows per bin.

# Floor each timestamp to the start of its 10-minute bin. The 'min' alias is
# used because the single-letter 'T' alias is deprecated in recent pandas.
df['time_bin'] = df['time'].dt.floor('10min')

# Count rows per bin; the result is indexed by 'time_bin' and has a single
# 'count' column.
all_func_grouped = df.groupby('time_bin').size().to_frame(name='count')

# groupby only emits bins that contain at least one row. Insert any missing
# 10-minute bins with a count of 0 so the series is evenly spaced: the
# lookback windows built later in this notebook slice rows by position and
# therefore assume consecutive rows are consecutive bins.
all_func_grouped = all_func_grouped.asfreq('10min', fill_value=0)

all_func_grouped.info()

In [None]:
# Histogram of how many rows fall into each time bin
all_func_grouped['count'].plot.hist(bins=20, title='count')
# Hide the top and right borders of the current axes
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

In [None]:
# Per-bin count over time -- this is the series we really want to predict
all_func_grouped['count'].plot.line(figsize=(8, 4), title='count')
# Hide the top and right borders of the current axes
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

## Duration Dataset Conversion
Here we convert the dataset into a time series format
with the duration as the only feature.

In [None]:
# Build a time-indexed frame holding only 'duration'. resample() needs a
# DatetimeIndex, so 'time' must actually be set as the index here: selecting
# df[['duration']] alone keeps the default integer index, on which resample()
# raises a TypeError.
duration_df = df.set_index('time')[['duration']]

# Resample to equally spaced 10-minute bins, taking the mean duration per bin
# ('10min' replaces the deprecated '10T' alias).
resampled_df = duration_df.resample('10min').mean()


# Bins without any rows have a NaN mean; record them as 0.
resampled_df['duration'] = resampled_df['duration'].fillna(0)
resampled_df.info()
resampled_df

In [None]:
from matplotlib import pyplot as plt
# Histogram of the resampled per-bin duration values
resampled_df['duration'].plot.hist(bins=20, title='duration')
# Hide the top and right borders of the current axes
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

In [None]:
from matplotlib import pyplot as plt
# Resampled per-bin duration over time
resampled_df['duration'].plot.line(figsize=(8, 4), title='duration')
# Hide the top and right borders of the current axes
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

In [None]:
# Persist the resampled duration frame to Drive. index=True writes the
# DataFrame index (the time bins produced by resample) as the first CSV column.
resampled_df.to_csv("/content/drive/MyDrive/AzureFunctionsInvocationTraceForTwoWeeksJan2021_resampled.csv", index=True)

## Preparing func separated data
Now we are going to prepare a dictionary of DataFrames
where the keys are the function ids and the values are the
per-function DataFrames of counts per 10-minute bin.

In [None]:
# Maps each function id to a DataFrame of per-bin row counts
func_to_df = {}

# Shared 10-minute grid spanning the whole trace, so that every function's
# frame has the same index. '10min' replaces the deprecated '10T' alias.
start_time = df['time'].min().floor('10min')
end_time = df['time'].max().ceil('10min')
all_time_bins = pd.date_range(start=start_time, end=end_time, freq='10min')

# Group the raw data by function id
func_groups = df.groupby('func')

# Bin and count each function's rows, then align the result to the shared grid
for func_id, func_data in func_groups:
    # Floor this function's timestamps to their 10-minute bin. The bins are
    # kept in a separate Series instead of being written back into func_data:
    # func_data is a per-group slice of df, and assigning a column to it can
    # trigger pandas' SettingWithCopyWarning.
    time_bins = func_data['time'].dt.floor('10min')

    # Count the rows in each bin
    grouped_func_data = time_bins.groupby(time_bins).size().to_frame(name='count')

    # Reindex to the shared grid; bins without rows for this function get 0
    func_to_df[func_id] = grouped_func_data.reindex(all_time_bins, fill_value=0)

In [None]:
# Sanity check of the per-function data: print every function id together
# with the number of rows in its frame, and plot its counts as a line.

for func_id, func_data in func_to_df.items():
  n_rows = len(func_data)
  print("func_id: ", func_id, n_rows)

  # Every function's line is added to the plot
  func_data['count'].plot.line(figsize=(8, 4), title='count')
  for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)


## Create Lookback Datasets
We need to create a lookback dataset. `create_dataset` defaults to a lookback of 10, but this notebook sets `lookback = 20`.

In [None]:
def create_dataset(data, lookback = 10):
    """Build sliding-window samples for one-step-ahead prediction.

    Sample k pairs the `lookback` values that precede position
    k + lookback (a row of X) with the value at that position (an entry of y).

    Args:
        data: Sequence of values (list, NumPy array or pandas Series). It is
            always indexed by position; the index labels of a Series are
            ignored.
        lookback: Number of preceding values in each window.

    Returns:
        Tuple (X, y) of NumPy arrays. For 1-D input, X has shape
        (len(data) - lookback, lookback) and y has shape
        (len(data) - lookback,). If data has no more than `lookback` values,
        both are empty and X has shape (0, lookback).
    """
    # Convert once to a plain array so that all indexing below is positional.
    # Indexing a pandas Series with data[i] is a label lookup, which fails or
    # picks the wrong row for integer labels and is deprecated as a positional
    # fallback for other index types.
    values = np.asarray(data)
    n_values = len(values)

    if n_values <= lookback:
        # Not enough values for a single window: keep the window dimension so
        # callers can still rely on X.shape[1] == lookback.
        trailing = values.shape[1:]
        empty_X = np.empty((0, lookback) + trailing, dtype=values.dtype)
        empty_y = np.empty((0,) + trailing, dtype=values.dtype)
        return empty_X, empty_y

    X = np.array([values[i - lookback:i] for i in range(lookback, n_values)])
    # Copy so that y never aliases the caller's underlying data.
    y = values[lookback:].copy()

    return X, y


# Lookback actually used by the cells below (overrides the default of 10)
lookback = 20

### all_func_grouped Normalized & Lookback

Here we are going to create a lookback dataset for all the functions added together.

In [None]:
# Apply log(1 + x) to the counts and keep the result as a new column
log_counts = np.log1p(all_func_grouped['count'])
all_func_grouped['log1p_count'] = log_counts

# Build the lookback windows (X) and their targets (y) from the transformed column
all_func_grouped_X, all_func_grouped_y = create_dataset(
    all_func_grouped['log1p_count'],
    lookback,
)

# Report the shapes of both arrays and preview the frame
print("all_func_grouped_X", all_func_grouped_X.shape)
print("all_func_grouped_y", all_func_grouped_y.shape)
all_func_grouped.head()


### func separated data normalized & lookback

Here we are going to create a lookback dataset for each function separately.

In [None]:
# Per-function lookback windows (X) and targets (y), keyed by function id
func_to_df_X = {}
func_to_df_y = {}

for func_id, func_data in func_to_df.items():
  # Apply log(1 + x) to the counts and add the result as a new column.
  # func_data is the very object stored in func_to_df, so the dictionary
  # entry gains the column without having to be reassigned.
  func_data['log1p_count'] = np.log1p(func_data['count'])

  # Now we create the func_data_X and func_data_y
  func_data_X, func_data_y = create_dataset(func_data['log1p_count'], lookback)

  # Populate the X and Y dictionaries
  func_to_df_X[func_id] = func_data_X
  func_to_df_y[func_id] = func_data_y

  # Display relevant information
  print("func_data_X", np.shape(func_data_X))
  print("func_data_y", np.shape(func_data_y))

print("func_to_df_X: ", len(func_to_df_X))
print("func_to_df_y: ", len(func_to_df_y))

## Creating a Train, Valid, & Test subset

We are going to follow this pattern:



*   First 60% is Train
*   Next 20% is Validation
*   Last 20% is Test



### all_func_grouped train, val, test