# Overview
This workbook does the following:


1.   Import the raw data
2.   Create an all_func_grouped DF
3.   Create a dictionary of DFs, one for each sub-function



In [None]:
# Only for Google Colab: mount Google Drive at /content/drive so that files
# stored there are reachable from the notebook's filesystem. The CSV loaded
# by this notebook is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Necessary Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
import datetime
# Default figure size (width, height in inches) for plots that do not pass
# their own figsize.
plt.rc('figure', figsize=(50, 20))

In [None]:
# Read in the raw data, then derive an absolute timestamp for every row.
df = pd.read_csv("/content/drive/MyDrive/small_data_no_head.csv")

# `end_timestamp` is interpreted as seconds elapsed since 2021-01-31 00:00.
# The conversion is vectorised: pd.to_timedelta handles the whole column at
# once and accepts integer as well as float columns, whereas building
# datetime.timedelta objects row by row is slow and rejects NumPy integers.
df['time'] = pd.Timestamp(2021, 1, 31) + pd.to_timedelta(df['end_timestamp'], unit='s')

# Dataframe summarized
# nunique() counts distinct non-null values of the 'func' column.
print(f"Number of Unique Applications: {df['func'].nunique()}")
df.info()
df.head()


In [None]:
# Plotting Raw Data
# Draw every row's duration against its derived timestamp, as markers joined
# by a line.
plt.plot(df['time'], df['duration'], 'o-')
ax = plt.gca()

# Clamp both axes to the observed data range. Series.min()/max() are used
# instead of the builtins: they run vectorised and skip missing values,
# whereas builtin min()/max() iterate in Python and give order-dependent
# results when NaN is present.
ax.set_xlim([df['time'].min(), df['time'].max()])
ax.set_ylim([df['duration'].min(), df['duration'].max()])

plt.title('Duration Over Time (ALL APPLICATIONS)',  fontsize=40)
plt.xlabel('End Timestamp', fontsize=20)
plt.ylabel('Function Runtime Duration', fontsize=20)
plt.show()

# Preparing the all_func_grouped
Now we need to convert the raw data into a format that is usable for time-series analysis.

In [None]:
# We need to bin all the data

# Assign every row to a 10-minute bin by flooring its timestamp, i.e. rounding
# DOWN to the start of the interval it falls in. The 'min' alias is used
# because the older 'T' alias is deprecated in recent pandas releases.
df['time_bin'] = df['time'].dt.floor('10min')

# Count the rows in each bin. The result is a DataFrame indexed by 'time_bin'
# with a single 'count' column.
# NOTE: only bins that contain at least one row appear here; empty intervals
# are not filled with 0 in this frame.
grouped = df.groupby('time_bin').size().to_frame(name='count')

grouped.info()

In [None]:
# Histogram (20 buckets) of the per-bin row counts
grouped['count'].plot.hist(bins=20, title='count')
# Hide the top and right borders of the current axes
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

In [None]:
# Per-bin row count over time; this is the series we really want to predict
grouped['count'].plot.line(figsize=(8, 4), title='count')
# Hide the top and right borders of the current axes
for side in ('top', 'right'):
    plt.gca().spines[side].set_visible(False)

# Preparing func separated data
Next, we prepare a dictionary of dataframes,
where each key is a function id and each value is the
grouped dataframe for that function.

In [None]:
# Build func_to_df: {function id -> DataFrame with one 'count' column holding
# the number of rows per 10-minute bin for that function}.
func_to_df = {}

# Create the total time range: one shared index of 10-minute bins spanning the
# whole data set, so every per-function frame has the same length and index.
# ('10min' replaces the deprecated '10T' alias.)
start_time = df['time'].min().floor('10min')
end_time = df['time'].max().ceil('10min')
all_time_bins = pd.date_range(start=start_time, end=end_time, freq='10min')

# Group the raw data by function id
func_groups = df.groupby('func')

# Iterate through each group and apply the binning and grouping logic
for func_id, func_data in func_groups:
    # Floor each timestamp down to the start of its 10-minute bin. The bins
    # are kept in a standalone Series rather than written back as a column:
    # func_data is a slice of df produced by groupby, and assigning to it
    # raises SettingWithCopyWarning without affecting df.
    time_bin = func_data['time'].dt.floor('10min')

    # Count the rows that fall in each bin
    bin_counts = time_bin.groupby(time_bin).size()

    # Reindex to include all time bins, filling missing bins with 0, and
    # expose the counts as a DataFrame with a single 'count' column
    grouped_func_data = bin_counts.reindex(all_time_bins, fill_value=0).to_frame(name='count')

    # Store the grouped DataFrame in the dictionary
    func_to_df[func_id] = grouped_func_data

In [None]:
# Sanity check: for every function, print its id and the number of time bins,
# then draw its per-bin counts as a line.
# NOTE(review): no new figure is opened per iteration, so the lines may all be
# drawn onto the same axes — confirm that this overlay is intended.
for func_id, func_data in func_to_df.items():
    print("func_id: ", func_id, len(func_data))
    func_data['count'].plot.line(figsize=(8, 4), title='count')
    # Hide the top and right borders of the current axes
    for side in ('top', 'right'):
        plt.gca().spines[side].set_visible(False)
