In [None]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt

# Notebook-wide pandas settings.
pd.set_option("display.max_rows", 5)       # cap how many rows a DataFrame repr shows
pd.set_option("mode.copy_on_write", True)  # turn on pandas' copy_on_write mode

# Paths of the two input CSV files. Both are blank placeholders and have to be
# filled in before the read_csv cells below can run.
eventsServicesInput = ''
podInstancesInput = ''


## Set up and manipulate the **Pods per Service** File

In [None]:
# Load the pods-per-service CSV into a DataFrame.
podInstancesDf = pd.read_csv(filepath_or_buffer=podInstancesInput)

# Save the inferred column dtypes to a text file so they can be inspected
# outside the notebook.
with open('column_dtypes.txt', mode='w') as dtypesOut:
    dtypesOut.write(str(podInstancesDf.dtypes))


In [None]:
# Parse the textual 'Time' column into a datetime column named 'runTime'.
podInstancesDf['runTime'] = pd.to_datetime(
    arg=podInstancesDf['Time'],
    format='%Y-%m-%d %H:%M:%S',
)

# Show the per-column mean over 5-minute windows. The raw 'Time' column is
# dropped first so that it does not take part in the mean.
# NOTE(review): the aggregated frame is only displayed as the cell output and
# is not assigned, so podInstancesDf itself stays un-aggregated -- confirm
# that this is intended.
(
    podInstancesDf
    .drop(columns='Time')
    .groupby(pd.Grouper(key="runTime", freq='5Min'))
    .mean()
)

## Set up and manipulate the **Events per Service** File

In [None]:
# Load the events-per-service CSV into a DataFrame.
eventsServicesDf = pd.read_csv(filepath_or_buffer=eventsServicesInput)

# Print the inferred dtype of every column as a quick sanity check.
print(eventsServicesDf.dtypes)

In [None]:
# Parse the textual 'Time' column of the events frame into a datetime column
# named 'runTime'.
eventsServicesDf['runTime'] = pd.to_datetime(
    arg=eventsServicesDf['Time'],
    format='%Y-%m-%d %H:%M:%S',
)

# Show the per-column mean over 5-minute windows, with the raw 'Time' column
# dropped so that it does not take part in the mean.
# NOTE(review): the aggregated frame is only displayed as the cell output and
# is not assigned, so eventsServicesDf itself stays un-aggregated -- confirm
# that this is intended.
(
    eventsServicesDf
    .drop(columns='Time')
    .groupby(pd.Grouper(key="runTime", freq='5Min'))
    .mean()
)


## Join the two files together

In [None]:
# Join the pod and event frames on their shared 'runTime' column
# (merge defaults to an inner join, so only matching timestamps survive).
combinePodEvents = podInstancesDf.merge(eventsServicesDf, on='runTime')

# Narrow the result to the timestamp and the two ki-features-service columns:
# the event rate and (presumably) the pod count -- verify against the inputs.
thinPodEvents = combinePodEvents.loc[
    :,
    ['runTime', 'Rate of events: ki-features-service', 'ki-features-service'],
]
print(thinPodEvents.head())

In [None]:

# Derive the event rate per pod: the ki-features-service event rate divided
# element-wise by the 'ki-features-service' column.
thinPodEvents["ratePerPod"] = thinPodEvents["Rate of events: ki-features-service"].div(
    thinPodEvents["ki-features-service"]
)
print(thinPodEvents.head())

In [None]:
# Bucket edges for the 'ki-features-service' values: 1, 21, 41, ..., 201,
# i.e. ten buckets of width 20.
bins = range(1, 202, 20)

# Label every row with its half-open bucket [edge, next_edge) (right=False).
# Values below 1 or at/above 201 fall outside every bucket and become NaN;
# the groupby below skips those rows.
thinPodEvents['podsBucketed'] = pd.cut(thinPodEvents['ki-features-service'], bins=bins, right=False)

# Summary statistics of ratePerPod for each bucket.
# observed=False is spelled out because 'podsBucketed' is categorical: leaving
# it implicit raises a FutureWarning on recent pandas, and the upcoming default
# (observed=True) would silently drop empty buckets from the table and plot.
stats = thinPodEvents.groupby('podsBucketed', observed=False).agg(
    mean=('ratePerPod', 'mean'),
    std=('ratePerPod', 'std'),
    min=('ratePerPod', 'min'),
    max=('ratePerPod', 'max'),
    count=('ratePerPod', 'count')
).reset_index()

In [None]:
# Plot the per-bucket statistics: mean +/- std on the left axis, row count as
# bars on a secondary right-hand axis.
fig, ax = plt.subplots()

# Mean of each bucket with its standard deviation drawn as error bars.
# Bucket intervals are converted to strings so they act as category labels.
ax.errorbar(stats['podsBucketed'].astype(str), stats['mean'], yerr=stats['std'], fmt='o', capsize=5, label='Mean with Std Dev')

# Number of observations in each bucket, on a twin y-axis sharing the same x.
ax2 = ax.twinx()
ax2.bar(stats['podsBucketed'].astype(str), stats['count'], alpha=0.3, label='Count', color='orange')

# Labels and legend
ax.set_xlabel('podsBucketed')
ax.set_ylabel('Mean Value')
ax2.set_ylabel('Count')
ax.set_title('Dependent Variable Statistics by Bucket')
fig.legend(loc='upper left')

# Rotate the bucket labels on the primary axis. plt.xticks() would act on the
# current axes, which after twinx() is ax2 -- and ax2's x-axis is hidden, so
# the visible labels (owned by ax) would never be rotated.
ax.tick_params(axis='x', labelrotation=45)

# Show plot
plt.show()

In [None]:
# Write the per-bucket statistics table to disk, leaving out the index column.
stats.to_csv(path_or_buf='stats.csv', index=False)

In [None]:
# Write the row-level data behind the statistics to disk for manual review,
# leaving out the index column.
thinPodEvents.to_csv(path_or_buf='reviewData.csv', index=False)

In [None]:
# Candidate bucket edges 0, 20, ..., 200, printed one per line.
bins = range(0, 201, 20)
print(*bins, sep='\n')