In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import os
import matplotlib.cm as cm

In [None]:
# Scale labels of the benchmark runs; each one has its own "result_<scale>/" folder.
scales = ["sf1", "sf2", "sf5"]
folders = [f"result_{scale_name}/" for scale_name in scales]
# Index configurations; each one names an "index_stats_<variation>.csv" file.
variations = [
   "all",
   "dates_text",
   "dates",
   "no_index",
   "numbers_dates",
   "numbers_text",
   "numbers",
   "text",
   "default_index",
]

In [None]:
# Single result file used for the step-by-step walkthrough in the next cells.
folder = "result_sf1/"
file = "index_stats_numbers.csv"

# Binary conversion factors used to normalise size strings to megabytes.
kb_in_one_mb = 2 ** 10
bytes_in_one_mb = 2 ** 20
mb_in_one_gb = 2 ** 10

In [None]:
# Load the sample index-statistics file selected above.
df = pd.read_csv(f"{folder}{file}")

In [None]:
# Drop every row that contains a missing value, mutating the loaded frame in place.
# NOTE(review): clean_and_table_info() calls dropna() on its own copy as well, so
# this in-place step looks redundant for the pipeline below — confirm before removing.
df.dropna(inplace=True)

In [None]:
def to_mb(row):
   """Normalise a size string expressed in kB, bytes or GB to megabytes.

   The numeric part is taken from the first whitespace-separated token.
   Values that mention none of the three units are returned untouched.

   Args:
      row: Cell value such as "16 kB", "8192 bytes" or "2 GB".

   Returns:
      A string of the form "<float> MB", or `row` itself when no known
      unit marker is present.
   """
   has_kb = "kB" in row
   has_bytes = "bytes" in row
   has_gb = "GB" in row
   if not (has_kb or has_bytes or has_gb):
      return row

   amount = float(row.split()[0])

   # kB takes precedence over GB; anything left over is a byte count.
   if has_kb:
      megabytes = amount / kb_in_one_mb
   elif has_gb:
      megabytes = amount * mb_in_one_gb
   else:
      megabytes = amount / bytes_in_one_mb

   return f"{megabytes} MB"

In [None]:
def clean_mb_to_number(row):
   """Strip the "MB" unit from a size string and return the number as a float.

   Args:
      row: Cell value such as "12.5 MB".

   Returns:
      The numeric value as a float when "MB" occurs in `row`; otherwise
      `row` is returned unchanged.
   """
   if "MB" in row:
      return float(row.replace("MB", "").strip())
   return row

In [None]:
# Raw size columns whose values may be expressed in kB, bytes or GB.
kb_columns = [
   "Total Size",
   "Total Size of all Indexes",
   "Table Size",
   "Index Size",
]
# Converted counterparts: the same names with the " MB" suffix that
# drop_and_convert_to_mb() appends.
mb_columns = [f"{size_column} MB" for size_column in kb_columns]


In [None]:
def drop_and_convert_to_mb(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame:
   """Replace each listed size column with a "<name> MB" column.

   Args:
      df: Input frame; it is copied, not modified.
      columns: Names of the columns to convert with `to_mb`.

   Returns:
      A new frame in which every column from `columns` has been removed and
      a converted "<name> MB" column has been added for each of them.
   """
   converted = df.copy()
   for name in columns:
      converted[f"{name} MB"] = converted[name].apply(to_mb)
   return converted.drop(columns=columns)

def drop_mb_convert_to_float(df: pd.DataFrame, columns: list[str])-> pd.DataFrame:
   """Turn "<number> MB" strings in the listed columns into floats.

   Args:
      df: Input frame; it is copied, not modified.
      columns: Names of the columns to pass through `clean_mb_to_number`.

   Returns:
      A new frame with the listed columns converted element by element.
   """
   numeric = df.copy()
   for name in columns:
      numeric[name] = numeric[name].apply(clean_mb_to_number)
   return numeric

def get_table_info(df: pd.DataFrame)-> pd.DataFrame:
   """Reduce the per-index rows to one row per table.

   Keeps the first row seen for each "Table Name" and removes the two
   index-specific columns.

   Args:
      df: Frame containing "Table Name", "Index Size MB" and "Index Name".

   Returns:
      A frame with one row per distinct table name and without the
      "Index Size MB" and "Index Name" columns.
   """
   index_only_columns = ["Index Size MB", "Index Name"]
   one_row_per_table = df.drop_duplicates(subset=["Table Name"])
   return one_row_per_table.drop(columns=index_only_columns)

def clean_and_table_info(df : pd.DataFrame) -> pd.DataFrame:
   """Run the full cleaning pipeline and return one row per table.

   Steps: drop rows with missing values, convert the raw size columns
   (`kb_columns`) to "<name> MB" strings, parse those (`mb_columns`) into
   floats, then collapse to one row per table via `get_table_info`.

   Args:
      df: Raw index-statistics frame; it is not modified.

   Returns:
      The per-table summary frame.
   """
   return (
      df.dropna()
      .pipe(drop_and_convert_to_mb, kb_columns)
      .pipe(drop_mb_convert_to_float, mb_columns)
      .pipe(get_table_info)
   )

In [None]:
# Run the cleaning pipeline on the sample frame to get one row per table.
table_info = clean_and_table_info(df)

In [None]:
# Display the per-table summary produced for the sample file.
table_info

In [None]:
def get_total_size_sum(df: pd.DataFrame):
   """Return the sum of the "Total Size MB" column of `df`."""
   total_sizes = df["Total Size MB"]
   return total_sizes.sum()

def get_table_size_sum(df: pd.DataFrame):
   """Return the sum of the "Table Size MB" column of `df`."""
   table_sizes = df["Table Size MB"]
   return table_sizes.sum()

def get_sum(df: pd.DataFrame, column: str):
   """Return the sum of the column named `column` in `df`."""
   selected = df[column]
   return selected.sum()

In [None]:
# Aggregate the per-table sizes (MB) of the sample file; only the table and
# total sums are printed here.
table_size, total_size, index_size = (
   get_table_size_sum(table_info),
   get_total_size_sum(table_info),
   get_sum(table_info, "Total Size of all Indexes MB"),
)
print(table_size, total_size)

In [None]:
# Show the per-table summary again, next to the totals printed in the previous cell.
table_info

In [None]:
# Schema of the summary table: one row per (scale, index variation) pair.
columns = [
   "scale",
   "index",
   "index_size",
   "table_size",
   "total_size",
]
data = pd.DataFrame(columns=columns)


In [None]:
# Collect one summary row per (scale, variation) and build `data` once at the
# end. Creating the frame here (instead of appending to a frame defined in
# another cell) means re-running this cell replaces the summary rather than
# duplicating every row.
summary_rows = []
for scale in scales:
   for variation in variations:
      path = f"result_{scale}/index_stats_{variation}.csv"
      df = pd.read_csv(path)
      table_info = clean_and_table_info(df)

      table_size_sum = get_table_size_sum(table_info)
      index_size_sum = get_sum(table_info, "Total Size of all Indexes MB")
      total_size_sum = get_total_size_sum(table_info)
      summary_rows.append([scale, variation, index_size_sum, table_size_sum, total_size_sum])

      # Label the line with the scale actually being processed; the global
      # `folder` always points at "result_sf1/" and would mislabel sf2/sf5.
      print(f"result_{scale}/{variation} - table: {table_size_sum} + index: {index_size_sum} = total_size {total_size_sum}")

data = pd.DataFrame(summary_rows, columns=columns)


In [None]:
# Per-scale slices of the summary table.
sf1, sf2, sf5 = (
   data[data["scale"] == scale_name] for scale_name in ("sf1", "sf2", "sf5")
)

In [None]:
# Inspect the summary rows for scale sf5.
sf5

In [None]:
# Make sure the output directory for the figures exists. exist_ok=True makes
# the call idempotent and avoids the check-then-create race.
os.makedirs("imgs", exist_ok=True)

In [None]:
plt.rcParams.update({'font.size': 12})  # Set default font size for all text
def create_barplot(data:pd.DataFrame, scale: str):
   """Draw and save a bar chart of the index storage overhead for one scale.

   Args:
      data: Summary table with at least the columns "scale", "index" and
         "index_size".
      scale: Scale label to plot, e.g. "sf1". It selects the matching rows
         of `data` and is used in the title and the output file name.

   Raises:
      ValueError: If `data` contains no rows for `scale`.

   Side effects:
      Saves the figure as imgs/index_size_<scale> at 300 dpi. The "imgs"
      directory must already exist.
   """
   sf = data[data["scale"]==scale]
   if sf.empty:
      raise ValueError(f"no rows found for scale {scale!r}")

   # One colour per plotted bar. pyplot.get_cmap is used because
   # matplotlib.cm.get_cmap was removed in matplotlib 3.9.
   n_bars = len(sf)
   colors = plt.get_cmap('tab10', n_bars).colors

   # Draw on an explicitly created 8x8 figure. The previous plt.figure() call
   # left an empty figure behind because DataFrame.plot opens its own one.
   fig, axes = plt.subplots(figsize=(8,8))
   positions = list(range(n_bars))
   # Axes.bar applies the colour sequence per bar, whereas DataFrame.plot
   # applies it per column (so every bar got the first colour).
   axes.bar(positions, sf["index_size"].to_numpy(dtype=float), width=0.5, color=colors)
   axes.set_xticks(positions)
   axes.set_xticklabels(sf["index"], rotation=45, ha='right')
   axes.spines["top"].set_visible(False)
   axes.spines["left"].set_visible(False)
   axes.spines["right"].set_visible(False)

   # Grid lines are drawn with the axis, so zorder alone does not move them
   # behind the bars; set_axisbelow(True) does.
   axes.set_axisbelow(True)
   axes.yaxis.grid(True, which='major', linestyle='--', linewidth=0.7, zorder=0)
   axes.set_xlabel("Index")
   axes.set_ylabel("Storage overhead (MB)")
   # Computed outside the f-string: reusing the same quote type inside an
   # f-string expression is a SyntaxError before Python 3.12.
   scale_gb = scale.replace("sf", "")
   axes.set_title(f"Storage overhead for indexing scale {scale_gb} GB", fontweight="bold")
   fig.tight_layout()
   fig.savefig(f"imgs/index_size_{scale}",dpi=300)

In [None]:
# Render and save the index storage-overhead chart for scale sf1.
create_barplot(data, "sf1")

In [None]:
# Render and save the index storage-overhead chart for scale sf2.
create_barplot(data, "sf2")


In [None]:
# Render and save the index storage-overhead chart for scale sf5.
create_barplot(data, "sf5")
