# Analysis of Generated Data

In [None]:
import os, sys, json

import pandas as pd

import matplotlib.pyplot as plt

# Get the current working directory of the notebook (the process's cwd at
# the moment this cell runs)
notebook_dir = os.getcwd()
print(notebook_dir)
# Add the parent directory to the system path -- presumably so the
# project-local imports that follow (log_files, data_processing) resolve
# from there; confirm against the repository layout
sys.path.append(os.path.join(notebook_dir, '../'))

import log_files
from data_processing import DataProcessing

In [None]:
# Widen pandas' display limits so inspected frames are shown in full:
# up to 800 characters per cell, and no cap on displayed columns or rows.
# The fully-qualified option name is used for max_colwidth (like the two
# options below) instead of relying on pandas' partial-name matching, which
# can become ambiguous if similarly named options are added later.
pd.set_option('display.max_colwidth', 800)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Accumulates the summary counts computed in the cells below; it is later
# passed to create_plots and written to disk with save_json.
analysis_dict: dict = {}

## Load Predictions

In [None]:
# Load the prediction logs through the project helper and preview the
# first 7 rows.
log_file_path = "data/prediction_logs"
# Flag forwarded to read_data; presumably selects prediction-style parsing --
# confirm against log_files.read_data.
predictions = True
predictions_df = log_files.read_data(notebook_dir, log_file_path, predictions)
predictions_df.head(7)

## Load Observations

In [None]:
# Load the observation logs through the project helper and preview the
# first 7 rows. Note that log_file_path and predictions are rebound here.
log_file_path = "data/observation_logs"
# Flag forwarded to read_data; presumably selects observation-style parsing
# when False -- confirm against log_files.read_data.
predictions = False
observations_df = log_files.read_data(notebook_dir, log_file_path, predictions)
observations_df.head(7)

In [None]:
# Combine the prediction and observation frames via the project helper
# DataProcessing.concat_dfs and preview the first 7 rows of the result.
# NOTE(review): the analysis cells below work on predictions_df and
# observations_df separately and do not appear to use `df` -- confirm it is
# still needed.
df = DataProcessing.concat_dfs([predictions_df, observations_df])
df.head(7)

## Analysis

### Number of predictions and number of observations

In [None]:
# Record the total number of rows in each frame, then echo the dictionary so
# the notebook displays its current contents.
analysis_dict.update({
    "#predictions": len(predictions_df),
    "#observations": len(observations_df),
})
analysis_dict

In [None]:
def get_column_counts(df: pd.DataFrame, analysis_dict: dict, type_data: str):
    """
    Tallies how often each value occurs in four columns of ``df``.

    For the columns "Domain", "Model Name", "API Name" and "Template Number",
    the value counts are converted to a plain dict and stored in
    ``analysis_dict`` under a key of the form ``"#<type_data> per <label>"``.

    Args:
        df: Frame containing the four columns listed above.
        analysis_dict: Dictionary that receives the counts (mutated in place).
        type_data: Label inserted into each key, e.g. "predictions".

    Returns:
        The same ``analysis_dict`` object that was passed in.
    """
    # (column in df, label used in the analysis_dict key), in insertion order
    column_labels = (
        ("Domain", "domain"),
        ("Model Name", "model name"),
        ("API Name", "api name"),
        ("Template Number", "template #"),
    )

    for column, label in column_labels:
        counts = df[column].value_counts()
        analysis_dict[f"#{type_data} per {label}"] = counts.to_dict()

    return analysis_dict

In [None]:
# Add the per-column counts for both frames to analysis_dict (mutated in
# place). The second call is the last expression of the cell, so its return
# value -- analysis_dict itself -- is what the notebook displays.
get_column_counts(predictions_df, analysis_dict, type_data="predictions")
get_column_counts(observations_df, analysis_dict, type_data="observations")

## Plot Data: Create + Save
- Save the dictionary only after the plots have been created: successful plotting confirms that the dictionary has the proper structure, i.e. that its contents can be extracted and plotted.

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

def create_versioned_run_directory(base_path):
    """
    Creates a unique, versioned directory for a script run.
    If 'path/to/analysis' exists, it will create 'path/to/analysis_v1', etc.
    Returns the path of the directory that was created.

    Args:
        base_path: Desired directory path; trailing '/' or '\\' characters
            are ignored.

    Returns:
        The path of the newly created directory: ``base_path`` itself if it
        was free, otherwise the first free ``{base_path}_v{N}`` with N >= 1.

    Raises:
        OSError: If the directory cannot be created for a reason other than
            the candidate path already existing.
    """
    # Strip *all* trailing separators. Removing only one would turn
    # 'analysis//' into 'analysis/', and the versioned candidates would then
    # be created as 'analysis/_v1' inside the existing directory.
    base_path = base_path.rstrip('/\\')

    # Start with the base path as the first candidate
    run_directory = base_path
    counter = 1

    # Try to create the candidate directly instead of checking for existence
    # first: this avoids a check-then-create race with concurrent runs and
    # also skips candidates that are occupied by a file or dangling symlink.
    while True:
        try:
            os.makedirs(run_directory)
            break
        except FileExistsError:
            run_directory = f"{base_path}_v{counter}"
            counter += 1

    print(f"Created output directory for this run: {run_directory}\n")
    return run_directory

def create_plots(results: dict, prediction_key: str, observation_key: str, save_data: bool, output_directory: str):
    """
    Generates and saves a bar plot into the specified output_directory.
    The directory is expected to already exist.

    Args:
        results: Dictionary holding the values to plot.
        prediction_key: Key in ``results`` whose value forms the
            'Predictions' row of the plotted frame.
        observation_key: Key in ``results`` whose value forms the
            'Observations' row of the plotted frame.
        save_data: When True, the figure is written as a PNG file.
        output_directory: Directory that receives the PNG file.

    Raises:
        KeyError: If either key is missing from ``results``.
    """
    df = pd.DataFrame([results[prediction_key], results[observation_key]], index=['Predictions', 'Observations'])
    title = f"{prediction_key} x {observation_key}"
    # Plot the DataFrame and keep a handle on its figure so that layout,
    # saving and closing all act on exactly this figure rather than on
    # whatever pyplot currently considers the "current" one.
    ax = df.plot(kind='bar', title=title, xlabel='Predictions/Observations', ylabel='Counts', rot=360)
    fig = ax.get_figure()

    try:
        # Adjust the layout *before* saving; applying it after savefig has no
        # effect on the file that was written.
        fig.tight_layout()

        if save_data:
            # The filename is simple and doesn't need versioning: spaces
            # become underscores and '#' characters are dropped.
            image_filename = f"{prediction_key}_vs_{observation_key}.png".replace(" ", "_").replace("#", "")

            # The full path is simply the pre-determined directory + the filename.
            full_path = os.path.join(output_directory, image_filename)

            fig.savefig(full_path, bbox_inches='tight')
            print(f"Plot saved to: {full_path}\n")
    finally:
        # Always release the figure, even if saving fails, so repeated calls
        # do not accumulate open figures.
        plt.close(fig)

def save_json(obj, filepath):
    """
    Saves a dictionary or other JSON-serializable object to a file.
    Ensures the target directory exists before writing.

    Args:
        obj: JSON-serializable object to write.
        filepath: Destination file path. May be a bare filename, in which
            case the file is written to the current working directory.

    Raises:
        TypeError: If ``obj`` is not JSON-serializable (from ``json.dump``).
        OSError: If the directory or file cannot be created.
    """
    directory = os.path.dirname(filepath)
    # dirname is "" for a bare filename such as "out.json", and
    # os.makedirs("") raises FileNotFoundError -- only create a directory
    # when the path actually names one.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as f:
        # indent=4 makes the JSON file human-readable
        json.dump(obj, f, indent=4)
    print(f"Data saved to: {filepath}\n")

In [None]:
from datetime import date

# Get today's date; it is interpolated into the output directory path in the
# next cell
today = date.today()

# Print the date so the notebook output shows which date this run used
print("Today's date:", today)

In [None]:
# Every analysis run writes beneath a directory named after today's date.
base_output_path = f"../data/dataset_analyses/{today}"

# Reserve ONE fresh, versioned directory and reuse it for every output of
# this run (the helper is called exactly once).
run_output_directory = create_versioned_run_directory(base_output_path)

# (prediction key, observation key) pairs in analysis_dict; one bar plot is
# created and saved per pair, in this order.
plot_key_pairs = [
    ("#predictions", "#observations"),
    ("#predictions per domain", "#observations per domain"),
    ("#predictions per model name", "#observations per model name"),
    ("#predictions per api name", "#observations per api name"),
    ("#predictions per template #", "#observations per template #"),
]
for prediction_key, observation_key in plot_key_pairs:
    create_plots(analysis_dict, prediction_key, observation_key, True, run_output_directory)

# Write the dictionary next to the plots once all of them have been created.
json_filepath = os.path.join(run_output_directory, "analysis_dict.json")
save_json(analysis_dict, json_filepath)