In [3]:
import pandas as pd
import numpy as np

# Locations of the per-approach result CSVs, relative to the working directory.
# Forward slashes are used as separators because they are accepted on Windows
# as well as on POSIX systems, whereas a backslash is an ordinary filename
# character outside Windows and would make these paths unresolvable there.
# NOTE(review): "wp_markov_chain__approach" (prefix "wp", double underscore)
# is kept exactly as written -- confirm it matches the file name on disk.
file_paths = {
    "wo_markov_chain": "wo_markov_approach/summarization_results_test/wp_markov_chain__approach_summarization_results_test.csv",
    "our_approach": "our_approach/summarization_results_test/our_approach_summarization_results_test.csv",
    "llm_pure": "llm_approach/summarization_results_test/llm_pure_approach_summarization_results_test.csv",
}

# Load each result file into its own DataFrame.
markov_chain_df = pd.read_csv(file_paths["wo_markov_chain"])
our_approach_df = pd.read_csv(file_paths["our_approach"])
llm_pure_df = pd.read_csv(file_paths["llm_pure"])

def extract_average_scores_handling_nan(df, score_column, keys):
    """Average selected entries of the stringified dicts stored in a column.

    Every cell of ``df[score_column]`` is expected to hold the text of a
    Python dict literal (for example ``"{'rouge1': 33.7, 'rouge2': nan}"``).
    For each requested key the values found across all rows are averaged,
    ignoring NaNs.

    Args:
        df: DataFrame containing the score column.
        score_column: Name of the column holding the stringified dicts.
        keys: Iterable of dict keys to average.

    Returns:
        A dict mapping each key to the NaN-ignoring mean of its values.
        A key with no usable value in any row (including the case of an
        empty frame) maps to NaN.

    Raises:
        KeyError: If ``score_column`` is not a column of ``df``. A missing
            column is reported instead of being turned into all-NaN averages.

    Rows that cannot be used (non-string cell, unparsable text, a parsed
    value that is not a dict, a missing key, a non-numeric entry) contribute
    NaN for the affected keys instead of aborting the computation.
    """
    # Names a stringified score dict may reference. Resolving ``nan`` through
    # the evaluation namespace (rather than textually replacing "nan" with
    # "np.nan") leaves keys and values that merely contain the letters "nan"
    # intact.
    allowed_names = {"nan": np.nan, "inf": np.inf, "np": np}

    scores = {key: [] for key in keys}
    for cell in df[score_column]:
        score_dict = None
        if isinstance(cell, str):
            try:
                # SECURITY NOTE: eval() executes the cell text as Python code.
                # Builtins are removed and only the names above are exposed,
                # but this is not a sandbox -- use only on trusted result files.
                score_dict = eval(cell, {"__builtins__": {}, **allowed_names})
            except Exception:
                # Arbitrary text can fail in many ways (SyntaxError, NameError,
                # TypeError, ...); treat any of them as an unusable row while
                # still letting KeyboardInterrupt/SystemExit propagate.
                score_dict = None
        if not isinstance(score_dict, dict):
            score_dict = {}

        for key in keys:
            try:
                scores[key].append(float(score_dict.get(key, np.nan)))
            except (TypeError, ValueError):
                # Non-numeric entry (e.g. None or free text) counts as missing.
                scores[key].append(np.nan)

    averages = {}
    for key, values in scores.items():
        column_values = np.asarray(values, dtype=float)
        # np.nanmean warns on empty/all-NaN input; return NaN quietly instead.
        if np.isnan(column_values).all():
            averages[key] = np.nan
        else:
            averages[key] = np.nanmean(column_values)
    return averages

# Metric names looked up inside each stringified score dictionary
rouge_keys = ['rouge1', 'rouge2', 'rougeL']
bert_keys = ['bert_precision', 'bert_recall', 'bert_f1']
coherence_keys = ['first_order_coherence', 'second_order_coherence']

# (CSV column, metric keys) pairs, in the order the averages are computed
metric_columns = (
    ('ROUGE Scores', rouge_keys),
    ('BERTScore', bert_keys),
    ('Coherence Scores', coherence_keys),
)

# Averages per approach: one dict each for ROUGE, BERTScore and coherence.
# NOTE(review): markov_chain_df is read from the "wo_markov_chain" path entry,
# yet it is labelled 'Markov Chain' in the summary below -- confirm the label.
markov_chain_avg_rouge, markov_chain_avg_bert, markov_chain_avg_coherence = [
    extract_average_scores_handling_nan(markov_chain_df, column, metric_keys)
    for column, metric_keys in metric_columns
]

our_approach_avg_rouge, our_approach_avg_bert, our_approach_avg_coherence = [
    extract_average_scores_handling_nan(our_approach_df, column, metric_keys)
    for column, metric_keys in metric_columns
]

llm_pure_avg_rouge, llm_pure_avg_bert, llm_pure_avg_coherence = [
    extract_average_scores_handling_nan(llm_pure_df, column, metric_keys)
    for column, metric_keys in metric_columns
]

# One summary row per approach, one column per metric
summary_rows = {
    'Markov Chain': (markov_chain_avg_rouge, markov_chain_avg_bert, markov_chain_avg_coherence),
    'Our Approach': (our_approach_avg_rouge, our_approach_avg_bert, our_approach_avg_coherence),
    'LLM Pure': (llm_pure_avg_rouge, llm_pure_avg_bert, llm_pure_avg_coherence),
}
summary_df = pd.DataFrame({
    label: {**rouge_avg, **bert_avg, **coherence_avg}
    for label, (rouge_avg, bert_avg, coherence_avg) in summary_rows.items()
}).T

# Show the comparison table
print(summary_df)


                 rouge1    rouge2     rougeL  bert_precision  bert_recall  \
Markov Chain  33.720140  6.190105  14.607146        0.823217     0.827640   
Our Approach  34.130364  6.392678  14.571875        0.821687     0.825116   
LLM Pure      23.990682  5.447992  11.996699        0.832335     0.829389   

               bert_f1  first_order_coherence  second_order_coherence  
Markov Chain  0.825415               0.852806                0.850670  
Our Approach  0.823393               0.863621                0.862106  
LLM Pure      0.830805               0.729531                0.730952  
