# Evaluate metrics


In [None]:
import json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.preprocessing import StandardScaler


In [None]:

# Metrics shown in the charts (in plotting order) and the legend label used for
# each one. The TER label spells out the transformation applied before plotting.
metrics_to_plot = ['chrf', 'rouge', 'ter', 'bleu', 'codebleu', "manual_evaluation"]
metrics_to_plot_label = ['chrf', 'rouge', '1 - min(ter , 1)', 'bleu', 'codebleu', "manual_evaluation"]

# Files that the (currently disabled) filter at the end of this cell would drop.
file_names_to_exclude = [
    "data/member/repository/MemberEntityListener.java",
    "data/member/controller/RoomController.java",
    "data/member/business/RoomAccommodationService.java",
    "data/member/business/LanguageService.java",
    "data/member/business/AssociateFacultyService.java",
    "data/member/business/FacultyService.java",
    "data/member/business/BlockAccessService.java",
    "data/member/dto/RoleAssignmentCreateDto.java",
]

dataset_path = "data/preprocess_and_result_data/data_manually_evaluated.json"
# Explicit encoding: without it the platform's locale encoding is used, which
# can mis-decode non-ASCII characters in the JSON on some systems.
with open(dataset_path, 'r', encoding="utf-8") as f:
    metric_dataset = json.load(f)

# The exclusion filter is intentionally disabled. Uncomment to drop the files
# listed in ``file_names_to_exclude`` from every later cell.
# metric_dataset = [
#     item for item in metric_dataset
#     if item.get("filename") not in file_names_to_exclude
# ]

In [None]:

# Grouped bar chart: one group per file, one bar per metric.


def _plot_score(item, metric_name):
    """Return the bar height plotted for ``metric_name`` of one dataset entry.

    Args:
        item: One entry of ``metric_dataset`` (a dict loaded from the JSON file).
        metric_name: One of the names listed in ``metrics_to_plot``.

    Returns:
        The value to plot:
        - ``chrf``: the ``score`` field divided by 100.
        - ``rouge``: the ``avg`` field as stored.
        - ``ter``: ``1 - min(score / 100, 1)``, matching the legend label.
        - ``bleu`` / ``codebleu``: the field named like the metric itself.
        - ``manual_evaluation``: the value divided by 10, or 0 when it is ``None``.
        - anything else: the top-level value, or 0 when absent.
    """
    if metric_name == 'manual_evaluation':
        grade = item.get(metric_name, 0)
        # A file that was not graded manually carries None; plot it as 0.
        return grade / 10.0 if grade is not None else 0

    if metric_name not in ('chrf', 'rouge', 'ter', 'bleu', 'codebleu'):
        return item.get(metric_name, 0)  # Default to 0 if metric not found

    details = item.get(metric_name, {})
    if metric_name == 'chrf':
        return details.get('score', 0) / 100.0
    if metric_name == 'rouge':
        # NOTE(review): a missing ROUGE average defaults to 1 while every other
        # metric defaults to 0 -- confirm this is intended.
        return details.get('avg', 1)
    if metric_name == 'ter':
        return 1 - min(details.get('score', 0) / 100.0, 1)
    # bleu / codebleu keep their value under a key named like the metric.
    return details.get(metric_name, 0)


# Extract data for plotting
file_names = [item['filename'] for item in metric_dataset]
num_files = len(file_names)
num_metrics = len(metrics_to_plot)
bar_width = 0.8 / num_metrics  # all bars of one file share 0.8 of a unit slot
index = range(num_files)

fig, ax = plt.subplots(figsize=(20, 10))  # Adjust figure size as needed

for i, metric_name in enumerate(metrics_to_plot):
    metric_values = [_plot_score(item, metric_name) for item in metric_dataset]
    # Shift each metric's bars by one bar width so they sit side by side.
    positions = [p + i * bar_width for p in index]
    ax.bar(positions, metric_values, bar_width, label=metrics_to_plot_label[i], alpha=0.7)

# Customize the plot
ax.set_xlabel("Files")
ax.set_ylabel("Metric Score")
ax.set_title("Metric Comparison Across Files")
# Centre each tick under its group of bars.
ax.set_xticks([p + bar_width * (num_metrics - 1) / 2 for p in index])

# Label each group with the bare file name (last path component).
x_labels = [name.split("/")[-1] for name in file_names]
ax.set_xticklabels(x_labels, rotation=45, ha="right")

# Adjust to match the number of files
ax.set_xlim(-0.5, num_files + 0.5)

ax.legend()
ax.grid(True, which="both", ls="-", alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Build one row per dataset entry holding the values used for the correlation
# analysis. Unlike the bar chart, chrf and ter are kept as their stored
# ``score`` values here (no division by 100, no "1 - min(ter, 1)" transform).
matrix_dataset = []

for item in metric_dataset:
    row = {}
    for metric_name in metrics_to_plot:
        if metric_name in ('chrf', 'ter'):
            row[metric_name] = item.get(metric_name, {}).get('score', 0)
        elif metric_name == 'rouge':
            # ROUGE contributes one column per variant, named "rouge <variant>".
            rouge_scores = item.get(metric_name, {})
            for variant in ('avg', 'rouge1', 'rouge2', 'rougeL', 'rougeLsum'):
                # NOTE(review): missing ROUGE values default to 1 while the
                # other metrics default to 0 -- confirm this is intended.
                row[metric_name + " " + variant] = rouge_scores.get(variant, 1)
        elif metric_name == 'bleu' or metric_name == 'codebleu':
            # These metrics keep their value under a key named like the metric.
            row[metric_name] = item.get(metric_name, {}).get(metric_name, 0)
        elif metric_name == 'manual_evaluation':
            grade = item.get(metric_name, 0)
            # Entries without a manual grade carry None; record them as 0.
            row[metric_name] = grade / 10.0 if grade is not None else 0
        else:
            row[metric_name] = item.get(metric_name, 0)

    matrix_dataset.append(row)



In [None]:
matrix_dataset = pd.DataFrame(matrix_dataset)


def display_correlation(matrix_dataset, method='pearson'):
    """Plot a heatmap of pairwise metric correlations and return the matrix.

    Args:
        matrix_dataset: Tabular metric data; anything ``pd.DataFrame`` accepts.
        method: Correlation method forwarded to ``DataFrame.corr``
            ('pearson', 'spearman' or 'kendall').

    Returns:
        The correlation matrix computed by ``DataFrame.corr``.
    """
    # Options for method: 'pearson', 'spearman', 'kendall'
    correlations = pd.DataFrame(matrix_dataset).corr(method=method)

    # Annotated heatmap, values shown with two decimals.
    heatmap_options = {
        "annot": True,
        "cmap": "coolwarm",
        "fmt": ".2f",
        "linewidths": 0.5,
    }
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlations, **heatmap_options)
    plt.title("Metric Correlation Heatmap")
    plt.show()

    return correlations


correlation_matrix = display_correlation(matrix_dataset)