In [16]:
import re

def _parse_record_header(line):
    """Parse a line of the form ``id,rep,response...`` into its parts.

    Args:
        line (str): A stripped line from the results file.

    Returns:
        tuple | None: ``(id, rep, response_start)`` when the line starts with a
        digit and its first two comma-separated fields are both integers;
        ``None`` otherwise (the line is then ordinary response text).
    """
    if not line or not line[0].isdigit():
        return None
    # Split at most twice so commas inside the response text are kept intact.
    parts = line.split(',', 2)
    if len(parts) < 2:
        return None
    try:
        record_id = int(parts[0])
        record_rep = int(parts[1])
    except ValueError:
        return None
    response_start = parts[2] if len(parts) > 2 else ""
    return record_id, record_rep, response_start


def process_csv_and_extract_responses(csv_file_path):
    """Read a results file and extract the four metric scores of every record.

    The file is scanned line by line. A line whose first two comma-separated
    fields are integers starts a new record (``id,rep,response...``); every
    other line is appended to the response text of the current record. Lines
    that merely start with a digit (e.g. ``"1. Coherence: 4"``) are treated as
    response text, so they neither end the current record early nor get lost.

    Args:
        csv_file_path (str): Path of the file to read.

    Returns:
        list: One ``(id, rep, coherence, fluency, relevance, consistency)``
        tuple per record, in file order. Each metric is whatever
        ``extract_metrics`` returns for the record's joined response text.
    """
    extracted_data = []  # (id, rep, coherence, fluency, relevance, consistency) tuples
    current_id = None
    current_rep = None
    current_response = []

    def save_record(record_id, record_rep, response_lines):
        """Score the accumulated response text and store the record."""
        response_text = " ".join(response_lines)
        coherence, fluency, relevance, consistency = extract_metrics(response_text)
        extracted_data.append((record_id, record_rep, coherence, fluency, relevance, consistency))

    with open(csv_file_path, 'r') as csvfile:
        for line in csvfile:
            line = line.strip()
            header = _parse_record_header(line)
            if header is None:
                # Not a record header: part of the current response
                # (text seen before the first header is discarded below).
                current_response.append(line)
                continue

            # A valid header closes the previous record, if there is one.
            if current_id is not None:
                save_record(current_id, current_rep, current_response)
            current_id, current_rep, first_chunk = header
            current_response = [first_chunk]

    # Save the final record, if any
    if current_id is not None:
        save_record(current_id, current_rep, current_response)

    return extracted_data

def extract_metrics(response_text):
    """Pull the four metric scores out of a response.

    Returns a ``(coherence, fluency, relevance, consistency)`` tuple; each
    element is the result of ``extract_metric`` for that metric's pattern.
    """
    # Order matters: it defines the order of the returned tuple.
    metric_patterns = (
        r"coherence:\s*(\d+)",
        r"fluency:\s*(\d+)",
        r"relevance:\s*(\d+)",
        r"consistency:\s*(\d+)",
    )
    scores = [extract_metric(response_text, metric_pattern) for metric_pattern in metric_patterns]
    return tuple(scores)

def extract_metric(text, pattern):
    """Return the first capture group of ``pattern`` in ``text`` as an int.

    The search is case-insensitive. ``None`` is returned when the pattern
    does not occur in the text.
    """
    found = re.search(pattern, text, flags=re.IGNORECASE)
    if found is None:
        return None
    return int(found.group(1))

# Example usage: parse the results file and print one line of scores per record.
file_path = 'model2results.csv'  # Replace with your CSV file path
data_with_responses = process_csv_and_extract_responses(file_path)
final_response =""  # NOTE(review): not referenced in any visible cell — confirm before removing
for entry in data_with_responses:
    # entry layout: (id, rep, coherence, fluency, relevance, consistency)
    print(f"ID: {entry[0]}, Reps: {entry[1]}, coherence: {entry[2]}, fluency: {entry[3]}, relevance: {entry[4]}, consistency: {entry[5]}")
    




ID: 0, Reps: 0, coherence: 4, fluency: 5, relevance: 4, consistency: 3
ID: 0, Reps: 1, coherence: 4, fluency: 4, relevance: 5, consistency: 4
ID: 0, Reps: 2, coherence: 4, fluency: 5, relevance: 4, consistency: 5
ID: 1, Reps: 0, coherence: 4, fluency: 5, relevance: 4, consistency: 5
ID: 1, Reps: 1, coherence: 4, fluency: 4, relevance: 5, consistency: 4
ID: 1, Reps: 2, coherence: 4, fluency: 5, relevance: 4, consistency: 3
ID: 2, Reps: 0, coherence: 4, fluency: 5, relevance: 4, consistency: 5
ID: 2, Reps: 1, coherence: 4, fluency: 5, relevance: 4, consistency: 5
ID: 2, Reps: 2, coherence: 3, fluency: 4, relevance: 4, consistency: 3
ID: 3, Reps: 0, coherence: 4, fluency: 4, relevance: 5, consistency: 4
ID: 3, Reps: 1, coherence: 4, fluency: 5, relevance: 4, consistency: 3
ID: 3, Reps: 2, coherence: 4, fluency: 4, relevance: 5, consistency: 4
ID: 4, Reps: 0, coherence: 4, fluency: 4, relevance: 5, consistency: 3
ID: 4, Reps: 1, coherence: 4, fluency: 5, relevance: 4, consistency: 5
ID: 4,

In [17]:
def calculate_average_scores_by_groups(data, group_size=3):
    """
    Calculate per-ID average scores for coherence, fluency, relevance, and
    consistency, handling None values.

    Entries are grouped by their actual ID (first tuple element) rather than
    by their position in the list, so a missing or extra record for one ID
    cannot shift entries into the group of another ID.

    Args:
        data (list): A list of tuples in the format:
                     (id, rep, coherence, fluency, relevance, consistency)
        group_size (int): Minimum number of entries an ID must have to be
                     reported. IDs with fewer entries are treated as
                     incomplete and skipped. Defaults to 3.

    Returns:
        dict: A dictionary where the key is the ID (in order of first
              appearance) and the value is another dictionary with average
              scores for coherence, fluency, relevance, and consistency.
              A metric whose values are all None averages to None.
    """
    def safe_average(values):
        """Calculate the average of values, ignoring None values."""
        valid_values = [v for v in values if v is not None]
        return sum(valid_values) / len(valid_values) if valid_values else None

    # Position of each metric inside an entry tuple; the dict order is also
    # the key order of every result dictionary.
    metric_positions = {
        "coherence": 2,
        "fluency": 3,
        "relevance": 4,
        "consistency": 5,
    }

    # Collect the entries of each ID, keeping first-seen ID order.
    entries_by_id = {}
    for entry in data:
        entries_by_id.setdefault(entry[0], []).append(entry)

    averages_by_id = {}
    for entry_id, group in entries_by_id.items():
        if len(group) < group_size:  # Ignore incomplete groups
            continue
        averages_by_id[entry_id] = {
            metric: safe_average([entry[position] for entry in group])
            for metric, position in metric_positions.items()
        }

    return averages_by_id

# Average the extracted scores per ID and print one line per ID.
averages = calculate_average_scores_by_groups(data_with_responses)
# The loop variable is deliberately not called `scores`: that global name holds
# the expert scores used by the correlation cells, and rebinding it here would
# silently corrupt them if the cells are executed out of order.
for entry_id, average_scores in averages.items():
    print(f"ID: {entry_id}, Averages: {average_scores}")



ID: 0, Averages: {'coherence': 4.0, 'fluency': 4.666666666666667, 'relevance': 4.333333333333333, 'consistency': 4.0}
ID: 1, Averages: {'coherence': 4.0, 'fluency': 4.666666666666667, 'relevance': 4.333333333333333, 'consistency': 4.0}
ID: 2, Averages: {'coherence': 3.6666666666666665, 'fluency': 4.666666666666667, 'relevance': 4.0, 'consistency': 4.333333333333333}
ID: 3, Averages: {'coherence': 4.0, 'fluency': 4.333333333333333, 'relevance': 4.666666666666667, 'consistency': 3.6666666666666665}
ID: 4, Averages: {'coherence': 4.0, 'fluency': 4.666666666666667, 'relevance': 4.333333333333333, 'consistency': 4.333333333333333}
ID: 5, Averages: {'coherence': 3.6666666666666665, 'fluency': 4.333333333333333, 'relevance': 4.0, 'consistency': 4.333333333333333}
ID: 6, Averages: {'coherence': 3.6666666666666665, 'fluency': 4.666666666666667, 'relevance': 3.3333333333333335, 'consistency': 4.666666666666667}
ID: 7, Averages: {'coherence': 3.6666666666666665, 'fluency': 4.333333333333333, 'rel

In [36]:
import csv

def extract_expert_scores_from_file(csv_file_path, id_offset=1100):
    """
    Extract expert scores for each ID from a CSV file.

    The first row is the header and must contain the columns
    ``expert_coherence_score``, ``expert_relevance_score``,
    ``expert_consistency_score`` and ``expert_fluency_score``. The ID is read
    from the first column of each data row.

    Args:
        csv_file_path (str): The path to the CSV file.
        id_offset (int): Value subtracted from the integer in the first column
            to obtain the ID used as dictionary key. Defaults to 1100.

    Returns:
        dict: A dictionary with IDs as keys and dictionaries of expert scores
              (keys "coherence", "relevance", "consistency", "fluency") as
              values. An empty cell yields None for that score. Rows that are
              empty, too short, or contain non-numeric values are skipped.
              A file without any rows yields an empty dictionary.

    Raises:
        ValueError: If one of the required columns is missing from the header.
    """
    # Result key -> header column name; the order is the key order of each result dict.
    score_columns = {
        "coherence": "expert_coherence_score",
        "relevance": "expert_relevance_score",
        "consistency": "expert_consistency_score",
        "fluency": "expert_fluency_score",
    }
    scores_by_id = {}

    # newline='' is what the csv module asks for, so that line endings and
    # newlines embedded in quoted fields are interpreted by the reader itself.
    with open(csv_file_path, 'r', newline='') as csvfile:
        csv_reader = csv.reader(csvfile)
        header = next(csv_reader, None)  # Read the header
        if header is None:  # Completely empty file
            return scores_by_id

        # Find indices of required columns
        expert_score_indices = {
            name: header.index(column) for name, column in score_columns.items()
        }

        # Process each row in the CSV
        for row in csv_reader:
            if not row:  # Skip empty rows
                continue

            try:
                entry_id = int(row[0]) - id_offset  # Extract and adjust ID
                row_scores = {}
                for name, index in expert_score_indices.items():
                    cell = row[index]
                    row_scores[name] = float(cell) if cell else None
            except (ValueError, IndexError):
                # Best effort: skip rows where conversion or indexing fails
                continue

            scores_by_id[entry_id] = row_scores

    return scores_by_id

# Example usage: load the expert scores into the global `scores`, which the
# correlation cells read.
csv_file_path = 'csv/powerful_model.csv'  # Replace with your CSV file path
scores = extract_expert_scores_from_file(csv_file_path)

# Output the results: one line per ID with its dictionary of expert scores
for entry_id, scores_dict in scores.items():
    print(f"ID: {entry_id}, Scores: {scores_dict}")


ID: 0, Scores: {'coherence': 5.0, 'relevance': 4.333333333333333, 'consistency': 5.0, 'fluency': 5.0}
ID: 1, Scores: {'coherence': 4.0, 'relevance': 5.0, 'consistency': 5.0, 'fluency': 5.0}
ID: 2, Scores: {'coherence': 3.6666666666666665, 'relevance': 4.333333333333333, 'consistency': 5.0, 'fluency': 5.0}
ID: 3, Scores: {'coherence': 5.0, 'relevance': 5.0, 'consistency': 5.0, 'fluency': 5.0}
ID: 4, Scores: {'coherence': 4.333333333333333, 'relevance': 4.333333333333333, 'consistency': 5.0, 'fluency': 5.0}
ID: 5, Scores: {'coherence': 5.0, 'relevance': 5.0, 'consistency': 5.0, 'fluency': 5.0}
ID: 6, Scores: {'coherence': 3.6666666666666665, 'relevance': 3.6666666666666665, 'consistency': 5.0, 'fluency': 4.666666666666667}
ID: 7, Scores: {'coherence': 3.3333333333333335, 'relevance': 4.0, 'consistency': 5.0, 'fluency': 5.0}
ID: 8, Scores: {'coherence': 4.333333333333333, 'relevance': 4.0, 'consistency': 5.0, 'fluency': 5.0}
ID: 9, Scores: {'coherence': 4.333333333333333, 'relevance': 4.6

In [34]:
from scipy.stats import pearsonr

def calculate_pearson_correlation(averages, scores):
    """
    Compute, per attribute, the Pearson correlation between two score tables.

    Only IDs present in both dictionaries are compared, and for each attribute
    only those IDs where both sides hold a non-None value.

    Args:
        averages (dict): Maps ID -> dict of attribute averages.
        scores (dict): Maps ID -> dict of attribute scores.

    Returns:
        dict: Maps each of "coherence", "relevance", "consistency", "fluency"
              to its Pearson coefficient, or to None when fewer than two
              valid pairs exist for that attribute.
    """
    shared_ids = averages.keys() & scores.keys()
    result = {}

    for name in ("coherence", "relevance", "consistency", "fluency"):
        # Pair up both sides first, then drop pairs with a missing value.
        candidate_pairs = [
            (averages[key].get(name), scores[key].get(name)) for key in shared_ids
        ]
        valid_pairs = [
            (left, right)
            for left, right in candidate_pairs
            if not (left is None or right is None)
        ]

        # A correlation needs at least two data points.
        if len(valid_pairs) <= 1:
            result[name] = None
            continue

        left_values = [left for left, _ in valid_pairs]
        right_values = [right for _, right in valid_pairs]
        coefficient, _ = pearsonr(left_values, right_values)
        result[name] = coefficient

    return result

# Correlate the per-ID averages with the expert scores and print one line per attribute.
# Relies on the globals `averages` and `scores` assigned in other cells.
correlations = calculate_pearson_correlation(averages, scores)
for attr, corr in correlations.items():
    print(f"Pearson correlation for {attr}: {corr}")


Pearson correlation for coherence: 0.058214239422191824
Pearson correlation for relevance: 0.15468920215307905
Pearson correlation for consistency: 0.049989488853502545
Pearson correlation for fluency: 0.017720231767457444


In [37]:
from scipy.stats import spearmanr

def calculate_spearman_correlation(dict1, dict2):
    """
    Calculate Spearman correlation for each attribute between two dictionaries.

    Only IDs present in both dictionaries are compared, and for each attribute
    only those IDs where both sides hold a non-None value. An attribute that
    is absent from an entry is treated like a None value.

    Args:
        dict1 (dict): The first dictionary containing scores (e.g., expert scores).
        dict2 (dict): The second dictionary containing scores (e.g., averaged scores).

    Returns:
        dict: Spearman correlation coefficients for each of "coherence",
              "relevance", "consistency", "fluency"; None for an attribute
              with fewer than two valid pairs.
    """
    correlations = {}

    # Only IDs present in both dictionaries can be paired
    common_ids = set(dict1.keys()) & set(dict2.keys())
    attributes = ["coherence", "relevance", "consistency", "fluency"]

    for attr in attributes:
        # Build both value lists in one pass so they are guaranteed to stay aligned
        values1 = []
        values2 = []
        for entry_id in common_ids:
            value1 = dict1[entry_id].get(attr)
            value2 = dict2[entry_id].get(attr)
            if value1 is not None and value2 is not None:
                values1.append(value1)
                values2.append(value2)

        # A rank correlation needs at least two data points
        if len(values1) > 1:
            correlation, _ = spearmanr(values1, values2)
            correlations[attr] = correlation
        else:
            correlations[attr] = None  # Not enough valid data to calculate correlation

    return correlations

# Relies on the globals `averages` and `scores` assigned in other cells.
# Here `averages` is passed as dict1 and the expert `scores` as dict2.
spearman_correlations = calculate_spearman_correlation(averages, scores)

# Output the results: one line per attribute
for attr, corr in spearman_correlations.items():
    print(f"Spearman correlation for {attr}: {corr}")


Spearman correlation for coherence: -0.017490406404858678
Spearman correlation for relevance: -0.10218742665393342
Spearman correlation for consistency: -0.11423252086268515
Spearman correlation for fluency: 0.06965942021223112
