In [2]:
import pandas as pd

In [30]:
import pandas as pd

def compare_csvs(csv_path1, csv_path2, rel_tol=0.1, zero_abs_tol=5):
    """
    Compare the 'Value' entries of two CSV files, matched on 'Material'.

    Both files must provide 'Material' and 'Value' columns. Rows of the first
    CSV that share a material are grouped together, and a material counts as
    correct only if *every* one of its values is within tolerance of the
    value listed for it in the second CSV.

    Parameters:
    csv_path1 (str): Path to the first CSV file (the values being checked).
    csv_path2 (str): Path to the second CSV file (the reference values).
    rel_tol (float): Maximum allowed relative error, measured against the
        magnitude of the reference value. Defaults to 0.1 (10%).
    zero_abs_tol (float): Maximum allowed absolute error when the reference
        value is exactly 0, where a relative error is undefined.
        Defaults to 5.

    Returns:
    tuple: A tuple containing the number of repeated material rows in the
           first CSV, the number of rows produced by matching the grouped
           first CSV against the second CSV on 'Material', and the fraction
           of those rows whose values are all within tolerance (0 when
           nothing matches).
    """
    # Load the CSV files into pandas DataFrames
    df1 = pd.read_csv(csv_path1)
    df2 = pd.read_csv(csv_path2)

    # Every occurrence of a material after its first one counts as a repeat
    repeated_materials = df1.duplicated(subset=['Material']).sum()

    # Collapse the first CSV to one row per material holding all its values
    grouped = df1.groupby('Material')['Value'].apply(list).reset_index()

    # Inner join keeps only the materials that appear in both files;
    # 'Value_x' is the list from the first CSV, 'Value_y' the reference.
    matching_materials = pd.merge(
        grouped, df2, on='Material', how='inner', suffixes=('_x', '_y')
    )

    match_count = len(matching_materials)
    if match_count == 0:
        # Nothing to score; returning here also avoids a row-wise pass over
        # an empty frame.
        return repeated_materials, 0, 0

    def _all_within_tolerance(values, reference):
        """Return True if every value is within tolerance of the reference."""
        if reference == 0:
            return all(abs(value - reference) <= zero_abs_tol for value in values)
        # Divide by the magnitude of the reference: dividing by a negative
        # reference would make the ratio negative and always pass the check.
        scale = abs(reference)
        return all(abs(value - reference) / scale <= rel_tol for value in values)

    correct_count = sum(
        _all_within_tolerance(values, reference)
        for values, reference in zip(
            matching_materials['Value_x'], matching_materials['Value_y']
        )
    )

    accuracy = correct_count / match_count

    return repeated_materials, match_count, accuracy

# Example usage: run the comparison on two CSV files given by relative paths
csv_path1, csv_path2 = (
    'PolymerTransitionTemperature_politely_6_2024_03_04-142435.csv',
    'test.csv',
)
repeated_materials, matching_material_count, accuracy = compare_csvs(
    csv_path1=csv_path1,
    csv_path2=csv_path2,
)

# Report the three summary statistics, one per line
print(
    f"Number of repeated materials in the first CSV: {repeated_materials}",
    f"Materials in the first CSV that are also in the second CSV: {matching_material_count}",
    f"Accuracy (materials within 10% error margin): {accuracy:.2%}",
    sep="\n",
)


Number of repeated materials in the first CSV: 46
Materials in the first CSV that are also in the second CSV: 21
Accuracy (materials within 10% error margin): 57.14%
