In [40]:
import pandas as pd
from rdkit import Chem
def compare_csvs(csv_path1, csv_path2):
    """
    Compare two CSV files in terms of repeated materials, matching materials, and accuracy based on value comparison.

    Parameters:
    csv_path1 (str): Path to the first CSV file.
    csv_path2 (str): Path to the second CSV file.

    Returns:
    tuple: A tuple containing the number of repeated materials in the first CSV,
           the number of materials in the first CSV that are also in the second CSV,
           and the accuracy of these materials' values being within a 10% error margin.
    """
    # Load the CSV files into pandas DataFrames
    df1 = pd.read_csv(csv_path1)
    df2 = pd.read_csv(csv_path2)
    df1['Material'].apply(lambda x: Chem.CanonSmiles(x))
    #df2['Material'].apply(lambda x: Chem.CanonSmiles(x))
    # Calculate the number of repeated materials in the first CSV
    repeated_materials = df1.duplicated(subset=['Material']).sum()

    # Group duplicated materials
    df1 = df1.groupby('Material')['Value'].apply(list).reset_index()
    print(df1.head)
    print(df2.head)
    # Find materials in the first CSV that are also in the second CSV
    matching_materials = pd.merge(df1, df2, on='Material', how='inner', suffixes=('_x', '_y'))
    # Function to check if the values are within a 10% error margin
    def is_within_10_percent(row):
        if(row['Value_y'] == 0):
            return all(abs(gpt_value - row['Value_y']) <= 5 for gpt_value in row['Value_x'])
        return all(abs(gpt_value - row['Value_y']) / row['Value_y'] <= 0.1 for gpt_value in row['Value_x'])

    print(matching_materials.head)
    # Apply the function to determine correctness
    matching_materials['is_correct'] = matching_materials.apply(is_within_10_percent, axis=1)

    # Count how many are correct
    correct_count = matching_materials['is_correct'].sum()

    # Calculate accuracy
    accuracy = correct_count / len(matching_materials) if len(matching_materials) > 0 else 0

    return repeated_materials, len(matching_materials), accuracy

# Example usage
csv_path1 = 'test.csv'
csv_path2 = '../test_smiles_reg.csv'
repeated_materials, matching_material_count, accuracy = compare_csvs(csv_path1, csv_path2)

print(f"Number of repeated materials in the first CSV: {repeated_materials}")
print(f"Materials in the first CSV that are also in the second CSV: {matching_material_count}")
print(f"Accuracy (materials within 10% error margin): {accuracy:.2%}")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [39]:
csv_path2 = '../test_smiles_reg.csv'
df2 = pd.read_csv(csv_path2)
df2['Value'] = df2['Value'].str.strip('[]').astype(float)
#df2['Material'].apply(lambda x: Chem.CanonSmiles(x))
df2.to_csv('../test_smiles_reg.csv', index=False)  

## Use this one!

In [45]:
import pandas as pd

def compare_csvs(csv_path1, csv_path2):
    # Read the CSV files
    df1 = pd.read_csv(csv_path1)
    df2 = pd.read_csv(csv_path2)
    
    # Step 1: Canonicalize SMILES strings in df1 (assuming external canonicalization)
    # Note: This step is assumed to be done prior or outside this function due to RDKit dependency.
    df1['Material'].apply(lambda x: Chem.CanonSmiles(x))
    # Step 2: Identify duplicate materials in df1
    duplicate_materials_count = df1.duplicated('Material', keep=False).sum()
    
    # Step 3: Find unique materials in both df1 and df2
    unique_materials_df1 = pd.unique(df1['Material'])
    unique_materials_df2 = pd.unique(df2['Material'])
    unique_in_both = len(set(unique_materials_df1).intersection(set(unique_materials_df2)))
    # Define a function to check if the value in df1 falls within the desired range for the same material in df2
    def is_correct_prediction(row, df2):
        material = row['Material']
        value = row['Value']
        df2_values = df2[df2['Material'] == material]['Value']
        min_value = df2_values.min() * 0.9
        max_value = df2_values.max() * 1.1
        return min_value <= value <= max_value
    
    # Step 4: Calculate Overall Accuracy for df1
    df1['Correct Prediction'] = df1.apply(is_correct_prediction, df2=df2, axis=1)
    overall_accuracy = df1['Correct Prediction'].mean()
    
    # Step 5: Calculate Accuracy for Unique Materials
    unique_materials_in_both = df1[df1['Material'].isin(unique_materials_df2)]
    unique_accuracy = unique_materials_in_both['Correct Prediction'].mean()
    
    return {
        'Duplicate Materials in df1': duplicate_materials_count,
        'Unique Materials in Both df1 and df2': unique_in_both,
        'Overall Accuracy': overall_accuracy,
        'Accuracy for Unique Materials': unique_accuracy
    }

# You can call this function with the paths to your CSV files:
csv_path1 = 'PolymerTransitionTemperature_SMILES_1_2024_03_19-121341.csv'
csv_path2 = '../test_smiles_reg.csv'
results = compare_csvs(csv_path1, csv_path2)
print(results)


{'Duplicate Materials in df1': 613, 'Unique Materials in Both df1 and df2': 0, 'Overall Accuracy': 0.0, 'Accuracy for Unique Materials': nan}


In [47]:
print(Chem.CanonSmiles("CC=C"))

C=CC
