In [1]:
import pandas as pd

def compare_smiles_and_decorations(file1, file2):
    """Compare the 'smiles' column of two semicolon-separated files.

    Both inputs are read with ``pd.read_csv`` as header-less, ``;``-separated
    tables whose three columns are named 'scaffold', 'decorations' and
    'smiles'. Each table is ordered by 'smiles' and re-indexed before any
    comparison, so the original row order of the files does not matter.

    The check passes only when both of the following hold:
      * the sorted 'smiles' column of ``file1`` equals that of ``file2``;
      * within ``file2``, 'smiles' equals 'decorations' row by row.

    A one-line verdict is printed in either case.

    Args:
        file1: Path or buffer accepted by ``pd.read_csv`` (first file).
        file2: Path or buffer accepted by ``pd.read_csv`` (second file).

    Returns:
        dict: ``{'all_smiles_and_decorations_match': False}`` on mismatch.
        On success the flag is ``True`` and the dict additionally carries
        'max_length', 'mean_length' and 'min_length', the character-length
        statistics of the 'smiles' strings in the second file.
    """
    column_names = ['scaffold', 'decorations', 'smiles']

    # Read both inputs first (file1, then file2), and only then order them.
    first_raw, second_raw = (
        pd.read_csv(source, header=None, sep=';', names=column_names)
        for source in (file1, file2)
    )
    first_table, second_table = (
        raw.sort_values(by='smiles').reset_index(drop=True)
        for raw in (first_raw, second_raw)
    )

    second_smiles = second_table['smiles']

    # Guard clause: bail out as soon as either equality check fails.
    if (not first_table['smiles'].equals(second_smiles)
            or not second_smiles.equals(second_table['decorations'])):
        print('Not all smiles match between the files or smiles do not match decorations in the second file.')
        return {'all_smiles_and_decorations_match': False}

    print('ALL the smiles and decorations in the second file are the same as the smiles in the first.')

    # Per-row string length of the second file's 'smiles' entries.
    length_per_row = second_smiles.str.len()
    longest = length_per_row.max()
    average = length_per_row.mean()
    shortest = length_per_row.min()

    return {
        'all_smiles_and_decorations_match': True,
        'max_length': longest,
        'mean_length': average,
        'min_length': shortest,
    }

In [2]:
# Example run on the QM9 file pair: the original file versus the
# GEN_-prefixed file, followed by the returned summary dict.
qm9_original, qm9_gen = 'QM9_10k_LEN_3.csv', 'GEN_QM9_10k_LEN_3.csv'
result = compare_smiles_and_decorations(file1=qm9_original, file2=qm9_gen)
print(result)

ALL the smiles and decorations in the second file are the same as the smiles in the first.
{'all_smiles_and_decorations_match': True, 'max_length': 30, 'mean_length': 14.4709, 'min_length': 8}


In [3]:
# Example run on the ZINC file pair: the original file versus the
# GEN_-prefixed file, followed by the returned summary dict.
zinc_original, zinc_gen = 'ZINC_10k_LEN_10.csv', 'GEN_ZINC_10k_LEN_10.csv'
result = compare_smiles_and_decorations(file1=zinc_original, file2=zinc_gen)
print(result)

ALL the smiles and decorations in the second file are the same as the smiles in the first.
{'all_smiles_and_decorations_match': True, 'max_length': 69, 'mean_length': 37.8216, 'min_length': 22}
