# PG fingeprinting for polymers, morgan for additives

In [15]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from pickle import dump
from pgfingerprinting import fp
from rdkit import Chem
from rdkit.Chem import AllChem

def morgan_fingerprint_from_smiles(smiles, radius=5, nBits=64):
    """
    Generate a Morgan fingerprint (as a list of ints) from a SMILES string.
    If the SMILES is a placeholder '*CC*' (representing no additive),
    return a zero vector of length nBits.
    """
    if smiles == '*CC*':
        # Return a zero vector to represent empty additive fingerprint
        return [0]*nBits
    
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # If invalid SMILES, also return a zero vector rather than fail
        return [0]*nBits
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
    arr = list(fp.ToBitString())
    arr = [int(x) for x in arr]
    return arr

def fingerprint_smiles(smiles_array, use_morgan=False):
    """
    Fingerprint a set of SMILES.
    If use_morgan=True, use RDKit Morgan fingerprints for each SMILES.
    If use_morgan=False, use the original pgfingerprinting method.
    """
    unique_sm = np.unique(smiles_array)
    sm_store = {}
    
    if use_morgan:
        # Morgan fingerprints
        for sm in unique_sm:
            fp_data = morgan_fingerprint_from_smiles(sm)
            sm_store[sm] = fp_data
    else:
        # Use pgfingerprinting as before
        params = {
            "fp_identifier": "fp_",
            "write_property": 0,
            "col_property": "",
            "normalize_a": 1,
            "normalize_b": 1,
            "normalize_m": 1,
            "normalize_e": 1,
            "block_list_version": "20201210",
            "ismolecule": 0,
            "polymer_fp_type": ["aS", "aT", "bT", "m", "e"],
            "calculate_side_chain": 1,
            "use_chirality": 1,
        }
        for sm in unique_sm:
            fp_data = fp.fingerprint_from_smiles(sm, params)
            if fp_data is None:
                raise ValueError(f"Fingerprinting failed for SMILES: {sm}")
            sm_store[sm] = fp_data

    fps_list = [sm_store[sm] for sm in smiles_array]
    fps_df = pd.DataFrame(fps_list, dtype=np.float16)

    # For Morgan fingerprints, we have a fixed length, so no need to drop columns if all zero.
    # But for the original method, we drop columns that are all zero.
    if not use_morgan:
        drops = fps_df.columns[(fps_df == 0).all(0)]
        fps_df = fps_df.drop(drops, axis=1)

    fps_df = fps_df.fillna(0).reset_index(drop=True)
    return fps_df

def fingerprint(
    df='',
    df_file='',
    name='',
    candidates=False,
    fps_save='fps_file',
    include_theor_IEC=True,
    fps_df_save='',
    columns_file='',
    fps_size='',
):
    # If a df_file is provided, load the DataFrame
    if df_file != '':
        df = pd.read_csv(df_file, engine='python')
        df = df.reset_index(drop=True)

    # Replace missing SMILES with a placeholder
    def prepare_smiles_columns(df, prefix):
        # prefix could be "smiles" or "additive_smiles"
        s1, s2, s3 = f"{prefix}1", f"{prefix}2", f"{prefix}3"
        arr = np.stack((df[s1].values, df[s2].values, df[s3].values)).T
        arr = np.where(pd.isna(arr), '*CC*', arr)
        return arr.flatten()

    polymer_smiles = prepare_smiles_columns(df, "smiles")
    additive_smiles = prepare_smiles_columns(df, "additive_smiles")

    # Fingerprint polymer using original method (pgfingerprinting)
    polymer_fps = fingerprint_smiles(polymer_smiles, use_morgan=False)
    print('---This is polymer fps---', polymer_fps)

    # Fingerprint additive using Morgan fingerprints
    additive_fps = fingerprint_smiles(additive_smiles, use_morgan=True)

    # Align columns: If additive_fps missing some columns from polymer_fps or vice versa, add them as zeros
    poly_cols = set(polymer_fps.columns)
    add_cols = set(additive_fps.columns)
    poly_missing = add_cols - poly_cols
    add_missing = poly_cols - add_cols

    for col in poly_missing:
        polymer_fps[col] = 0
    for col in add_missing:
        additive_fps[col] = 0

    # Reorder additive_fps columns to match polymer_fps
    additive_fps = additive_fps[polymer_fps.columns]

    # Save columns info
    with open(columns_file, 'wb') as f:
        dump(polymer_fps.columns, f)

    # Convert the flattened fps back to shape (N, 3, features)
    N = len(df)
    polymer_fps_3d = polymer_fps.values.reshape(N, 3, -1)
    additive_fps_3d = additive_fps.values.reshape(N, 3, -1)

    # Save a combined fps_df for reference (polymers)
    polymer_fps_df = pd.DataFrame([tuple(x) for x in polymer_fps_3d])
    polymer_fps_df.to_csv(fps_df_save)
    if fps_size != '':
        with open(fps_size, 'wb') as f:
            dump([polymer_fps_df.shape[0]], f)

    # Compute weighted fingerprints for polymers and additives
    # Polymer weights
    c1 = df.c1.astype(float) / 100.0
    c2 = df.c2.astype(float) / 100.0
    c3 = df.c3.astype(float) / 100.0

    # Additive weights
    ac1 = df.additivec1.astype(float) / 100.0
    ac2 = df.additivec2.astype(float) / 100.0
    ac3 = df.additivec3.astype(float) / 100.0

    polymer_weighted = []
    polymer_sum = []
    additive_weighted = []
    additive_sum = []

    for i in range(N):
        # Polymer
        p_fp1, p_fp2, p_fp3 = polymer_fps_3d[i]
        p_wfp = (p_fp1 * c1[i]) + (p_fp2 * c2[i]) + (p_fp3 * c3[i])
        p_sum = p_fp1 + p_fp2 + p_fp3
        polymer_weighted.append(p_wfp)
        polymer_sum.append(p_sum)

        # Additive
        a_fp1, a_fp2, a_fp3 = additive_fps_3d[i]
        a_wfp = (a_fp1 * ac1[i]) + (a_fp2 * ac2[i]) + (a_fp3 * ac3[i])
        a_sum = a_fp1 + a_fp2 + a_fp3
        additive_weighted.append(a_wfp)
        additive_sum.append(a_sum)

    polymer_weighted = np.array(polymer_weighted)
    additive_weighted = np.array(additive_weighted)
    combined_fps = np.concatenate([polymer_weighted, additive_weighted], axis=1)

    # Create final DataFrame of fingerprints
    polymer_cols = [f"poly_{col}" for col in polymer_fps.columns]
    additive_cols = [f"add_{col}" for col in polymer_fps.columns]
    combined_columns = polymer_cols + additive_cols

    combined_df = pd.DataFrame(combined_fps, columns=combined_columns)

    # Add additional features: Temp(C), RH(%), stab_temp, solvent, solvent_conc, time
    combined_df['Temp(C)'] = df['Temp(C)']
    combined_df['RH(%)'] = df['RH(%)']
    combined_df['stab_temp'] = df['stab_temp']
    combined_df['solvent'] = df['solvent']
    combined_df['solvent_conc(M)'] = df['solvent_conc(M)']
    combined_df['time(h)'] = df['time(h)']

    # Include theoretical IEC if requested
    if include_theor_IEC and 'IEC' in df.columns:
        combined_df['EXP_IEC'] = df['EXP_IEC']

    # Save the combined fps
    combined_df.to_csv(fps_save)

    if not candidates:
        # Add fps and sum info to original df
        df['fps'] = list(zip(combined_df[combined_columns].values.tolist()))

        poly_sum_str = [str(arr.tolist()) for arr in polymer_sum]
        add_sum_str = [str(arr.tolist()) for arr in additive_sum]
        combined_sum_str = [p + a for p, a in zip(poly_sum_str, add_sum_str)]

        df['fps_sum'] = poly_sum_str
        df['fps_cpid'] = add_sum_str
        df['fps_composition'] = combined_sum_str

        df['pid'] = df.groupby(by='fps_sum').ngroup()
        df['cpid'] = df.groupby(by='fps_cpid').ngroup()
        df['composition_pid'] = df.groupby(by='fps_composition').ngroup()

        if 'value' in df.columns:
            df.dropna(subset=["value"], inplace=False)

        final_cols = [
            'selector', 'prop', 'value', 'fps', 'fps_sum', 'fps_cpid',
            'fps_composition', 'pid', 'cpid', 'composition_pid'
        ]
        final_cols = [c for c in final_cols if c in df.columns]
        df = df[final_cols]

        file_train = f'{name}.csv'
        df.to_csv(file_train)

    return df

In [16]:
# Set up file names and paths
df_file = '/data/wschertzer/aem_aging/modeling/data/aem_aging_12_16_24.csv'            # The CSV file you provided
name = 'example_run'               # A name to distinguish your output files
fps_save = 'fps_output.csv'        # Where to save the final fingerprints
fps_df_save = 'fps_raw_df.csv'     # An intermediate file that saves the polymer fps before combination
columns_file = 'fps_columns.pkl'   # A pickle file to save column names
fps_size = 'fps_size.pkl'          # A pickle file to save the size information

# Call the fingerprint function
df_processed = fingerprint(
    df_file=df_file,
    name=name,
    candidates=False,            # Set to False if you're processing training data
    fps_save=fps_save,
    include_theor_IEC=True,      # Set based on whether you have IEC in your data
    fps_df_save=fps_df_save,
    columns_file=columns_file,
    fps_size=fps_size,
)

print(df_processed.head())


---This is polymer fps---          afp_H1    afp_C4    afp_C3    afp_O2    afp_O1    afp_N3  \
0      0.395752  0.041656  0.458252  0.041656  0.020828  0.041656   
1      0.666504  0.333252  0.000000  0.000000  0.000000  0.000000   
2      0.666504  0.333252  0.000000  0.000000  0.000000  0.000000   
3      0.395752  0.041656  0.458252  0.041656  0.020828  0.041656   
4      0.666504  0.333252  0.000000  0.000000  0.000000  0.000000   
...         ...       ...       ...       ...       ...       ...   
10105  0.666504  0.333252  0.000000  0.000000  0.000000  0.000000   
10106  0.666504  0.333252  0.000000  0.000000  0.000000  0.000000   
10107  0.666504  0.333252  0.000000  0.000000  0.000000  0.000000   
10108  0.666504  0.333252  0.000000  0.000000  0.000000  0.000000   
10109  0.666504  0.333252  0.000000  0.000000  0.000000  0.000000   

       afp_H1_C4_H1  afp_C3_C4_H1  afp_H1_C4_N3  afp_C3_C3_H1  ...  \
0          0.083313      0.041656      0.104187           0.5  ...   
1    

  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[col] = 0
  additive_fps[c

          prop value                                                fps  \
0  Swelling(%)   6.1  ([0.395751953125, 0.041656494140625, 0.4582519...   
1  Swelling(%)  7.85  ([0.395751953125, 0.041656494140625, 0.4582519...   
2  Swelling(%)     7  ([0.395751953125, 0.041656494140625, 0.4582519...   
3  Swelling(%)   9.4  ([0.395751953125, 0.041656494140625, 0.4582519...   
4  Swelling(%)  5.69  ([0.395751953125, 0.041656494140625, 0.4582519...   

                                             fps_sum  \
0  [1.728759765625, 0.708160400390625, 0.45825195...   
1  [1.728759765625, 0.708160400390625, 0.45825195...   
2  [1.728759765625, 0.708160400390625, 0.45825195...   
3  [1.728759765625, 0.708160400390625, 0.45825195...   
4  [1.728759765625, 0.708160400390625, 0.45825195...   

                                            fps_cpid  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 