## Filter solvents

In [None]:
import pandas as pd 

In [None]:
def filter_solvents(df, solvents_path='data/raw/solvents.txt'):
    """Return a copy of ``df`` without rows whose guest ``G`` is a known solvent.

    Args:
        df: DataFrame with a ``G`` column holding guest SMILES strings.
        solvents_path: Text file listing one solvent SMILES per line.
            Defaults to the location used by this notebook.

    Returns:
        A new DataFrame containing only the rows whose ``G`` value is not in
        the solvents file. An ``'Unnamed: 0'`` column (the index column left
        behind by a CSV round-trip) is dropped if present. ``df`` itself is
        not modified.
    """
    with open(solvents_path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Skip blank lines and ignore surrounding whitespace so that a stray
        # space or an empty trailing line cannot produce a bogus solvent entry.
        solvents = {entry for entry in stripped_lines if entry}

    df_filtered = df[~df['G'].isin(solvents)].copy()
    if 'Unnamed: 0' in df_filtered.columns:
        df_filtered = df_filtered.drop(columns='Unnamed: 0')
    print(f'Removed {df.shape[0] - df_filtered.shape[0]} rows with solvent guests')
    return df_filtered

In [None]:
# Load the two raw input tables (csd.csv and exp.csv) from data/raw.
csd_df = pd.read_csv('data/raw/csd.csv')
exp_df = pd.read_csv('data/raw/exp.csv')

In [None]:
# Mark every row of the CSD table with cryst = 1; the negatives generated
# later in replace_guest are written with cryst = 0.
csd_df['cryst'] = 1

In [None]:
# Drop CSD rows whose guest is listed in the solvents file.
csd_filtered = filter_solvents(csd_df)

In [None]:
# Drop experimental rows whose guest is listed in the solvents file.
exp_filtered = filter_solvents(exp_df)

## Filter out SMILES that cannot be converted to RDKit molecules

In [None]:
from rdkit import Chem
from rdkit.Chem import PandasTools

In [None]:
def filter_incorrect_smiles(df):
    """Drop duplicate rows and rows whose SMILES cannot be parsed by RDKit.

    For each of the columns ``A``, ``B`` and ``G`` a molecule column
    (``A_mol``, ``B_mol``, ``G_mol``) is added via
    ``PandasTools.AddMoleculeColumnToFrame``. Rows containing any missing
    value afterwards are removed.

    Args:
        df: DataFrame with SMILES strings in columns ``A``, ``B`` and ``G``.

    Returns:
        A new DataFrame with the three molecule columns added, duplicates and
        incomplete rows removed, and a fresh 0..n-1 index. ``df`` itself is
        not modified.
    """
    target_columns = ['A', 'B', 'G']

    # De-duplicate on the raw columns *before* attaching molecule objects, so
    # the comparison never depends on how molecule objects hash or compare,
    # and duplicate SMILES are not parsed needlessly.
    df_filtered = df.drop_duplicates().copy()
    print(f'Removed {df.shape[0] - df_filtered.shape[0]} duplicates')

    for col in target_columns:
        PandasTools.AddMoleculeColumnToFrame(df_filtered, smilesCol=col, molCol=col + '_mol')

    # Count against the de-duplicated size so that duplicates are not reported
    # a second time as incorrect SMILES.
    rows_before_dropna = df_filtered.shape[0]
    # NOTE: dropna() removes rows with a missing value in *any* column, not
    # only in the molecule columns, so the count below includes those rows.
    df_filtered = df_filtered.dropna()
    print(f'Removed {rows_before_dropna - df_filtered.shape[0]} incorrect SMILES')
    return df_filtered.reset_index(drop=True)

In [None]:
# Attach molecule columns to the CSD table and drop duplicate / incomplete rows.
csd_filtered = filter_incorrect_smiles(csd_filtered)

In [None]:
# Attach molecule columns to the experimental table and drop duplicate / incomplete rows.
exp_filtered = filter_incorrect_smiles(exp_filtered)

# Create negative samples for the training set

In [None]:
import numpy as np
import seaborn as sns
from itertools import combinations

from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import PandasTools
from rdkit import DataStructs

In [None]:
# Distinct SMILES found in each component column (A, B, G) of the cleaned CSD table.
a, b, g = (csd_filtered[column].unique() for column in ('A', 'B', 'G'))
print(f'Unique amins: {len(a)}, unique sulfoacids: {len(b)}, unique guests: {len(g)}')

In [None]:
# Collect the unique values side by side. Building the frame row-wise
# (orient='index') and transposing lets the three arrays have different
# lengths; shorter columns are padded with missing values.
abg_dict = {
    'unique_amins': a,
    'unique_sulfoacids': b,
    'unique_guests': g,
}
unique_df = pd.DataFrame.from_dict(abg_dict, orient='index').T

### Analyze similarities

In [None]:
# Settings passed to the Morgan fingerprint generator created in the next cell.
fp_radius = 4 #to include large fragments
fp_n_bits = 1024
# Fingerprint lists for the unique molecules, keyed by column name ('A', 'B', 'G').
fps = {}

In [None]:
# One shared Morgan generator, reused later when negatives are created.
fpg = rdFingerprintGenerator.GetMorganGenerator(radius=fp_radius, fpSize=fp_n_bits)

# Fingerprint every unique SMILES of each component column.
for column_name, unique_smiles in (('A', a), ('B', b), ('G', g)):
    fps[column_name] = [
        fpg.GetFingerprint(Chem.MolFromSmiles(entry)) for entry in unique_smiles
    ]

In [None]:
def calculate_similarities_within(fingerprints):
    """Return the Tanimoto similarity of every unordered pair of fingerprints.

    Args:
        fingerprints: Iterable of fingerprints accepted by
            ``DataStructs.TanimotoSimilarity``.

    Returns:
        A list with one similarity score per pair, in the order produced by
        ``itertools.combinations``. Empty when fewer than two fingerprints
        are given.
    """
    return [
        DataStructs.TanimotoSimilarity(left, right)
        for left, right in combinations(fingerprints, 2)
    ]

In [None]:
# Pairwise similarity scores within each component column.
similarity = {
    column_name: calculate_similarities_within(column_fps)
    for column_name, column_fps in fps.items()
}

In [None]:
# Per-column similarity threshold: the q-th quantile (q = 0.5, i.e. the median)
# of the pairwise similarities computed above.
q = 0.5
threshold = {
    column_name: np.quantile(similarity[column_name], q)
    for column_name in ('A', 'B', 'G')
}

### Create negatives from dataframe

Negatives are formed by replacing the **guest** with a dissimilar molecule

In [None]:
import random

In [None]:
def find_not_similar(smiles_list, fp, thresh, fp_generator=None):
    """Pick a random SMILES whose fingerprint is not similar to ``fp``.

    Candidates are visited in random order and the first one whose Tanimoto
    similarity to ``fp`` is at most ``thresh`` is returned. Unlike open-ended
    rejection sampling, this always terminates: every candidate is tried at
    most once.

    Args:
        smiles_list: Sequence of candidate SMILES strings.
        fp: Reference fingerprint to compare against.
        thresh: Maximum Tanimoto similarity a candidate may have to ``fp``.
        fp_generator: Object with a ``GetFingerprint(mol)`` method used to
            fingerprint candidates. Defaults to the notebook-level ``fpg``.

    Returns:
        A ``(smiles, mol)`` tuple for the selected candidate.

    Raises:
        ValueError: If no candidate has similarity <= ``thresh`` (including
            the case of an empty ``smiles_list``).
    """
    generator = fpg if fp_generator is None else fp_generator

    # Copy into a list so any sequence type (e.g. a numpy array) can be shuffled
    # without touching the caller's data.
    candidates = list(smiles_list)
    random.shuffle(candidates)

    for smiles in candidates:
        mol = Chem.MolFromSmiles(smiles)
        candidate_fp = generator.GetFingerprint(mol)
        if DataStructs.TanimotoSimilarity(fp, candidate_fp) <= thresh:
            return smiles, mol

    raise ValueError(
        f'No candidate with Tanimoto similarity <= {thresh} among '
        f'{len(candidates)} SMILES'
    )

In [None]:
def replace_guest(df, g, thresh, fpg, amount = 1000):
    """Create negative samples by swapping each guest for a dissimilar one.

    ``amount`` rows are sampled from ``df`` with replacement. In every sampled
    row the guest (``G`` / ``G_mol``) is replaced by the result of
    ``find_not_similar`` and ``cryst`` is set to 0.

    Args:
        df: DataFrame with at least the columns ``G`` and ``G_mol``.
        g: Candidate guest SMILES to draw replacements from.
        thresh: Mapping with a ``'G'`` entry: the maximum Tanimoto similarity
            allowed between the original and the replacement guest.
        fpg: Fingerprint generator used for the original guest molecule
            (candidate fingerprints are computed inside ``find_not_similar``).
        amount: Number of negative rows to generate.

    Returns:
        A new DataFrame with ``amount`` rows, the same columns as ``df`` (plus
        ``cryst`` if it was absent) and a 0..amount-1 index. ``df`` is not
        modified.
    """
    # sample() and reset_index() both return new objects, so the assignments
    # below never write back into ``df``.
    new_df = df.sample(n=amount, replace=True).reset_index(drop=True)
    guest_thresh = thresh['G']

    replacements = [
        find_not_similar(g, fpg.GetFingerprint(guest_mol), guest_thresh)
        for guest_mol in new_df['G_mol']
    ]

    # Column-wise assignment keeps the frame's columns even when amount == 0
    # and avoids mutating the row objects handed out by iterrows().
    new_df['G'] = [smiles for smiles, _ in replacements]
    new_df['G_mol'] = [mol for _, mol in replacements]
    new_df['cryst'] = 0
    return new_df

In [None]:
# Generate 1000 negative samples from the cleaned CSD table, drawing
# replacement guests from its own unique guest SMILES. The result is used
# directly; concatenating it onto an empty placeholder frame added nothing.
neg_df = replace_guest(csd_filtered, csd_filtered['G'].unique(), threshold, fpg, 1000)

# Save train and test datasets

In [None]:
# Training set = generated negatives followed by the cleaned CSD rows, with a
# fresh index and without the helper molecule-object columns.
train_df = pd.concat([neg_df, csd_filtered]).reset_index(drop=True).drop(columns = ['A_mol', 'B_mol', 'G_mol'])

In [None]:
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; confirm that readers of train.csv expect that column.
train_df.to_csv('data/processed/train.csv') # cryst < 1 means negative sample (generated negatives have cryst = 0)

In [None]:
# Test set = cleaned experimental rows with a fresh index and without the
# helper molecule-object columns.
test_df = exp_filtered.reset_index(drop=True).drop(columns = ['A_mol', 'B_mol', 'G_mol'])

In [None]:
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; confirm that readers of test.csv expect that column.
test_df.to_csv('data/processed/test.csv') # cryst < 1 means negative sample