In [None]:
import re
import sys

# Quick hack to import chemspace
sys.path.append("..")

import chemspace as cs

import numpy as np
import pandas as pd

In [None]:
# Load the compressed dataset, keep only the SMILES / AllText columns,
# discard rows with a missing value in either, and renumber the rows.
dataset = (
    pd.read_csv("../chemspace/Dataset/Data/Dataset.gz")
    .loc[:, ['SMILES', 'AllText']]
    .dropna()
    .reset_index(drop=True)
)
dataset

In [None]:
# Fetch names, SMILES and descriptions for the CID range (1, 2).
# The trailing comment preserves an alternative range centred on CID 2244.
names, smiless, descriptions = cs.download_compounds(1,2)#(2244-5, 2244+5)

In [None]:
def generate_df(start_cid, end_cid, path=None):
    """Download a range of compounds and attach 3D representations to them.

    Parameters
    ----------
    start_cid, end_cid
        CID range, forwarded unchanged to ``cs.download_compounds``.
    path : str, optional
        CSV file holding a previously generated table. When given, the newly
        downloaded rows are appended to it and the result is de-duplicated on
        the ``SMILES`` column, keeping the first (previously saved) occurrence.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name``, ``SMILES``, ``Description``, ``xyz`` and ``zmat``
        (plus whatever extra columns the old CSV carried), with a fresh
        0..n-1 index.
    """
    old_df = pd.read_csv(path) if path else None

    names, smiless, descriptions = cs.download_compounds(start_cid, end_cid)
    df = pd.DataFrame({"Name": names, "SMILES": smiless, "Description": descriptions})

    # Drop compounds without a description.
    keep = df['Description'] != 'No description available.'
    if old_df is not None:
        # Rows whose SMILES is already saved would be discarded by the
        # de-duplication below anyway, so skip them *before* paying for the
        # 3D conversions.
        keep = keep & ~df['SMILES'].isin(old_df['SMILES'])
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the unfiltered one (avoids SettingWithCopyWarning).
    df = df[keep].copy()

    df['xyz'] = df['SMILES'].apply(lambda x: cs.get_3D_representation(x, representation='xyz'))
    df['zmat'] = df['SMILES'].apply(cs.get_zmat)

    if old_df is not None:
        df = pd.concat([old_df, df])
        df = df.drop_duplicates(subset=['SMILES'])
    # Filtering and concatenation leave gaps / repeated labels in the index;
    # renumber so positional and label-based access agree.
    return df.reset_index(drop=True)

#2000
# NOTE(review): the meaning of the "#2000" marker above is not evident from this notebook.
# Download the CID range (5000, 15000) and merge it with the previously saved
# PubChem.csv; generate_df de-duplicates the merged rows on SMILES.
df = generate_df(5000, 15000, '../chemspace/Dataset/Data/PubChem.csv')

In [None]:
# Persist the table to the same CSV path that was passed to generate_df,
# without writing the index column.
df.to_csv('../chemspace/Dataset/Data/PubChem.csv', index=False)

In [None]:
# Reload the saved table from disk and display it.
df = pd.read_csv('../chemspace/Dataset/Data/PubChem.csv')
df

In [None]:
import seaborn as sns

# Histogram of description lengths, measured in whitespace-separated words.
df = pd.read_csv('../chemspace/Dataset/Data/PubChem.csv')
ax = sns.histplot(df['Description'].apply(lambda x: len(x.split())), bins=100)
# Without explicit labels the x axis inherits the column name "Description",
# which misstates what is plotted (a word count, not the text itself).
ax.set(xlabel='Words per description', ylabel='Number of descriptions');

# Old tests

In [None]:
# Five copies of the same SMILES string: a tiny batch for the conversion
# experiments in the cells below.
smls = ['CC(C)C1=CC=C(C=C1)C(C)C'] * 5

## Using OpenBabel

In [None]:
# Wrap the SMILES list in a one-column DataFrame (column name: 'smiles').
df = pd.DataFrame({'smiles': smls})

In [None]:
# Parse the first SMILES, add hydrogens, generate 3D coordinates, and print
# the molecule serialised in OpenBabel's "gzmat" output format.
# NOTE(review): `pybel` and `openbabel` are not imported in any visible cell —
# confirm they are in scope before running.
m=pybel.readstring('smi', smls[0])
m.OBMol.AddHydrogens()
m.make3D()

# Converter configured once for gzmat output.
conv = openbabel.OBConversion()
conv.SetOutFormat("gzmat")

print(conv.WriteString(m.OBMol))

In [None]:
# Converter shared by every row below; its output format is gzmat.
conv = openbabel.OBConversion()
conv.SetOutFormat("gzmat")

# Parse each SMILES string into a pybel molecule.
df['mol'] = df['smiles'].apply(lambda x: pybel.readstring('smi', x))

# AddHydrogens and make3D are called for their effect on the molecule objects
# (per the original author's note they modify the molecule in place); the
# values returned by apply are not needed, so they go to a dummy variable.
_ = df['mol'].apply(lambda x: x.OBMol.AddHydrogens())
_ = df['mol'].apply(lambda x: x.make3D())

# Serialise every molecule with the converter configured above.
# NOTE: the column is named 'xyz', but because the converter is set to "gzmat"
# it actually holds gzmat text. The original author notes that xyz, pdb and
# sdf are accepted as output formats as well.
df['xyz'] = df['mol'].apply(lambda x: conv.WriteString(x.OBMol))
df

In [None]:
# Use OpenBabel to convert a SMILES string to 3D coordinates and export them.

# Build the molecule from the first SMILES, add hydrogens and generate 3D coordinates.
sml=smls[0]
mol = pybel.readstring('smi', sml)
mol.OBMol.AddHydrogens()
mol.make3D()
# Write the same molecule in four formats; overwrite=True is passed so an
# existing file of the same name does not block the write.
mol.write('sdf', 'obabel.sdf', overwrite=True)
mol.write('pdb', 'obabel.pdb', overwrite=True)
mol.write('xyz', 'obabel.xyz', overwrite=True)
# obabel.zmat is the file that a later cell passes to replace_variables_in_zmatrix.
mol.write('gzmat', 'obabel.zmat', overwrite=True)
mol

### Compacting the gzmat

In [None]:
def replace_variables_in_zmatrix(gzmat):
    """Inline the values of a gzmat "Variables:" section into the z-matrix.

    Everything from the first line containing ``"Variables:"`` onwards is
    parsed as ``name=value`` pairs and then dropped; in the preceding lines
    every occurrence of a variable name is replaced by its value.

    Parameters
    ----------
    gzmat : str
        Full gzmat text, lines separated by ``'\\n'``.

    Returns
    -------
    str
        The lines before the "Variables:" marker with variables substituted,
        joined by ``'\\n'``. If there is no marker the input is returned
        unchanged.
    """
    lines = gzmat.split('\n')

    marker = next((i for i, line in enumerate(lines) if "Variables:" in line), None)
    if marker is None:
        return gzmat

    variables = {}
    for entry in lines[marker + 1:]:
        name, sep, value = entry.partition('=')
        name = name.strip()
        # Skip blank / whitespace-only lines and anything that is not an
        # assignment instead of failing on the tuple unpack.
        if not sep or not name:
            continue
        variables[name] = value.strip()

    body = lines[:marker]
    if variables:
        # Match whole names only: a plain str.replace of "r2" would also
        # rewrite the first two characters of "r20", corrupting the output
        # for any molecule with enough atoms to produce such names.
        pattern = re.compile(
            r'\b(?:' + '|'.join(re.escape(name) for name in variables) + r')\b'
        )
        body = [pattern.sub(lambda m: variables[m.group(0)], line) for line in body]
    return '\n'.join(body)

# Show the first row's serialised text as stored in the 'xyz' column, then the
# same text after replace_variables_in_zmatrix has processed it.
print(df['xyz'][0])
print(replace_variables_in_zmatrix(df['xyz'][0]))

In [None]:
def replace_variables_in_zmatrix(filename):
    """Inline the "Variables:" section of a gzmat file, rewriting it in place.

    NOTE: an earlier cell defines a function with this same name that takes
    the gzmat *text*; running this cell shadows it with this file-based
    variant.

    Everything from the first line containing ``"Variables:"`` onwards is
    parsed as ``name=value`` pairs and removed from the file; in the preceding
    lines every occurrence of a variable name is replaced by its value.

    Parameters
    ----------
    filename : str
        Path of the gzmat file to rewrite.

    Returns
    -------
    None
        The file is modified on disk. A file without a "Variables:" line is
        left untouched.
    """
    with open(filename, 'r') as file:
        lines = file.readlines()

    marker = next((i for i, line in enumerate(lines) if "Variables:" in line), None)
    if marker is None:
        return

    variables = {}
    for entry in lines[marker + 1:]:
        name, sep, value = entry.partition('=')
        name = name.strip()
        # readlines() keeps the trailing newline, so a blank line is "\n" and
        # never falsy; skip anything that is not a real assignment.
        if not sep or not name:
            continue
        # strip() rather than slicing off the last character, so a final line
        # without a trailing newline does not lose its last digit.
        variables[name] = value.strip()

    body = lines[:marker]
    if variables:
        # Match whole names only: a plain str.replace of "r2" would also
        # rewrite the first two characters of "r20".
        pattern = re.compile(
            r'\b(?:' + '|'.join(re.escape(name) for name in variables) + r')\b'
        )
        body = [pattern.sub(lambda m: variables[m.group(0)], line) for line in body]

    # All parsing is finished before the file is truncated for writing.
    with open(filename, 'w') as file:
        file.writelines(body)

# Compact, in place, the obabel.zmat file written by the OpenBabel export cell.
replace_variables_in_zmatrix("obabel.zmat")

## Using RDKit

In [None]:
# Start again from a fresh one-column DataFrame of SMILES strings.
df = pd.DataFrame({'smiles': smls})

In [None]:
# Build a molecule from the first SMILES, add explicit hydrogens, embed 3D
# coordinates, run an MMFF optimisation, and show it as a PDB block.
# NOTE(review): `Chem` and `AllChem` are not imported in any visible cell —
# confirm they are in scope before running.
m = Chem.MolFromSmiles(smls[0])
m = Chem.AddHs(m)
# The values returned by the embed / optimise calls are not inspected here.
AllChem.EmbedMolecule(m)
AllChem.MMFFOptimizeMolecule(m)

# Alternative left by the original author: embed several conformers and pick one.
# confs = AllChem.EmbedMultipleConfs(m, numConfs=1)
# conf = m.GetConformer(0)

Chem.MolToPDBBlock(m)

In [None]:
# Parse every SMILES, then add explicit hydrogens to each molecule.
df['mol'] = df['smiles'].apply(Chem.MolFromSmiles)
df['mol'] = df['mol'].apply(Chem.AddHs)
# Embedding is run for its effect on the molecules; the per-row results are
# parked in a throwaway name.
_ = df['mol'].apply(AllChem.EmbedMolecule)

# Serialise each molecule as an XYZ block. MolToPDBBlock works the same way;
# writing SDF goes through a different API.
df['xyz'] = df['mol'].apply(Chem.MolToXYZBlock)
df

In [None]:
# Write every conformer of `m` to rdkit.sdf, printing each conformer id and
# storing it on the molecule as the 'ID' property before the record is written.
# (Original author's note: SDWriter('-') writes to stdout.)
with Chem.SDWriter('rdkit.sdf') as w:
    for cid in range(m.GetNumConformers()):
        print(cid)
        m.SetProp('ID', f'{cid}')
        w.write(m, confId=cid)

# Also export the molecule as PDB and XYZ files.
Chem.MolToPDBFile(m, 'rdkit.pdb')
Chem.MolToXYZFile(m, 'rdkit.xyz')
