In [6]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import os
import subprocess

## MAP4

In [2]:
# Export the SMILES strings of the processed dataset to a plain-text .smi file
# (one SMILES per line).
input_csv = '../data/processed/data_cmnpd_after2000.csv'
output_smi = '../data/processed/data_cmnpd_after2000.smi'

df = pd.read_csv(input_csv)

# Select the SMILES column by name, as the later SMILES-export cell does with
# df[['SMILES']]; fall back to the first column only when no column of that
# name exists, so a reordered CSV cannot silently export the wrong column.
smiles_column = 'SMILES' if 'SMILES' in df.columns else df.columns[0]
smiles_list = df[smiles_column]

# One line per row, in the same order as the rows of the CSV.
with open(output_smi, 'w') as smi_file:
    smi_file.writelines(f"{smile}\n" for smile in smiles_list)

###  Generating MAP4 Fingerprints

We use [MAP4](https://github.com/reymond-group/map4), a universal molecular fingerprint designed to capture both small-molecule substructures and large molecular scaffolds, including peptides and complex natural products.


To generate MAP4 fingerprints in this project, follow these steps:


1. Clone the MAP4 repository

`git clone --branch v1.0 https://github.com/reymond-group/map4.git`

`cd map4`

2. Install dependencies via Conda

`conda env create -f environment.yml`

`conda activate map4`

Alternatively, install via pip (requires RDKit and tmap)

`pip install git+https://github.com/reymond-group/map4@v1.0`

3. Once installed, you can generate MAP4 fingerprints from a SMILES file

`cd map4`

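`python map4.py -i <project_root>/data/processed/data_cmnpd_after2000.smi -o <project_root>/data/map4output`

Replace `<project_root>` with the path to this project, so that the input is the `.smi` file written above and the output is the `data/map4output` file read by the next cell.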

In [5]:
# Convert the MAP4 output file into a CSV with one fingerprint per row and one
# column (fp_0, fp_1, ...) per fingerprint value.
input_file_path = '../data/map4output'
output_file_path = '../data/MAP4.csv'

with open(input_file_path, 'r') as file:
    lines = file.readlines()

fingerprints = []
skipped_lines = 0
for line in lines:
    parts = line.strip().split('\t')
    # The fingerprint is read from the third tab-separated field, so at least
    # three fields are required. The previous guard (len(parts) > 1) let
    # two-field lines through and then failed with IndexError on parts[2].
    if len(parts) > 2:
        fingerprint_values = parts[2].split(';')
        fingerprints.append(fingerprint_values)
    elif line.strip():
        # Non-blank line without a fingerprint field: count it so that the
        # dropped row does not go unnoticed.
        skipped_lines += 1

if skipped_lines:
    # Rows of this CSV are later used positionally, so a skipped line shifts
    # every following fingerprint relative to the source table.
    print(f"Skipped {skipped_lines} line(s) without a fingerprint field in {input_file_path}")

fingerprints_df = pd.DataFrame(fingerprints)

column_names = [f'fp_{i}' for i in range(fingerprints_df.shape[1])]
fingerprints_df.columns = column_names

fingerprints_df.to_csv(output_file_path, index=False)


In [None]:
# Write a CSV containing only the SMILES column of the processed dataset.
# `smiles_file` is the path handed to chemprop as --test-path in the cells below.
file = '../data/processed/data_cmnpd_after2000.csv'
smiles_file = '../data/processed/data_cmnpd_after2000_smiles.csv'

df = pd.read_csv(file)

# Keep the result a one-column DataFrame (not a Series) so the CSV has a
# 'SMILES' header row.
smiles_df = df.loc[:, ['SMILES']]
smiles_df.to_csv(smiles_file, index=False)

## MPN

In [8]:


# Run the chemprop CLI to write fingerprints for the molecules in `smiles_file`
# using the trained checkpoint below.
# NOTE(review): --ffn-block-index 0 presumably selects the representation
# before any FFN layer (the "MPN" fingerprint of this section) - confirm
# against the chemprop CLI documentation.
command = [
    'chemprop', 'fingerprint',
    '--test-path', smiles_file,
    '--preds-path', '../data/data_cmnpd_after2000_MPN.csv',
    '--model-paths', '../results/models/model_after2000/fold_0/model_0/checkpoints/best-epoch=128-val_loss=0.58.ckpt',
    '--num-workers', '0',
    '--ffn-block-index', '0'
]

# check=True raises CalledProcessError when chemprop exits with a non-zero
# status, so a failed run stops the notebook instead of leaving later cells
# to read a missing or stale fingerprint file.
subprocess.run(command, check=True)

## last_FFN

In [None]:
# Run the chemprop CLI to write fingerprints for the molecules in `smiles_file`
# using the trained checkpoint below.
# NOTE(review): --ffn-block-index -1 presumably selects the last FFN layer
# (the "last_FFN" fingerprint of this section) - confirm against the chemprop
# CLI documentation.
command = [
    'chemprop', 'fingerprint',
    '--test-path', smiles_file,
    '--preds-path', '../data/data_cmnpd_after2000_last_FFN.csv',
    '--model-paths', '../results/models/model_after2000/fold_0/model_0/checkpoints/best-epoch=128-val_loss=0.58.ckpt',
    '--num-workers', '0',
    '--ffn-block-index', '-1'
]

# check=True raises CalledProcessError when chemprop exits with a non-zero
# status, so a failed run stops the notebook instead of leaving later cells
# to read a missing or stale fingerprint file.
subprocess.run(command, check=True)

## Downloading fingerprint data

The precomputed molecular fingerprints used in this project are hosted on [Zenodo](https://zenodo.org/record/XXXXXXX) for reproducibility. **TODO:** replace the placeholder record ID `XXXXXXX` in the link with the actual Zenodo record.

## t-SNE analysis

In [None]:
# Project the three fingerprint tables (MAP4, MPN, last_FFN) to 2-D with t-SNE
# and draw them side by side, coloured by the per-molecule label.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns

sns.set_palette("Set2")

# Display name for each integer value found in the 'labels' column.
category_map = {0: 'Animalia', 1: 'Bacteria', 2: 'Fungi'}

fingerprint_data_MAP4 = pd.read_csv('MAP4.csv')
fingerprint_data_MPN = pd.read_csv('MPN.csv')
fingerprint_data_FFN = pd.read_csv('last_FFN.csv')

molecule_data = pd.read_csv('../data/processed/data_cmnpd_after2000.csv')

labels = molecule_data['labels'].values

# Points are coloured by matching fingerprint rows to labels by position, so
# every fingerprint table must have exactly one row per label.
for table_name, table in (
    ('MAP4.csv', fingerprint_data_MAP4),
    ('MPN.csv', fingerprint_data_MPN),
    ('last_FFN.csv', fingerprint_data_FFN),
):
    if len(table) != len(labels):
        raise ValueError(
            f"{table_name} has {len(table)} rows but there are {len(labels)} labels; "
            "fingerprints and labels must be row-aligned"
        )

perplexity = 30
tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)

# Each call refits the estimator on the given table and returns an
# (n_samples, 2) array.
fingerprints_2d_MAP4 = tsne.fit_transform(fingerprint_data_MAP4)
fingerprints_2d_MPN = tsne.fit_transform(fingerprint_data_MPN)
fingerprints_2d_FFN = tsne.fit_transform(fingerprint_data_FFN)

# The figure and axes were never created in the original cell, and the scatter
# calls referenced undefined map4_df / mpn_df / ffn_df instead of the
# embeddings computed above.
# NOTE(review): figsize and sharey=True are assumptions, inferred from the
# tight wspace and the single shared axis captions below - adjust if the
# intended layout differs.
fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

panels = [
    ('MAP4', fingerprints_2d_MAP4),
    ('MPN', fingerprints_2d_MPN),
    ('last_FFN', fingerprints_2d_FFN),
]

for ax, (title, embedding) in zip(axes, panels):
    for label in np.unique(labels):
        category = category_map[label]
        indices = np.where(labels == label)[0]
        ax.scatter(embedding[indices, 0], embedding[indices, 1], label=category, alpha=0.4, s=5)
    ax.set_title(title)

# Shared axis captions for the whole figure.
fig.text(0.5, 0.12, 't-SNE Dimension 1', ha='center', va='center', fontsize=10)
fig.text(0.08, 0.5, 't-SNE Dimension 2', ha='center', va='center', rotation='vertical', fontsize=10)

plt.subplots_adjust(wspace=0.03 ,hspace=0.1, bottom=0.2)

# All panels plot the same categories, so one legend taken from the first
# panel serves the whole figure.
handles, legend_labels = axes[0].get_legend_handles_labels()
fig.legend(handles, legend_labels, loc='lower center', ncol=5, bbox_to_anchor=(0.5, 0.02))

plt.savefig('../results/figures/tsne_comparison', format='svg')

plt.show()
