In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
import os
import subprocess

## MAP4

In [2]:
# Source table and the plain-text SMILES file derived from it.
input_csv = '../data/processed/data_cmnpd_after2000.csv'
output_smi = '../data/processed/data_cmnpd_after2000.smi'

df = pd.read_csv(input_csv)

# The first column of the CSV is taken as the SMILES column.
smiles_column = df.columns[0]
smiles_list = df[smiles_column]

# Emit one entry per line, preserving the row order of the CSV.
with open(output_smi, 'w') as smi_file:
    smi_file.writelines(f"{smile}\n" for smile in smiles_list)

###  Generating MAP4 Fingerprints

We use [MAP4](https://github.com/reymond-group/map4), a universal molecular fingerprint designed to capture both small-molecule substructures and large molecular scaffolds, including peptides and complex natural products.


To generate MAP4 fingerprints in this project, follow these steps:


1. Clone the MAP4 repository

`git clone --branch v1.0 https://github.com/reymond-group/map4.git`

`cd map4`

2. Install dependencies via Conda

`conda env create -f environment.yml`

`conda activate map4`

Alternatively, install via pip (requires RDKit and tmap):

`pip install git+https://github.com/reymond-group/map4@v1.0`

3. Once installed, you can generate MAP4 fingerprints from a SMILES file:

`cd map4`

`python map4.py -i /data/processed/data_cmnpd_after2000.smi -o /data/map4output`

In [5]:
# Convert the text file at `input_file_path` into a CSV with one row per
# parsed line and one column (fp_0, fp_1, ...) per fingerprint value.
input_file_path = '../data/map4output'
output_file_path = '../data/MAP4.csv'

# Zero-based position of the ';'-joined fingerprint among a line's
# tab-separated fields. The guard below is derived from this constant so the
# length check and the index can never disagree.
FINGERPRINT_FIELD_INDEX = 2

with open(input_file_path, 'r') as file:
    lines = file.readlines()

fingerprints = []
for line in lines:
    parts = line.strip().split('\t')
    # Skip lines that are too short to contain the fingerprint field (blank
    # lines, truncated records). The previous check (`len(parts) > 1`) let
    # two-field lines through and then crashed on `parts[2]` with IndexError.
    # NOTE(review): skipped lines mean rows of the output CSV may not line up
    # one-to-one with the rows of the input SMILES file -- confirm whether
    # downstream code relies on that alignment.
    if len(parts) > FINGERPRINT_FIELD_INDEX:
        fingerprint_values = parts[FINGERPRINT_FIELD_INDEX].split(';')
        fingerprints.append(fingerprint_values)

# Values are kept as the strings read from the file; no numeric conversion
# is applied before writing.
fingerprints_df = pd.DataFrame(fingerprints)

column_names = [f'fp_{i}' for i in range(fingerprints_df.shape[1])]
fingerprints_df.columns = column_names

fingerprints_df.to_csv(output_file_path, index=False)


## MPN

## last_FFN