In [ ]:
import pandas as pd
import os

### Preprocessing CMNPD: label and combine the Animalia, Bacteria, and Fungi files

In [ ]:
# Raw CMNPD exports: one tab-separated file per kingdom, all sharing the same
# naming scheme, so the three paths are derived from a single pattern.
_cmnpd_raw_pattern = "../data/raw/CMNPD-{}-after2000.tsv"
cmnpd_animalia = _cmnpd_raw_pattern.format("animalia")
cmnpd_bacteria = _cmnpd_raw_pattern.format("bacteria")
cmnpd_fungi = _cmnpd_raw_pattern.format("fungi")


def load_cmnpd(filepath, label):
    """Read a tab-separated CMNPD export and tag every row with ``label``.

    Args:
        filepath: Path of the TSV file; it must contain a ``SMILES`` column.
        label: Value written to the ``labels`` column of every row.

    Returns:
        A DataFrame holding only the ``SMILES`` and ``labels`` columns,
        in that order.

    Raises:
        ValueError: If the file has no ``SMILES`` column.
    """
    table = pd.read_csv(filepath, sep="\t")
    if "SMILES" in table.columns:
        # Keep just the structure column and attach the constant class label;
        # any pre-existing ``labels`` column in the file is discarded.
        return table[["SMILES"]].assign(labels=label)
    raise ValueError(f"{filepath} does not contain a SMILES column")

# Load each kingdom-specific CMNPD file with its integer class label:
# 0 = Animalia, 1 = Bacteria, 2 = Fungi.
animalia_data = load_cmnpd(cmnpd_animalia, 0)
bacteria_data = load_cmnpd(cmnpd_bacteria, 1)
fungi_data = load_cmnpd(cmnpd_fungi, 2)

# Stack the three tables; ignore_index gives the result a fresh 0..n-1 index
# instead of repeating each file's own row numbers.
cmnpd_combined = pd.concat([animalia_data, bacteria_data, fungi_data], ignore_index=True)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
cmnpd_output = '../data/processed/data_cmnpd_after2000.csv'
os.makedirs(os.path.dirname(cmnpd_output), exist_ok=True)
cmnpd_combined.to_csv(cmnpd_output, index=False)

### Preprocessing NPAtlas: keep bacterial and fungal compounds and encode their labels

In [ ]:
npatlas_file = "../data/raw/NPAtlas_download_2024_09.tsv"

# Integer labels for the two origin types that are kept; the values match the
# bacteria (1) and fungi (2) labels used for the CMNPD files.
origin_labels = {'Bacterium': 1, 'Fungus': 2}

npatlas_data = pd.read_csv(npatlas_file, sep="\t")

# Keep only rows with a known origin type and the two columns of interest.
# The explicit copy makes the filtered table independent of the full frame,
# so the column assignment below does not raise SettingWithCopyWarning.
keep_rows = npatlas_data['origin_type'].isin(list(origin_labels))
npatlas_data = npatlas_data.loc[keep_rows, ['compound_smiles', 'origin_type']].copy()

# Replace the textual origin type with its integer label.
npatlas_data['origin_type'] = npatlas_data['origin_type'].map(origin_labels)

# Use the same column names as the processed CMNPD table.
npatlas_data = npatlas_data.rename(columns={
    'compound_smiles': 'SMILES',
    'origin_type': 'labels'
})

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
npatlas_output = '../data/processed/npatlas.csv'
os.makedirs(os.path.dirname(npatlas_output), exist_ok=True)
npatlas_data.to_csv(npatlas_output, index=False)

# 