In [1]:
import pandas as pd
import os

In [2]:
# Root directory that contains one sub-directory per dataset
# (presumably leave-one-out cross-validation results -- confirm).
FOLDER = "loocv-results"

# Dataset sub-directories of FOLDER that are processed by the loops below.
DATA_BASE = [
    "Chembl28CCandD-loocv",
    "global-loocv",
    "e-loocv",
    "gpcr-loocv",
    "ic-loocv",
    "nr-loocv",
]

# Fingerprint name -> numeric value used to build the result file name
# ``<value>.out`` inside ``FOLDER/<dataset>/<fingerprint>/``.
# NOTE(review): the value looks like a per-fingerprint cutoff -- confirm.
FINGERPRINTS = {
    "ecfp4": 0.2,
    "fcfp4": 0.3,
    "maccs": 0.6,
}

### Import df

In [3]:
def import_df(file_path):
    """Load a header-less, five-column result file into a cleaned DataFrame.

    The columns are named ``Fold``, ``Ligand``, ``Target``, ``Score`` and
    ``TP``. Double quotes and surrounding whitespace are removed from
    ``Ligand`` and ``Target``; ``TP`` is cast to ``int`` and ``Score`` to
    ``float``. Duplicate rows are dropped *after* the clean-up, so rows that
    differ only by stray quotes or padding collapse into one.

    Parameters
    ----------
    file_path : str, path object or file-like object
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Cleaned, de-duplicated rows. The index keeps the original row
        positions (it is not reset after dropping duplicates).

    Raises
    ------
    ValueError
        If the file does not have exactly five columns.
    """
    df = pd.read_csv(file_path, header=None)
    df.columns = ['Fold', 'Ligand', 'Target', 'Score', 'TP']

    # Normalize the text columns first: a quote that follows leading
    # whitespace is kept literally by the CSV parser, so it is removed here.
    for column in ('Ligand', 'Target'):
        df[column] = df[column].str.replace('"', '', regex=False).str.strip()

    df['TP'] = df['TP'].astype(int)
    df['Score'] = df['Score'].astype(float)

    # De-duplicate last so that rows which only became identical through the
    # clean-up above are collapsed as well.
    return df.drop_duplicates()

### Save

In [None]:
# Export the cleaned (not normalized) results: one CSV per dataset/fingerprint.
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before the first write.
os.makedirs("out/raw_results", exist_ok=True)
for db in DATA_BASE:
    for fingerprint, value in FINGERPRINTS.items():
        print(f"En: {db}: {fingerprint}")
        # Input layout: <FOLDER>/<dataset>/<fingerprint>/<value>.out
        file_path = os.path.join(FOLDER, db, fingerprint, f"{value}.out")
        df = import_df(file_path)
        df.to_csv(f"out/raw_results/{db}_{fingerprint}_pool.csv", index=False)

En: Chembl28CCandD-loocv: ecfp4
En: Chembl28CCandD-loocv: fcfp4
En: Chembl28CCandD-loocv: maccs
En: global-loocv: ecfp4
En: global-loocv: fcfp4
En: global-loocv: maccs
En: e-loocv: ecfp4
En: e-loocv: fcfp4
En: e-loocv: maccs
En: gpcr-loocv: ecfp4
En: gpcr-loocv: fcfp4
En: gpcr-loocv: maccs
En: ic-loocv: ecfp4
En: ic-loocv: fcfp4
En: ic-loocv: maccs
En: nr-loocv: ecfp4
En: nr-loocv: fcfp4
En: nr-loocv: maccs


### Normalize

In [5]:
def norm(df):
    """Rescale each score by the total score of its ligand.

    For every row, ``Score`` is divided by the sum of ``Score`` over all rows
    that share the same ``Ligand``. Rows whose normalized score is not
    ``>= 0`` are dropped; this also removes the NaN produced by ``0 / 0``
    when every score of a ligand is zero.

    NOTE(review): the sum is taken per ligand across all folds, not per
    (Fold, Ligand) pair -- confirm this is the intended grouping.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Fold``, ``Ligand``, ``Target``, ``Score``
        and ``TP``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Fold``, ``Ligand``, ``Target``, ``Score``, ``TP`` where
        ``Score`` holds the normalized value.
    """
    # Per-ligand totals, merged back so every row carries its ligand's sum.
    totals = df.groupby('Ligand')['Score'].sum().reset_index()
    totals.columns = ['Ligand', 'Score_Sum']
    merged = df.merge(totals, on='Ligand')

    # Overwrite Score on the freshly merged frame instead of renaming a
    # column slice in place, which can raise SettingWithCopyWarning.
    merged['Score'] = merged['Score'] / merged['Score_Sum']

    # Output columns, in the order they are written to disk.
    result = merged[["Fold", "Ligand", "Target", "Score", "TP"]]
    return result[result['Score'] >= 0]

In [6]:
# Export the per-ligand normalized results: one CSV per dataset/fingerprint.
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before the first write.
os.makedirs("out/normalized", exist_ok=True)
for db in DATA_BASE:
    for fingerprint, value in FINGERPRINTS.items():
        print(f"En: {db}: {fingerprint}")
        # Input layout: <FOLDER>/<dataset>/<fingerprint>/<value>.out
        file_path = os.path.join(FOLDER, db, fingerprint, f"{value}.out")
        df = import_df(file_path)
        df = norm(df)
        df.to_csv(f"out/normalized/{db}_{fingerprint}_normalized.csv", index=False)

En: Chembl28CCandD-loocv: ecfp4
En: Chembl28CCandD-loocv: fcfp4
En: Chembl28CCandD-loocv: maccs
En: global-loocv: ecfp4
En: global-loocv: fcfp4
En: global-loocv: maccs
En: e-loocv: ecfp4
En: e-loocv: fcfp4
En: e-loocv: maccs
En: gpcr-loocv: ecfp4
En: gpcr-loocv: fcfp4
En: gpcr-loocv: maccs
En: ic-loocv: ecfp4
En: ic-loocv: fcfp4
En: ic-loocv: maccs
En: nr-loocv: ecfp4
En: nr-loocv: fcfp4
En: nr-loocv: maccs
