In [1]:
import pandas as pd

# Load the XGBoost predictions for the final test set. Per the rendered output,
# the table holds three probability columns (pred_0..pred_2), the predicted
# label ('pred') and the ground truth ('True Label').
xgb_predictions = pd.read_csv(
    '../../results/predictions/final_testset/XGB_predictions.csv'
)

# A bare last expression lets the notebook render the frame as the cell output.
xgb_predictions

Unnamed: 0,pred_0,pred_1,pred_2,pred,True Label
0,0.000131,0.999841,0.000028,1,1
1,0.000014,0.999980,0.000006,1,1
2,0.001567,0.021770,0.976663,2,1
3,0.000220,0.999422,0.000358,1,1
4,0.000075,0.999889,0.000036,1,1
...,...,...,...,...,...
3654,0.000003,0.000004,0.999993,2,2
3655,0.000010,0.000072,0.999917,2,2
3656,0.000094,0.002577,0.997328,2,2
3657,0.000007,0.000015,0.999978,2,2


In [2]:
# Load the SMILES strings of the final test set (a single 'smiles' column plus
# the saved index column) and render the frame as the cell output.
testset = pd.read_csv(
    '../../data/processed/final_testset_smiles.csv'
)

testset

Unnamed: 0,smiles
0,C1(=O)[C@H](CC(=O)N)NC(=O)[C@H](C)NC(=O)[C@H](...
1,N(C(=O)CC(O)C(C)N)[C@@H]([C@H](O)C(=O)O)C(=O)N...
2,NC(=O)C1=C(N)[C@@H](O)[C@H](O)[C@@H](Cl)C1=O
3,NC(=O)C1=C(N)[C@@H](O)C[C@H](O)C1=O
4,C1\C=C\C=C(/C)\C(=O)O[C@H](C(C)C)C\C=C(/C)\C=C...
...,...
3654,C1(\C=C\[C@@H](C)CC)=CC(=C(Cl)C(=O)[C@](O[C@@]...
3655,[C@@H]1(O[C@H]1[C@H](O)[C@@H]2OC(=O)C(C)=C2)C
3656,c1c(OC)c(C(=O)CC[C@@H]2OC(C)=O)c2cc1
3657,c1c(OC)c(c(O)ccc2OC)c2cc1


In [3]:
import pandas as pd

import os

# Predictions and SMILES are joined purely by row position below, so both
# tables must describe the same test set. Fail loudly instead of silently
# pairing predictions with the wrong molecules.
if len(xgb_predictions) != len(testset):
    raise ValueError(
        f"Row count mismatch: {len(xgb_predictions)} predictions vs "
        f"{len(testset)} test-set SMILES; cannot align them by position."
    )

# Keep only the rows where the predicted class equals the true label.
matched = xgb_predictions[xgb_predictions['pred'] == xgb_predictions['True Label']]

# Look up the SMILES of the correctly classified rows via their shared index.
matched_smiles = testset.loc[matched.index, 'smiles']

matched_smiles_df = pd.DataFrame({
    'smiles': matched_smiles,
    'label': matched['pred']
})

# to_csv does not create missing directories, so make sure the target exists.
output_dir = '../../data/processed/npclassifier'
os.makedirs(output_dir, exist_ok=True)

# Write one single-column CSV of SMILES per predicted class.
for label in [0, 1, 2]:
    subset = matched_smiles_df[matched_smiles_df['label'] == label]
    subset[['smiles']].to_csv(f'{output_dir}/class_{label}_smiles.csv', index=False)

In [None]:
import pandas as pd
import requests
import time
import os
from urllib.parse import quote

def _first_or_none(values):
    """Return the first element of ``values``, or None if it is missing or empty."""
    return values[0] if values else None


def query_npclassifier(smiles):
    """Classify one SMILES string with the NPClassifier web service.

    Sends a GET request to the ``/classify`` endpoint and reads the first entry
    of ``pathway_results``, ``superclass_results`` and ``class_results`` from
    the JSON response.

    Parameters
    ----------
    smiles : str
        SMILES string to classify; it is URL-quoted before being sent.

    Returns
    -------
    tuple
        ``(pathway, superclass, class)``. Each element is None when the
        service returned no entry for that level. All three are None when the
        request failed, returned a non-200 status, or raised an exception.
    """
    try:
        encoded_smiles = quote(smiles)
        url = f'https://npclassifier.gnps2.org/classify?smiles={encoded_smiles}'
        response = requests.get(url, timeout=10)

        if response.status_code != 200:
            print(f"Failed for SMILES: {smiles} with status code {response.status_code}")
            return None, None, None

        data = response.json()
        # A level may come back as an empty list (or be absent). Indexing [0]
        # directly would raise IndexError and throw away the levels that were
        # predicted, so each level falls back to None independently.
        pathway = _first_or_none(data.get('pathway_results'))
        superclass = _first_or_none(data.get('superclass_results'))
        _class = _first_or_none(data.get('class_results'))
        print(f'processed {smiles} with pathway: {pathway}, superclass: {superclass}, class: {_class}')
        return pathway, superclass, _class
    except Exception as e:
        # Deliberately best-effort: one bad SMILES or a network hiccup must not
        # abort the whole batch; the caller records the row as incomplete.
        print(f"Error querying SMILES: {smiles}, error: {e}")
        return None, None, None

# Classify every per-class SMILES file with NPClassifier, checkpointing the
# results to CSV after each query so an interrupted run can be resumed.
for label in [0, 1, 2]:
    input_file = f'../../data/processed/npclassifier/class_{label}_smiles.csv'
    output_file = f'../../data/processed/npclassifier/class_{label}_npclassified.csv'
    result_columns = ['Pathway', 'Superclass', 'Class']

    df = pd.read_csv(input_file)

    if os.path.exists(output_file):
        df_existing = pd.read_csv(output_file)
        # Rows already written but missing at least one classification level.
        df_incomplete = df_existing[
            df_existing[result_columns].isnull().any(axis=1)
        ]
        print(f"Resuming: {len(df_incomplete)} incomplete entries to fill.")

        # An interrupted run leaves input SMILES that never reached the output
        # file; without this they would be skipped forever on resume.
        df_unprocessed = df[~df['smiles'].isin(df_existing['smiles'])]
        if len(df_unprocessed) > 0:
            print(f"Resuming: {len(df_unprocessed)} SMILES not yet processed.")
            df_incomplete = pd.concat(
                [df_incomplete[['smiles']], df_unprocessed[['smiles']]],
                ignore_index=True,
            )
    else:
        df_existing = pd.DataFrame(columns=['smiles'] + result_columns)
        df_incomplete = df.copy()
        print(f"First run: {len(df_incomplete)} SMILES to process.")

    for i, row in df_incomplete.iterrows():
        smiles = row['smiles']
        pathway, superclass, _class = query_npclassifier(smiles)

        # Existing rows for this SMILES that still have a missing level are
        # updated in place; otherwise the result is appended as a new row.
        mask = (df_existing['smiles'] == smiles) & (
            df_existing[result_columns].isnull().any(axis=1)
        )
        if mask.any():
            df_existing.loc[mask, result_columns] = pathway, superclass, _class
        else:
            df_existing = pd.concat([
                df_existing,
                pd.DataFrame([{'smiles': smiles, 'Pathway': pathway, 'Superclass': superclass, 'Class': _class}])
            ], ignore_index=True)

        # Checkpoint after every query so progress survives an interruption.
        df_existing.to_csv(output_file, index=False)
        # Pause between requests to avoid hammering the public service.
        time.sleep(1)

    print(f"Finished label {label}")

In [2]:
import pandas as pd
import plotly.express as px

# Sunburst of the NPClassifier hierarchy for the class_0 compounds
# (figure titled "Animalia"), exported as a standalone HTML file.
file_path = "../../data/processed/npclassifier/class_0_npclassified.csv"
hierarchy_levels = ["Pathway", "Superclass", "Class"]

# Only rows with all three hierarchy levels present are plotted.
df = pd.read_csv(file_path).dropna(subset=hierarchy_levels)

fig = px.sunburst(
    df,
    path=hierarchy_levels,
    color="Pathway",
    color_discrete_sequence=px.colors.qualitative.Set2,
)

# Centre the title and trim the margins around the chart.
fig.update_layout(
    title_text="Animalia",
    title_x=0.5,
    margin={"t": 40, "l": 0, "r": 0, "b": 20},
)

# auto_open=True also opens the written file in the default browser.
fig.write_html("../../results/figures/ani_sunburst.html", auto_open=True)

In [3]:
import pandas as pd
import plotly.express as px

# Sunburst of the NPClassifier hierarchy for the class_1 compounds
# (figure titled "Bacteria"), exported as a standalone HTML file.
file_path = "../../data/processed/npclassifier/class_1_npclassified.csv"
hierarchy_levels = ["Pathway", "Superclass", "Class"]

# Only rows with all three hierarchy levels present are plotted.
df = pd.read_csv(file_path).dropna(subset=hierarchy_levels)

fig = px.sunburst(
    df,
    path=hierarchy_levels,
    color="Pathway",
    color_discrete_sequence=px.colors.qualitative.Set2,
)

# Centre the title and trim the margins around the chart.
fig.update_layout(
    title_text="Bacteria",
    title_x=0.5,
    margin={"t": 40, "l": 0, "r": 0, "b": 20},
)

# auto_open=True also opens the written file in the default browser.
fig.write_html("../../results/figures/bac_sunburst.html", auto_open=True)

In [4]:
import pandas as pd
import plotly.express as px

# Sunburst of the NPClassifier hierarchy for the class_2 compounds
# (figure titled "Fungi"), exported as a standalone HTML file.
file_path = "../../data/processed/npclassifier/class_2_npclassified.csv"
hierarchy_levels = ["Pathway", "Superclass", "Class"]

# Only rows with all three hierarchy levels present are plotted.
df = pd.read_csv(file_path).dropna(subset=hierarchy_levels)

fig = px.sunburst(
    df,
    path=hierarchy_levels,
    color="Pathway",
    color_discrete_sequence=px.colors.qualitative.Set2,
)

# Centre the title and trim the margins around the chart.
fig.update_layout(
    title_text="Fungi",
    title_x=0.5,
    margin={"t": 40, "l": 0, "r": 0, "b": 20},
)

# auto_open=True also opens the written file in the default browser.
fig.write_html("../../results/figures/fun_sunburst.html", auto_open=True)