In [2]:
import jupyter_black

jupyter_black.load()

In [3]:
import json

import pandas as pd

# proteom_json_path = "../../data/proteomes_proteome_type_1_AND_busco_90_2023_11_09.json"
proteom_json_path = "../../data/proteomes_proteome_type_1_AND_busco_90_AND_taxonomy_id_332082023_11_09.json"

with open(proteom_json_path) as file:
    proteom_data = json.load(file)

In [4]:
data = []

for entry in list(proteom_data.values())[0]:
    proteome_id = entry["id"]
    # proteam score
    proteom_score = entry["proteomeCompletenessReport"]
    busco = proteom_score["buscoReport"]["score"]
    busco_nr_proteins = proteom_score["buscoReport"]["total"]
    cpdReport = proteom_score["cpdReport"]["status"]
    # taxonomy
    taxonomy = entry["taxonomy"]
    name = taxonomy["scientificName"]
    taxon_id = taxonomy["taxonId"]
    # genome
    genome = entry["genomeAssembly"]
    if "assemblyId" not in genome:
        print(f"Taxon ID `{taxon_id}` has no Genome ID")
        continue
    genome_id = genome["assemblyId"]

    data.append(
        (
            proteome_id,
            taxon_id,
            name,
            busco,
            # busco_nr_proteins,
            cpdReport,
            genome_id,
        )
    )
    # TODO: display for debugging
    # print(name, " -> ", busco)
    # print(genome_id)
    # break

df = pd.DataFrame(
    data,
    columns=[
        "proteome_id",
        "taxon_id",
        "name",
        "busco",
        # "busco_nr_proteins",
        "cpdReport",
        "genome_id",
    ],
)

# make `cpdReport` categorical
categories = [
    "Standard",
    "Close to standard (high value)",
    "Close to standard (low value)",
    "Outlier (high value)",
    "Outlier (low value)",
    "Unknown",
]
category_type = pd.CategoricalDtype(categories=categories, ordered=True)
df["cpdReport"] = df["cpdReport"].astype(category_type)

# sort by best scores
df = df.sort_values(by=["busco", "cpdReport"], ascending=[False, True])
df = df.reset_index(drop=True)
df.to_csv("tmp.csv")

Taxon ID `6279` has no Genome ID


In [7]:
proteom_score

{'buscoReport': {'complete': 2817,
  'completeSingle': 2355,
  'completeDuplicated': 462,
  'fragmented': 43,
  'missing': 74,
  'total': 2934,
  'lineageDb': 'arachnida_odb10',
  'score': 97},
 'cpdReport': {'proteomeCount': 0,
  'stdCdss': 0.0,
  'averageCdss': 0,
  'confidence': 0,
  'status': 'Unknown'}}

In [6]:
df["cpdReport"]  # .value_counts()

0      Outlier (high value)
1                  Standard
2                  Standard
3                  Standard
4                  Standard
               ...         
314                 Unknown
315                 Unknown
316                Standard
317    Outlier (high value)
318                 Unknown
Name: cpdReport, Length: 319, dtype: category
Categories (6, object): ['Standard' < 'Close to standard (high value)' < 'Close to standard (low value)' < 'Outlier (high value)' < 'Outlier (low value)' < 'Unknown']

In [3]:
# Now that we have verified the presence of the 'buscoReport' field and its structure, we can parse the JSON file.
# We will load the proteome data and extract the proteome IDs along with their BUSCO 'complete' scores.

# Define a function to find top X proteomes with the highest BUSCO score
def find_top_busco_proteomes(proteomes_data, x=10):
    # Extract proteome ID and BUSCO complete scores
    proteome_busco_scores = []
    for entry in proteomes_data:
        proteome_id = entry.get("id", "")
        busco_score = (
            entry.get("proteomeCompletenessReport", {})
            .get("buscoReport", {})
            .get("score", 0)
        )
        proteome_busco_scores.append((proteome_id, busco_score))

    # Sort by BUSCO score in descending order
    proteome_busco_scores.sort(key=lambda entry: entry[1], reverse=True)

    # Select the top X proteomes
    top_x_proteomes = proteome_busco_scores[:x]

    return top_x_proteomes


# Call the function and print the top X proteomes
top_busco_proteomes = find_top_busco_proteomes(proteom_json_path, x=10)
top_busco_proteomes

AttributeError: 'str' object has no attribute 'get'