In [1]:
import pandas as pd
import urllib.request
import json
import concurrent.futures
from tqdm import tqdm  # Progress bar


# Filter the assembly stats for specific version and assembler

In [2]:
# Read the CSV file
stats_df = pd.read_csv("input/mgnify_assemblies_stats.csv")

# Filter for assembler "metaspades"
filtered_df = stats_df[stats_df["assembler"] == "metaspades"]

# Count samples per assembler version
version_counts = filtered_df["assembler_version"].value_counts()

# Select the latest version (assuming 3.15.3 is the latest)
latest_version = "3.15.3"
latest_version_df = filtered_df[filtered_df["assembler_version"] == latest_version]

# Count occurrences of each lineage
lineage_counts = latest_version_df["lineage"].value_counts()

# Print the results
print(f"Sample count per assembler version:\n{version_counts}\n")
print(f"Lineage count for assembler version {latest_version}:\n{lineage_counts}")


Sample count per assembler version:
assembler_version
3.15.3    9102
3.12.0    5621
3.14.1    5542
3.10.0    2669
3.13.0    1773
3.11.1     445
3.10.1     107
3.11.0      54
3.13.1       8
3.14         3
Name: count, dtype: int64

Lineage count for assembler version 3.15.3:
lineage
root:Host-associated:Human:Digestive system:Large intestine:Fecal      2063
root:Host-associated:Mammals:Gastrointestinal tract:Intestine:Fecal    1439
root:Environmental:Aquatic:Marine                                       756
root:Host-associated:Human:Digestive system:Oral                        743
root:Host-associated:Human:Skin                                         433
root:Host-associated:Insecta:Digestive system                           377
root:Host-associated:Human:Digestive system                             354
root:Host-associated:Human:Excretory system:Urethra:Urine               331
root:Host-associated:Mammals:Digestive system:Large intestine:Fecal     267
root:Host-associated:Mammals:Gast

# Add SSR via EBI API

In [3]:
# Use the latest version DataFrame
stats_df = latest_version_df.copy()

# Function to fetch SRR ID
def fetch_srr_id(erz_id):
    try:
        with urllib.request.urlopen(f"https://www.ebi.ac.uk/metagenomics/api/v1/assemblies/{erz_id}?format=json") as url:
            data = json.load(url)
        srr_id = data["data"]["relationships"]["runs"]["data"][0]["id"]
        return erz_id, srr_id  # Return tuple
    except Exception as e:
        return erz_id, None  # Return None instead of printing for efficiency

# Use threading for I/O-bound operations
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:  # Adjust workers as needed
    future_to_erz = {executor.submit(fetch_srr_id, erz_id): erz_id for erz_id in stats_df["primary_accession"]}
    
    # Use tqdm for progress tracking
    for future in tqdm(concurrent.futures.as_completed(future_to_erz), total=len(future_to_erz)):
        results.append(future.result())

# Convert results to a dictionary and update DataFrame
srr_dict = dict(results)
stats_df["srr_id"] = stats_df["primary_accession"].map(srr_dict)

# Save updated DataFrame to a new CSV
stats_df.to_csv("updated_mgnify_assemblies_stats_v3.15.3_metaspades.csv", index=False)


100%|██████████| 9102/9102 [1:15:51<00:00,  2.00it/s]


# Pick biomes, store full list and only SRR for Galaxy

In [12]:
# Get 30 samples per biome (otherwise this runs too long !!)

stats_df = pd.read_csv("updated_mgnify_assemblies_stats_v3.15.3_metaspades.csv")
stats_df["Biomes"] = stats_df["lineage"].str.split(':').str[1:3].str.join(':')
stats_df["Biomes"].value_counts()

new_df = stats_df.groupby("Biomes", group_keys=False).apply(lambda x: x.sample(n=min(len(x), 30), random_state=42))
new_df.head()

print(new_df["Biomes"].value_counts())

new_df.to_csv("updated_mgnify_assemblies_stats_v3.15.3_metaspades_subset.csv", index=False)
new_df = new_df.loc[:,"srr_id"]
new_df.to_csv("updated_mgnify_assemblies_stats_v3.15.3_metaspades_subset_srr.csv", index=False, header=False)

Biomes
Engineered:Wastewater         30
Environmental:Aquatic         30
Environmental:Terrestrial     30
Host-associated:Animal        30
Host-associated:Fish          30
Host-associated:Human         30
Host-associated:Insecta       30
Host-associated:Mammals       30
Host-associated:Plants        30
Host-associated:Birds         16
Host-associated:Arthropoda     6
Mixed                          4
Host-associated:Porifera       3
Engineered:Solid waste         1
Name: count, dtype: int64


  new_df = stats_df.groupby("Biomes", group_keys=False).apply(lambda x: x.sample(n=min(len(x), 30), random_state=42))
