In [2]:
import pandas as pd
from pathlib import Path

# Paths
mobileelements_dir = Path("/nobackup/zxww47/flye_mobile_elements_copy")
metadata_file = Path("/nobackup/zxww47/SraRunTable.csv")
output_file = Path("/nobackup/zxww47/merged_mge.csv")

# Load metadata
metadata_df = pd.read_csv(metadata_file)

# List all mobile_elements CSVs
mobile_csvs = list(mobileelements_dir.glob("*_mobile_elements.csv"))
print(f"Found {len(mobile_csvs)} mobile_elements.csv files")

# Read and merge them
all_data = []
for csv_file in mobile_csvs:
    try:
        df = pd.read_csv(csv_file)
        accession = csv_file.stem.replace("_mobile_elements", "")  # extract Run code
        df.insert(0, "Run", accession)
        all_data.append(df)
    except Exception as e:
        print(f"⚠️ Error reading {csv_file}: {e}")

# Combine into one DataFrame
mobile_df = pd.concat(all_data, ignore_index=True)

# Merge with metadata
merged_df = pd.merge(mobile_df, metadata_df, on="Run", how="left")

# Save merged data
merged_df.to_csv(output_file, index=False)
print(f"✅ Merged file saved to {output_file}")



Found 643 mobile_elements.csv files


  mobile_df = pd.concat(all_data, ignore_index=True)


✅ Merged file saved to /nobackup/zxww47/merged_mge.csv


In [4]:
import pandas as pd
from pathlib import Path

# Base paths
base_dir = Path("/nobackup/zxww47/flye")
output_base = Path("/nobackup/zxww47")
metadata_file = output_base / "SraRunTable.csv"

# Output folders
db_map = {
    "plasmidfinder": output_base / "Plasmids",
    "resfinder": output_base / "Resfinder",
    "vfdb": output_base / "Vfdb",
}

# Ensure output dirs exist
for folder in db_map.values():
    folder.mkdir(parents=True, exist_ok=True)

# Load metadata
metadata = pd.read_csv(metadata_file, dtype=str)

# Helper: extract run code from filename (before first "_")
def extract_run_code(filename: str) -> str:
    return Path(filename).stem.split("_")[0]

# Process each db type
for db_name, out_dir in db_map.items():
    print(f"🔹 Processing {db_name} files...")

    # Find all TSV files matching this db
    tsv_files = list(base_dir.rglob(f"*_{db_name}_abricate.tsv"))
    print(f"Found {len(tsv_files)} files for {db_name}.")

    dfs = []
    for tsv_file in tsv_files:
        try:
            df = pd.read_csv(tsv_file, sep="\t", dtype=str)  # keep everything as text

            # Drop #FILE column if present
            if "#FILE" in df.columns:
                df = df.drop(columns=["#FILE"])

            # Add run code column
            run_code = extract_run_code(tsv_file.name)
            df.insert(0, "Run", run_code)

            # Save individual CSV to correct folder
            csv_file = out_dir / (tsv_file.stem + ".csv")
            df.to_csv(csv_file, index=False)

            dfs.append(df)

        except Exception as e:
            print(f"⚠️ Error processing {tsv_file}: {e}")

    # Combine into one CSV
    if dfs:
        combined = pd.concat(dfs, ignore_index=True)

        # Merge with metadata (both use "Run")
        merged = combined.merge(metadata, on="Run", how="left")

        # Save combined + merged file
        combined_file = out_dir / f"{db_name}_combined.csv"
        merged.to_csv(combined_file, index=False)

        print(f"✅ Combined file saved: {combined_file}")
    else:
        print(f"⚠️ No valid files found for {db_name}.")

print("🎉 All processing complete.")


🔹 Processing plasmidfinder files...
Found 596 files for plasmidfinder.
✅ Combined file saved: /nobackup/zxww47/Plasmids/plasmidfinder_combined.csv
🔹 Processing resfinder files...
Found 596 files for resfinder.
✅ Combined file saved: /nobackup/zxww47/Resfinder/resfinder_combined.csv
🔹 Processing vfdb files...
Found 596 files for vfdb.
✅ Combined file saved: /nobackup/zxww47/Vfdb/vfdb_combined.csv
🎉 All processing complete.


In [5]:
import chromadb
import pandas as pd
from chromadb.utils import embedding_functions

# Persistent client
client = chromadb.PersistentClient(path="chroma_storage_nomic")

# Nomic embeddings
nomic_ef = embedding_functions.OllamaEmbeddingFunction(
    model="nomic-embed-text",
    url="http://127.0.0.1:11434/api/embeddings"
)

def create_collection(name):
    return client.create_collection(name=name, embedding_function=nomic_ef)

def load_csv_to_chroma(csv_path, collection, batch_size=100):
    df = pd.read_csv(csv_path)

    docs, ids, metas = [], [], []

    for i, row in df.iterrows():
        accession = str(row.get("Run", "NA"))

        # Choose text fields dynamically depending on dataset
        if "RESISTANCE" in df.columns:   # resfinder
            text = f"""
            Gene: {row.get('GENE','')}
            Product: {row.get('PRODUCT','')}
            Resistance: {row.get('RESISTANCE','')}
            Organism: {row.get('Organism','')}
            Database: {row.get('DATABASE','')}
            AccessionID: {row.get('ACCESSION','')}
            %Identity: {row.get('%IDENTITY','')}
            %Coverage: {row.get('%COVERAGE','')}
            """
        elif "vfdb" in csv_path:  # vfdb
            text = f"""
            Gene: {row.get('GENE','')}
            Product: {row.get('PRODUCT','')}
            Organism: {row.get('Organism','')}
            Database: {row.get('DATABASE','')}
            AccessionID: {row.get('ACCESSION','')}
            %Identity: {row.get('%IDENTITY','')}
            %Coverage: {row.get('%COVERAGE','')}
            """
        elif "plasmidfinder" in csv_path:  # plasmidfinder
            text = f"""
            Gene: {row.get('GENE','')}
            Product: {row.get('PRODUCT','')}
            Organism: {row.get('Organism','')}
            Database: {row.get('DATABASE','')}
            AccessionID: {row.get('ACCESSION','')}
            %Identity: {row.get('%IDENTITY','')}
            %Coverage: {row.get('%COVERAGE','')}
            """
        elif "mge" in csv_path.lower():  # mobile genetic elements
            text = f"""
            Name: {row.get('name','')}
            Synonyms: {row.get('synonyms','')}
            Type: {row.get('type','')}
            Prediction: {row.get('prediction','')}
            Organism: {row.get('Organism','')}
            Identity: {row.get('identity','')}
            Coverage: {row.get('coverage','')}
            """
        else:
            text = str(row.to_dict())

        # Always keep accession + basic metadata
        docs.append(text.strip())
        ids.append(f"{accession}_{i}")
        metas.append({
            "accession": accession,
            "gene_db_acc": str(row.get("ACCESSION", "")),
            "collection_date": str(row.get("Collection_Date", "")),
            "country": str(row.get("geo_loc_name_country", "")),
            "continent": str(row.get("geo_loc_name_country_continent", "")),
            "host": str(row.get("HOST", "")),
            "organism": str(row.get("Organism", "")),
        })

        if len(docs) >= batch_size:
            collection.add(documents=docs, ids=ids, metadatas=metas)
            docs, ids, metas = [], [], []

    if docs:
        collection.add(documents=docs, ids=ids, metadatas=metas)

# === Build all four collections ===
resfinder_col = create_collection("resfinder")
vfdb_col = create_collection("vfdb")
plasmid_col = create_collection("plasmidfinder")
mge_col = create_collection("mge")

load_csv_to_chroma("/nobackup/zxww47/Resfinder/resfinder_combined.csv", resfinder_col)
load_csv_to_chroma("/nobackup/zxww47/Vfdb/vfdb_combined.csv", vfdb_col)
load_csv_to_chroma("/nobackup/zxww47/Plasmids/plasmidfinder_combined.csv", plasmid_col)
load_csv_to_chroma("/nobackup/zxww47/flye_mobile_elements_copy/merged_mge.csv", mge_col)

print("✅ All collections rebuilt with Nomic embeddings")


TypeError: OllamaEmbeddingFunction.__init__() got an unexpected keyword argument 'model'