In [8]:
!docker run hello-world


Hello from Docker!
This message shows that your installation appears to be working correctly.

To generate this message, Docker took the following steps:
 1. The Docker client contacted the Docker daemon.
 2. The Docker daemon pulled the "hello-world" image from the Docker Hub.
    (amd64)
 3. The Docker daemon created a new container from that image which runs the
    executable that produces the output you are currently reading.
 4. The Docker daemon streamed that output to the Docker client, which sent it
    to your terminal.

To try something more ambitious, you can run an Ubuntu container with:
 $ docker run -it ubuntu bash

Share images, automate workflows, and more with a free Docker ID:
 https://hub.docker.com/

For more examples and ideas, visit:
 https://docs.docker.com/get-started/



In [9]:
import os
import zipfile
import subprocess
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shutil

from Bio import SeqIO
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [10]:
print(os.listdir('..'))

['view?usp=drive_link', '.idea', 'TrainFiles.zip', 'data_extraction.ipynb', 'main.py', '.venv']


In [11]:
# Define your zip files and corresponding output directories
zip_targets = {
    'TrainFiles.zip': './',
}

for zip_path, extract_to in zip_targets.items():
    # Create the output directory if it doesn't exist
    os.makedirs(extract_to, exist_ok=True)

    # Extract zip content
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
        print(f"✅ Extracted {zip_path} to ./{extract_to}/")

✅ Extracted TrainFiles.zip to ././/


In [13]:
!docker pull muefab/genie:latest

latest: Pulling from muefab/genie
Digest: sha256:c3112a3879cc18061bbab5ed8f76dec255ab1be46e2133cd59320dd5ba98ef89
Status: Image is up to date for muefab/genie:latest
docker.io/muefab/genie:latest


In [19]:
notebook_dir = os.getcwd()

# Pick one `.mgb` file from TrainFiles
mgb_filename = "ID_AAFNOT.mgb"
mgb_filename_no_mgb = mgb_filename[:-4]
train_dir = os.path.join(os.getcwd(), "TrainFiles")
mgb_file_path = os.path.join(train_dir, mgb_filename)

# Output location for decoded FASTQ
output_fastq = f"{mgb_filename_no_mgb}.fastq"

# Docker mount paths
host_dir = train_dir                # Local directory with the `.mgb` file
container_dir = "/data"             # Directory inside the container

# Show paths
print(f"📁 Host path to `.mgb`: {mgb_file_path}")
print(f"📁 Host directory mounted: {host_dir}")
print(f"📦 Container directory will be: {container_dir}")
print(f"📄 Output FASTQ: {output_fastq}")

📁 Host path to `.mgb`: /home/valentinabuoro4/mpeg-folder/TrainFiles/ID_AAFNOT.mgb
📁 Host directory mounted: /home/valentinabuoro4/mpeg-folder/TrainFiles
📦 Container directory will be: /data
📄 Output FASTQ: ID_AAFNOT.fastq


In [20]:
def inspect_mgb_structure(host_dir=".", container_dir="/work", mgb_filename=mgb_filename):
    command = [
        "docker", "run", "--rm",
        "-v", f"{host_dir}:{container_dir}",
        "muefab/genie:latest", "run",  # ✅ Add "run" subcommand here
        "-f",
        "-i", f"{container_dir}/TrainFiles/{mgb_filename}",
        "-o", f"{container_dir}/TrainFiles/{mgb_filename_no_mgb}.fastq"
    ]
    print("Running:", " ".join(command))
    result = subprocess.run(command, capture_output=True, text=True)
    print("\n--- STDOUT ---\n")
    print(result.stdout)
    if result.stderr:
        print("\n--- STDERR ---\n")
        print(result.stderr)

inspect_mgb_structure()

Running: docker run --rm -v .:/work muefab/genie:latest run -f -i /work/TrainFiles/ID_AAFNOT.mgb -o /work/TrainFiles/ID_AAFNOT.fastq

--- STDOUT ---

[INFO,      0.000s, App]:    ______           _
[INFO,      0.000s, App]:   / ____/__  ____  (_)__
[INFO,      0.000s, App]:  / / __/ _ \/ __ \/ / _ \
[INFO,      0.000s, App]: / /_/ /  __/ / / / /  __/
[INFO,      0.000s, App]: \____/\___/_/ /_/_/\___/
[INFO,      0.000s, App]: Command: /usr/local/bin/genie run -f -i /work/TrainFiles/ID_AAFNOT.mgb -o /work/TrainFiles/ID_AAFNOT.fastq 
[INFO,      0.000s, App/Run]: Input file 1: /work/TrainFiles/ID_AAFNOT.mgb with size 1.34MiB
[INFO,      0.000s, App/Run]: Working directory: /work/TrainFiles with 338GiB available
[INFO,      0.001s, App/Run]: Output file: /work/TrainFiles/ID_AAFNOT.fastq with 338GiB available
[INFO,      0.001s, App/Run]: Threads: 4 with 4 supported
[INFO,      0.001s, Spring]: Temporary directory: /work/TrainFiles/tmp.t0rQ4BLb2X/
[INFO,      0.001s, Spring]: Temporary dir

In [21]:
# Safer path for Windows (forward slashes or raw string)
fastq_path = os.path.join(os.getcwd(), train_dir, f"{mgb_filename_no_mgb}.fastq")

# Check if the file exists before parsing
if not os.path.exists(fastq_path):
    print(f"❌ FASTQ file not found at: {fastq_path}")
else:
    total_reads = 0
    read_lengths = []
    quality_scores = []

    for record in SeqIO.parse(fastq_path, "fastq"):
        total_reads += 1
        read_lengths.append(len(record.seq))
        quality_scores.extend(record.letter_annotations["phred_quality"])

    print(f"🔍 Total reads: {total_reads}")
    print(f"📏 Avg read length: {sum(read_lengths)/len(read_lengths):.1f} bp")
    print(f"🎯 Avg quality score: {sum(quality_scores)/len(quality_scores):.1f}")


🔍 Total reads: 42840
📏 Avg read length: 124.5 bp
🎯 Avg quality score: 33.8


In [22]:
print("🧪 First 3 reads:\n")
for i, record in enumerate(SeqIO.parse(fastq_path, "fastq")):
    print(f"🔹 ID: {record.id}")
    print(f"🔹 SEQ: {record.seq[:50]}...")  # just preview first 50 bp
    print(f"🔹 QUALITY: {record.letter_annotations['phred_quality'][:10]}...\n")
    if i >= 2:
        break

🧪 First 3 reads:

🔹 ID: NB501656:321:HTWL2AFXX:1:11101:20537:1996
🔹 SEQ: TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTA...
🔹 QUALITY: [36, 36, 36, 36, 36, 36, 36, 36, 36, 36]...

🔹 ID: NB501656:321:HTWL2AFXX:1:11101:20537:1996
🔹 SEQ: CCTGTTTGATCCCCACGCTTTCGCACATCAGCGTCAGTTACAGACCAGAA...
🔹 QUALITY: [36, 36, 36, 36, 36, 36, 36, 36, 32, 36]...

🔹 ID: NB501656:321:HTWL2AFXX:4:21612:15914:20008
🔹 SEQ: TACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGTA...
🔹 QUALITY: [36, 36, 36, 36, 36, 36, 36, 36, 36, 36]...



In [23]:
# Set base directories
notebook_dir = os.getcwd()
container_dir = "/data"  # This is the container's path

def decode_all_mgb_in_folder(folder_name):
    host_dir = os.path.join(notebook_dir, folder_name)
    for mgb_filename in os.listdir(host_dir):
        if not mgb_filename.endswith(".mgb"):
            continue

        mgb_filename_no_ext = os.path.splitext(mgb_filename)[0]
        print(f"\n🔄 Decoding: {mgb_filename}")

        command = [
            "docker", "run", "--rm",
            "-v", f"{host_dir}:{container_dir}",
            "muefab/genie:latest", "run",
            "-f",
            "-i", f"{container_dir}/{mgb_filename}",
            "-o", f"{container_dir}/{mgb_filename_no_ext}.fastq"
        ]

        print("Running:", " ".join(command))
        result = subprocess.run(command, capture_output=True, text=True)

        """
        Caution on printing out each line as this does take up memory.

        print("\n--- STDOUT ---\n")
        print(result.stdout)
        if result.stderr:
            print("\n--- STDERR ---\n")
            print(result.stderr)#

        """

In [24]:
decode_all_mgb_in_folder("TrainFiles")


🔄 Decoding: ID_ZJFLTR.mgb
Running: docker run --rm -v /home/valentinabuoro4/mpeg-folder/TrainFiles:/data muefab/genie:latest run -f -i /data/ID_ZJFLTR.mgb -o /data/ID_ZJFLTR.fastq

🔄 Decoding: ID_HBDGLE.mgb
Running: docker run --rm -v /home/valentinabuoro4/mpeg-folder/TrainFiles:/data muefab/genie:latest run -f -i /data/ID_HBDGLE.mgb -o /data/ID_HBDGLE.fastq

🔄 Decoding: ID_LEKKJQ.mgb
Running: docker run --rm -v /home/valentinabuoro4/mpeg-folder/TrainFiles:/data muefab/genie:latest run -f -i /data/ID_LEKKJQ.mgb -o /data/ID_LEKKJQ.fastq

🔄 Decoding: ID_VVNFHD.mgb
Running: docker run --rm -v /home/valentinabuoro4/mpeg-folder/TrainFiles:/data muefab/genie:latest run -f -i /data/ID_VVNFHD.mgb -o /data/ID_VVNFHD.fastq

🔄 Decoding: ID_JZEQQC.mgb
Running: docker run --rm -v /home/valentinabuoro4/mpeg-folder/TrainFiles:/data muefab/genie:latest run -f -i /data/ID_JZEQQC.mgb -o /data/ID_JZEQQC.fastq

🔄 Decoding: ID_VNRKYC.mgb
Running: docker run --rm -v /home/valentinabuoro4/mpeg-folder/TrainF