In [2]:
# Count the lines that begin with ">" (one per sequence header, assuming the
# file uses a FASTA-style layout).
with open("apple.genome") as f:
    count = sum(1 for line in f if line.startswith(">"))

print("Number of chromosomes:", count)

Number of chromosomes: 3


In [7]:
%%bash
# Anchor the pattern so only lines that START with ">" are counted; an
# unanchored ">" would also count any line that merely contains the character.
grep -c "^>" apple.genome

3


In [1]:
genes = set()
total_nonempty = 0

with open("apple.genes", "r", encoding="utf-8") as f:
    for raw in f:
        record = raw.strip()
        # Only data rows count: blank lines and "#" comment lines are ignored.
        if record and not record.startswith("#"):
            total_nonempty += 1
            # The gene ID is whatever precedes the first tab.
            gene_id = record.partition("\t")[0].strip()
            if gene_id:
                genes.add(gene_id)

print("Total non-empty records:", total_nonempty)
print("Unique genes:", len(genes))
# Every record beyond the first for a given gene ID shows up in this difference.
print("Duplicates found:", total_nonempty - len(genes))

Total non-empty records: 5456
Unique genes: 5453
Duplicates found: 3


In [2]:
from collections import Counter

# Tally how many data rows carry each first-column gene ID.
with open("apple.genes") as f:
    rows = (raw.strip() for raw in f)
    gene_counts = Counter(
        row.split("\t", 1)[0]
        for row in rows
        if row and not row.startswith("#")  # skip blanks and comment lines
    )

# Keep only the IDs that occur more than once (first-seen order is preserved).
duplicates = {}
for g, c in gene_counts.items():
    if c > 1:
        duplicates[g] = c

print("Duplicated gene IDs:")
for g, c in duplicates.items():
    print(f"{g}: {c} entries")

Duplicated gene IDs:
MDP0000025650: 2 entries
MDP0000301110: 2 entries
MDP0000575784: 2 entries


In [6]:
# file: count_single_splice_variants.py
# Usage (in notebook):
#   path = "apple.genes"
#   summary = count_single_splice_variants(path)
#   summary

from collections import Counter
from dataclasses import dataclass
from typing import Dict, List, Tuple


@dataclass(frozen=True)
class SpliceVariantSummary:
    """Immutable result record returned by count_single_splice_variants()."""

    # Number of data rows read (blank and "#" comment lines excluded).
    total_records: int
    # Number of distinct first-column IDs among those rows.
    unique_transcript_ids: int
    # Number of distinct IDs after _gene_root() collapses ".N" suffixes.
    unique_gene_ids: int
    # Gene roots that appear in exactly one row.
    single_variant_gene_count: int
    # Gene roots that appear in more than one row.
    multi_variant_gene_count: int
    # Up to 20 (gene_id, num_variants) pairs, highest count first, ties by ID.
    multi_variant_examples: List[Tuple[str, int]]


def _gene_root(transcript_or_gene_id: str) -> str:
    """
    If IDs look like GENE.1, GENE.2 (isoforms), collapse to GENE.
    Otherwise returns the ID unchanged.
    """
    return transcript_or_gene_id.split(".", 1)[0]


def count_single_splice_variants(genes_path: str) -> SpliceVariantSummary:
    """
    Summarise how many genes in an apple.genes-like table have one record
    versus several.

    The first whitespace-delimited column of every data row is taken as the ID.
    IDs are grouped by _gene_root(), so isoform-style suffixes (.1, .2, ...)
    fold into the same gene. Blank lines and lines starting with "#" are skipped.

    Args:
        genes_path: Path of the table to read (opened as UTF-8 text).

    Returns:
        A SpliceVariantSummary holding record/ID totals, the single- and
        multi-variant gene counts, and up to 20 multi-variant examples sorted
        by descending count and then by ID.
    """
    ids: List[str] = []

    with open(genes_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            record = raw.strip()
            if record.startswith("#"):
                continue
            fields = record.split()
            if fields:  # an empty record yields no fields and is dropped here
                ids.append(fields[0])

    variants_per_gene: Counter[str] = Counter(map(_gene_root, ids))

    # Split the genes into the single-record and multi-record groups in one pass.
    singles = 0
    multis: List[Tuple[str, int]] = []
    for gene, n_variants in variants_per_gene.items():
        if n_variants == 1:
            singles += 1
        else:
            multis.append((gene, n_variants))

    # Largest groups first; ties are broken alphabetically by gene ID.
    multis.sort(key=lambda pair: (-pair[1], pair[0]))

    return SpliceVariantSummary(
        total_records=len(ids),
        unique_transcript_ids=len(set(ids)),
        unique_gene_ids=len(variants_per_gene),
        single_variant_gene_count=singles,
        multi_variant_gene_count=len(multis),
        multi_variant_examples=multis[:20],
    )


# --- Example (notebook) ---
# Summarise "apple.genes" (path is resolved against the current working
# directory) and print the resulting dataclass repr.
summary = count_single_splice_variants("apple.genes")
print(summary)

SpliceVariantSummary(total_records=5456, unique_transcript_ids=5453, unique_gene_ids=5453, single_variant_gene_count=5450, multi_variant_gene_count=3, multi_variant_examples=[('MDP0000025650', 2), ('MDP0000301110', 2), ('MDP0000575784', 2)])


In [3]:
# gene_id -> strand symbol taken from the 4th tab-separated column
gene_strand = {}

with open("apple.genes") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("#"):
            continue

        cols = line.split("\t")
        if len(cols) < 4:
            continue  # skip malformed lines that have no strand column

        gene_id = cols[0]
        strand = cols[3]

        # Store the strand once per gene: the first record seen for a gene
        # wins, so later rows for the same gene ID cannot overwrite it.
        gene_strand.setdefault(gene_id, strand)

# Count genes per strand
plus_genes = sum(1 for s in gene_strand.values() if s == "+")
minus_genes = sum(1 for s in gene_strand.values() if s == "-")

print("Genes on + strand:", plus_genes)
print("Genes on - strand:", minus_genes)

Genes on + strand: 2662
Genes on - strand: 2791


In [4]:
from collections import defaultdict

gene_sets_by_chr = defaultdict(set)         # chr -> set(gene_id)
transcript_sets_by_chr = defaultdict(set)   # chr -> set(transcript_id)
records_by_chr = defaultdict(int)           # chr -> number of lines/records

with open("apple.genes", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith("#"):
            continue

        cols = line.split("\t")
        if len(cols) < 3:
            continue  # skip malformed lines

        gene_id = cols[0].strip()
        transcript_id = cols[1].strip()
        chrom = cols[2].strip()

        # Count transcript variants as "records/lines"
        records_by_chr[chrom] += 1

        # Count unique genes
        if gene_id:
            gene_sets_by_chr[chrom].add(gene_id)

        # Optional: count unique transcript IDs (sometimes same as records, sometimes not)
        if transcript_id:
            transcript_sets_by_chr[chrom].add(transcript_id)

# Print summary
print("chromosome\tgenes(unique)\ttranscripts(records)\ttranscripts(unique_ids)")
for chrom in sorted(records_by_chr):
    # Per-row tallies use n_-prefixed names so this loop does not rebind the
    # notebook-level `genes` set to an int.
    n_genes = len(gene_sets_by_chr[chrom])
    n_records = records_by_chr[chrom]
    n_unique_transcripts = len(transcript_sets_by_chr[chrom])
    print(f"{chrom}\t{n_genes}\t{n_records}\t{n_unique_transcripts}")

chromosome	genes(unique)	transcripts(records)	transcripts(unique_ids)
chr1	1624	1625	1625
chr2	2058	2059	2059
chr3	1771	1772	1772


In [16]:
# Collect the first whitespace-delimited token of every non-blank line.
with open("apple.conditionA") as f:
    genesA = {row.split()[0] for row in f if row.strip()}

with open("apple.conditionB") as f:
    genesB = {row.split()[0] for row in f if row.strip()}

# IDs that appear in both condition files.
common = genesA.intersection(genesB)
print("Genes in common:", len(common))

Genes in common: 2410


In [13]:
import re

# Change this to True if your IDs have versions like MDP0000123456.1
STRIP_VERSION_SUFFIX = True

gene_id_pattern = re.compile(r"MDP\d{10}")  # matches IDs like MDP0000303933

def extract_gene_id(line: str) -> str | None:
    line = line.strip()
    if not line or line.startswith("#"):
        return None

    # Split on ANY whitespace or tabs (handles tab or space-delimited files)
    cols = re.split(r"\s+", line)

    # Try to find a token that looks like an MDP gene ID
    for token in cols[:5]:  # gene IDs usually appear early
        m = gene_id_pattern.fullmatch(token)
        if m:
            gene = token
            if STRIP_VERSION_SUFFIX:
                gene = gene.split(".", 1)[0]
            return gene

    # Fallback: try first column anyway
    gene = cols[0]
    if STRIP_VERSION_SUFFIX:
        gene = gene.split(".", 1)[0]
    return gene if gene else None

def load_unique_genes(filename: str) -> set[str]:
    """
    Read a UTF-8 text file and return the distinct gene IDs found in it.

    Each line is passed to extract_gene_id(); lines for which it returns a
    falsy value (None or an empty string) contribute nothing.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        return {gene for gene in map(extract_gene_id, handle) if gene}

# Load the unique gene IDs of the three condition files.
A, B, C = map(
    load_unique_genes,
    ("apple.conditionA", "apple.conditionB", "apple.conditionC"),
)

# Genes present in A but absent from B.
# NOTE(review): C is loaded but not subtracted here; confirm whether
# "exclusive" is meant relative to B only or to B and C together.
exclusive_A = A.difference(B)

print("Unique genes in A:", len(A))
print("Unique genes in B:", len(B))
print("Unique genes in C:", len(C))
print("Exclusive unique genes in A:", len(exclusive_A))

Unique genes in A: 3615
Unique genes in B: 3653
Unique genes in C: 3676
Exclusive unique genes in A: 1205


In [10]:
# Show the 20 alphabetically-first genes from the exclusive set
sample = sorted(exclusive_A)[:20]
print("Sample exclusive genes:", sample)

# Spot-check the first ten against B and C to rule out ID-format mismatches
for g in sample[:10]:
    found_in_b = g in B
    found_in_c = g in C
    print(g, "in B?", found_in_b, "in C?", found_in_c)

Sample exclusive genes: ['MDP0000022758', 'MDP0000047696', 'MDP0000089058', 'MDP0000119184', 'MDP0000121700', 'MDP0000123910', 'MDP0000124015', 'MDP0000125333', 'MDP0000126153', 'MDP0000126191', 'MDP0000126549', 'MDP0000127019', 'MDP0000128463', 'MDP0000129827', 'MDP0000130988', 'MDP0000132162', 'MDP0000132294', 'MDP0000133269', 'MDP0000133516', 'MDP0000133671']
MDP0000022758 in B? False in C? False
MDP0000047696 in B? False in C? False
MDP0000089058 in B? False in C? False
MDP0000119184 in B? False in C? False
MDP0000121700 in B? False in C? False
MDP0000123910 in B? False in C? False
MDP0000124015 in B? False in C? False
MDP0000125333 in B? False in C? False
MDP0000126153 in B? False in C? False
MDP0000126191 in B? False in C? False


In [12]:
import re

# When True, a trailing version such as .1 or .2 is removed from gene IDs
STRIP_VERSION_SUFFIX = True
# MDP + 10 digits, optionally followed by a ".N" version. The suffix has to be
# part of the pattern because it is applied with fullmatch(); otherwise a
# versioned token can never match.
gene_id_pattern = re.compile(r"MDP\d{10}(?:\.\d+)?")

def extract_gene_id(line):
    """
    Extract a gene ID from one line of a whitespace-delimited table.

    Scans the first five tokens for one that fully matches gene_id_pattern and
    falls back to the first token when none does. With STRIP_VERSION_SUFFIX
    enabled, the text from the first "." onward is dropped.

    Returns None for blank lines, "#" comment lines, and tokens that end up
    empty.
    """
    line = line.strip()
    if not line or line.startswith("#"):
        return None

    cols = re.split(r"\s+", line)

    # First MDP-looking token among the leading columns, else column one.
    gene = next(
        (token for token in cols[:5] if gene_id_pattern.fullmatch(token)),
        cols[0],
    )
    if STRIP_VERSION_SUFFIX:
        gene = gene.split(".", 1)[0]
    return gene if gene else None

def load_unique_genes(filename):
    """
    Return the set of gene IDs that extract_gene_id() yields for the lines of
    the given UTF-8 text file; lines producing a falsy result are ignored.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        extracted = (extract_gene_id(raw) for raw in handle)
        return {gene for gene in extracted if gene}

genes_A, genes_B, genes_C = map(
    load_unique_genes,
    ("apple.conditionA", "apple.conditionB", "apple.conditionC"),
)

# Genes present in B but absent from A.
# NOTE(review): genes_C is loaded but not subtracted; confirm whether
# "exclusive to B" should also exclude genes seen in C.
exclusive_B = genes_B.difference(genes_A)

print("Number of unique genes exclusive to condition B:", len(exclusive_B))

Number of unique genes exclusive to condition B: 1243


In [19]:
def _first_column_ids(path):
    """Return the set of first whitespace-delimited tokens of the non-blank lines in `path`."""
    # The with-block closes the file as soon as the set is built; a bare
    # open() inside a comprehension leaves the handle open until it is
    # garbage-collected.
    with open(path) as handle:
        return {line.split()[0] for line in handle if line.strip()}


A = _first_column_ids("apple.conditionA")
B = _first_column_ids("apple.conditionB")
C = _first_column_ids("apple.conditionC")

# IDs present in every one of the three condition files.
common_all = A & B & C
print("Genes common to all three conditions:", len(common_all))

Genes common to all three conditions: 1608


In [None]:
# Done: end of analysis.