In [5]:
# Requires the `ancseq` kernel (the conda environment in which ancseq is installed)

from pathlib import Path

# Project directory layout. Every location is derived from ROOT, which is
# relative to the directory the notebook is run from.
ROOT = Path("..")

# Top-level project folders
DATA = ROOT.joinpath("data")
LOGS = ROOT.joinpath("logs")
SCRIPTS = ROOT.joinpath("scripts")
RESULTS = ROOT.joinpath("results")

# Sub-folders of RESULTS, one per kind of output
ALIGN_DIR = RESULTS.joinpath("align")
TREE_DIR = RESULTS.joinpath("trees")
FIGURES = RESULTS.joinpath("figures")
ANCESTORS = RESULTS.joinpath("ancestors")

**Note:**  
For long-running jobs (like `ancseq`), do **not** run them in a Jupyter notebook if you are worried about losing your connection.  
Instead, use a terminal multiplexer such as `tmux` or `screen` in a regular terminal:

1. Open a terminal.
2. Start a tmux session: `tmux`
3. Activate your environment and run your command:
   ```
   conda activate ancseq
   ancseq -s ../results/align/combined_plus_hasegawa24_ALN.fasta -m AA -o ../results/ancestors/cph24
   ```
4. Detach from tmux with `Ctrl+b` then `d`.

This ensures your job continues running even if you disconnect.

In [None]:
import subprocess

from Bio import SeqIO

# Paths
HITS = ALIGN_DIR / "pumphits_ALN.fasta"
ANCESTOR_FASTA = ANCESTORS / "cph24/30_result/ancestral_state_result.fasta"
FINAL = DATA / "final_with_ancestors.fasta"             # hits + selected ancestors (MAFFT input)
ALN_FASTA = ALIGN_DIR / "final_with_ancestors_ALN.fasta"  # MAFFT output

# Ancestor node names to extract, with the clade each one represents
Ancestors = (
    "Node36",   # root in a midpoint rooted tree
    "Node73",   # ancestor of PR sequences
    "Node37",   # ancestor of all GR + NaR
    "Node6",    # ancestor of all CyHR
    "Node87",   # ancestor of all XeR
    "Node4",    # ancestor of all YCyR + GCyR + BR
    "Node117",  # ancestor of BR
    "Node38",   # ancestor of GR
    "Node40",   # ancestor within GR clade
    "Node44",   # ancestor within GR clade
)

# Read original alignment
aln_records = list(SeqIO.parse(HITS, "fasta"))

# Read ancestor sequences, keeping only the requested nodes
# (records come out in file order, not in the order listed in Ancestors)
ancestor_records = [rec for rec in SeqIO.parse(ANCESTOR_FASTA, "fasta") if rec.id in Ancestors]

# Fail fast if a requested node is absent: silently dropping an ancestor
# would produce a combined FASTA that looks fine but is incomplete.
missing_nodes = sorted(set(Ancestors) - {rec.id for rec in ancestor_records})
if missing_nodes:
    raise ValueError(
        f"Ancestor node(s) not found in {ANCESTOR_FASTA}: {', '.join(missing_nodes)}"
    )

# Concatenate and write to FINAL
with open(FINAL, "w") as out_handle:
    SeqIO.write(aln_records + ancestor_records, out_handle, "fasta")

print(f"Combined alignment and ancestor sequences written to {FINAL}")

# Align with MAFFT. subprocess.run(check=True) raises if MAFFT is missing or
# exits non-zero, so a failed run cannot be mistaken for a finished alignment.
with open(ALN_FASTA, "w") as aln_handle:
    subprocess.run(
        ["mafft", "--auto", "--thread", "-1", "--quiet", str(FINAL)],
        stdout=aln_handle,
        check=True,
    )
print("Aligned ->", ALN_FASTA)

Combined alignment and ancestor sequences written to ../data/final_with_ancestors.fasta
Aligned -> ../results/align/final_with_ancestors_ALN.fasta


#### Repeat using TM-COFFEE Alignment

### Run `ancseq` on the TM-Coffee alignment
For long-running jobs (like `ancseq`), do **not** run them in a Jupyter notebook if you are worried about losing your connection.  
Instead, use a terminal multiplexer such as `tmux` or `screen` in a regular terminal:

1. Open a terminal.
2. Start a tmux session: `tmux`
3. Activate your environment and run your command:
   ```
   conda activate ancseq
   ancseq -s ../results/align/tmcoffee/result.fasta_aln -m AA -o ../results/ancestors/cph24_tcoffee
   ```
4. Detach from tmux with `Ctrl+b` then `d`.

This ensures your job continues running even if you disconnect.

In [7]:
from Bio import SeqIO

# Paths
HITS = DATA / "pumphitsM.fasta"
ANCESTOR_FASTA = ANCESTORS / "cph24_tcoffee/30_result/ancestral_state_result.fasta"
FINAL = DATA / "final_with_ancestors_tcoffee.fasta"  # hits + selected ancestors

# Ancestor node names to extract from the TM-Coffee-based reconstruction.
# NOTE(review): these node numbers differ from the ones used for the MAFFT-based
# run; the clade each node represents is not recorded here - TODO document.
Ancestors = ("Node36", "Node6", "Node87", "Node5",
             "Node116", "Node45", "Node37", "Node44",
             "Node41", "Node38")

# Read the hit sequences
aln_records = list(SeqIO.parse(HITS, "fasta"))

# Read ancestor sequences, keeping only the requested nodes
# (records come out in file order, not in the order listed in Ancestors)
ancestor_records = [rec for rec in SeqIO.parse(ANCESTOR_FASTA, "fasta") if rec.id in Ancestors]

# Fail fast if a requested node is absent: silently dropping an ancestor
# would produce a combined FASTA that looks fine but is incomplete.
missing_nodes = sorted(set(Ancestors) - {rec.id for rec in ancestor_records})
if missing_nodes:
    raise ValueError(
        f"Ancestor node(s) not found in {ANCESTOR_FASTA}: {', '.join(missing_nodes)}"
    )

# Concatenate and write to FINAL
with open(FINAL, "w") as out_handle:
    SeqIO.write(aln_records + ancestor_records, out_handle, "fasta")

print(f"Combined alignment and ancestor sequences written to {FINAL}")

Combined alignment and ancestor sequences written to ../data/final_with_ancestors_tcoffee.fasta


#### Visual inspection of the alignment revealed sequences to remove

- `BRhit__Halobacterium_salinarum__UniRef90_UPI0000110B77`: identical to a sequence already in the alignment but missing several amino acids, so it is simply removed.

- `GCyR2hit__Pseudanabaenaceae_cyanobacterium_LEGE_13415__UniRef90_A0A928YZN9`: another sequence from the same species is very similar but not truncated.

- `GRhit__Chamaesiphon_sp__UniRef90_UPI0035935AE6`: other sequences from the same species are highly similar but not truncated.

In [1]:
from Bio import SeqIO

# Sequence IDs to drop from the combined FASTA.
remove_ids = [
    # identical to another sequence in the alignment but missing several amino acids
    "BRhit__Halobacterium_salinarum__UniRef90_UPI0000110B77",
    # a very similar, non-truncated sequence from the same species is kept
    "GCyR2hit__Pseudanabaenaceae_cyanobacterium_LEGE_13415__UniRef90_A0A928YZN9",
    # highly similar, non-truncated sequences from the same species are kept
    "GRhit__Chamaesiphon_sp__UniRef90_UPI0035935AE6",
]

# Culled output path, then the combined hits + ancestors FASTA that is read.
output_fasta = "../data/final_with_ancestors_tcoffee_culled.fasta"
input_fasta = "../data/final_with_ancestors_tcoffee.fasta"

def normalize_id(s):
    """Return `s` in a canonical form for ID comparison.

    Surrounding whitespace is trimmed, then leading/trailing double quotes,
    then leading/trailing single quotes; finally every remaining whitespace
    character (including interior ones) is removed.
    """
    trimmed = s.strip()
    # Order matters: double quotes are peeled off before single quotes.
    for quote_char in ('"', "'"):
        trimmed = trimmed.strip(quote_char)
    pieces = trimmed.split()
    return "".join(pieces)

# Normalized form of every ID scheduled for removal, for O(1) membership tests
remove_ids_normalized = {normalize_id(x) for x in remove_ids}

records = []            # records kept, in input order
removed_count = 0       # number of records dropped (counts every matching record)
removed_seen = set()    # normalized IDs that matched at least one record

with open(input_fasta) as in_handle:
    for record in SeqIO.parse(in_handle, "fasta"):
        rec_id_norm = normalize_id(record.id)
        if rec_id_norm in remove_ids_normalized:
            print(f"Removed: {record.id!r} (normalized: {rec_id_norm!r})")
            removed_count += 1
            removed_seen.add(rec_id_norm)
        else:
            records.append(record)

# A removal ID that matched nothing is most likely a typo or a stale name;
# report it instead of letting the cull silently do less than requested.
for unmatched_id in sorted(remove_ids_normalized - removed_seen):
    print(f"WARNING: requested removal ID not found in {input_fasta}: {unmatched_id!r}")

with open(output_fasta, "w") as out_handle:
    SeqIO.write(records, out_handle, "fasta")

print(f"Filtered FASTA written to {output_fasta}")
print(f"Total sequences removed: {removed_count}")
# NOTE: no de-duplication is performed here; the count is simply the number of
# records kept. The message text is left unchanged to match recorded output.
print(f"Total unique sequences retained: {len(records)}")

Removed: 'BRhit__Halobacterium_salinarum__UniRef90_UPI0000110B77' (normalized: 'BRhit__Halobacterium_salinarum__UniRef90_UPI0000110B77')
Removed: 'GCyR2hit__Pseudanabaenaceae_cyanobacterium_LEGE_13415__UniRef90_A0A928YZN9' (normalized: 'GCyR2hit__Pseudanabaenaceae_cyanobacterium_LEGE_13415__UniRef90_A0A928YZN9')
Removed: 'GRhit__Chamaesiphon_sp__UniRef90_UPI0035935AE6' (normalized: 'GRhit__Chamaesiphon_sp__UniRef90_UPI0035935AE6')
Filtered FASTA written to ../data/final_with_ancestors_tcoffee_culled.fasta
Total sequences removed: 3
Total unique sequences retained: 101
