# Notebook to find cyanobacterial opsins similar to GR

This notebook submits a **BLASTP** job to the EMBL-EBI web service against **UniRef90**, polls for completion, parses the plain-text BLAST report for UniRef90 cluster IDs, and fetches the top hits as FASTA sequences. It uses the GR opsin as bait.

In [34]:
import requests, time, os
from pathlib import Path
from Bio import SeqIO
import re

print('Packages imported — ready!')

Packages imported — ready!


In [35]:
# Helper functions for the BLAST workflow
def submit_blast(sequence, email, db='uniref90', program='blastp', timeout=60):
    """Submit a protein BLAST job to the EMBL-EBI ``ncbiblast`` REST service.

    Parameters
    ----------
    sequence : str
        Query protein sequence. Every whitespace character (line breaks,
        carriage returns, tabs, spaces) and every ``*`` is removed before
        the sequence is sent.
    email : str
        Contact address sent in the ``email`` form field.
    db : str, optional
        Value of the ``database`` form field.
    program : str, optional
        Value of the ``program`` form field.
    timeout : float, optional
        Seconds to wait for the HTTP request; without it a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    str
        The stripped response body, which the notebook uses as the job ID.

    Raises
    ------
    ValueError
        If nothing is left of ``sequence`` after cleaning.
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    url = 'https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/run'
    # split()/join drops *all* whitespace, so "\r\n" line endings and spaces
    # inside the sequence are removed too, not just "\n".
    clean_seq = "".join(sequence.split()).replace("*", "")
    if not clean_seq:
        raise ValueError("sequence is empty after removing whitespace and '*'")
    data = {
        'email': email,
        'sequence': clean_seq,
        'stype': 'protein',
        'database': db,
        'program': program,
        # NOTE(review): the three values below are passed through unchanged;
        # presumably max alignments reported, E-value cutoff and the
        # low-complexity filter switch — confirm against the service docs.
        'alignments': 500,
        'exp': 100,
        'filter': 'F'
    }
    response = requests.post(url, data=data, timeout=timeout)
    response.raise_for_status()
    return response.text.strip()
def get_results(job_id, result_type='out', timeout=60):
    """Download one result document of a finished ``ncbiblast`` job.

    Parameters
    ----------
    job_id : str
        Job identifier as returned by the submission call.
    result_type : str, optional
        Last path segment of the result URL; the default ``'out'`` is what
        the notebook uses to obtain the plain-text report.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    str
        The response body, unmodified.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    url = f"https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/result/{job_id}/{result_type}"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

def check_status(job_id: str, timeout: float = 30) -> str:
    """Return the current status string of an ``ncbiblast`` job.

    Parameters
    ----------
    job_id : str
        Job identifier as returned by the submission call.
    timeout : float, optional
        Seconds to wait for the HTTP request. This function is called
        repeatedly from a polling loop, so a single stalled request must
        not be able to block it forever.

    Returns
    -------
    str
        The response body with surrounding whitespace removed
        (the notebook compares it against ``"FINISHED"`` and ``"ERROR"``).

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    url = f'https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/status/{job_id}'
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text.strip()

def get_tabular_results(job_id: str, timeout: float = 60):
    """Download the ``tab`` result document of an ``ncbiblast`` job.

    Parameters
    ----------
    job_id : str
        Job identifier as returned by the submission call.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    str or None
        The response body, or ``None`` when the service answers with
        HTTP 400.

    Raises
    ------
    requests.HTTPError
        For any HTTP error status other than 400.
    """
    url = f'https://www.ebi.ac.uk/Tools/services/rest/ncbiblast/result/{job_id}/tab'
    r = requests.get(url, timeout=timeout)
    if r.status_code == 400:
        # Treated as "nothing to return" rather than an error; per the
        # original author's note this most likely means the search had no hits.
        return None
    r.raise_for_status()
    return r.text


def parse_uniref_ids_from_out(text_result, max_hits=200):
    """Extract unique UniRef90 cluster IDs from a plain-text BLAST report.

    A line contributes an ID when it starts with ``>UR90:UniRef90_`` followed
    by at least six uppercase letters or digits. IDs are returned in order of
    first appearance; repeated IDs are ignored.

    Parameters
    ----------
    text_result : str
        Full text of the BLAST report.
    max_hits : int, optional
        Maximum number of IDs to return. A value of zero or less yields an
        empty list.

    Returns
    -------
    list of str
        IDs of the form ``UniRef90_<accession>``, at most ``max_hits`` long.
    """
    cluster_ids = []
    # Check the limit up front: testing it only after an append would let one
    # ID through even when the caller asked for none.
    if max_hits <= 0:
        return cluster_ids

    seen = set()
    for line in text_result.splitlines():
        # re.match anchors at the start of the line, so this single pattern
        # also covers the ">UR90:UniRef90_" prefix requirement.
        match = re.match(r">UR90:(UniRef90_[A-Z0-9]{6,})", line)
        if match is None:
            continue
        uid = match.group(1)
        if uid in seen:
            continue
        seen.add(uid)
        cluster_ids.append(uid)
        if len(cluster_ids) >= max_hits:
            break
    return cluster_ids

def fetch_fasta(uid, timeout=60):
    """Download the FASTA text for one UniRef entry from the UniProt REST API.

    Parameters
    ----------
    uid : str
        UniRef identifier inserted into the request URL
        (e.g. ``UniRef90_Q7NP59``).
    timeout : float, optional
        Seconds to wait for the HTTP request. The notebook calls this once
        per hit, so one stalled request must not block the whole download.

    Returns
    -------
    str or None
        The response body, or ``None`` (after printing a warning) when the
        server answers with HTTP 404.

    Raises
    ------
    requests.HTTPError
        For any HTTP error status other than 404.
    """
    url = f'https://rest.uniprot.org/uniref/{uid}.fasta'
    r = requests.get(url, timeout=timeout)
    if r.status_code == 404:
        # A missing entry is reported and skipped so the remaining IDs can
        # still be fetched.
        print(f"⚠️  Skipping invalid UniRef cluster ID: {uid}")
        return None
    r.raise_for_status()
    return r.text


def write_fasta(seqs, out_path):
    """Write FASTA-formatted text records to a single file.

    Parameters
    ----------
    seqs : sequence of str
        FASTA text chunks, each holding one or more complete records.
        Must support ``len()``.
    out_path : str or path-like
        Destination file; it is created or overwritten.

    Notes
    -----
    Each non-empty chunk is guaranteed to end with a newline in the output.
    Without that, a chunk lacking a trailing newline would have the next
    record's ``>`` header appended to its last sequence line, corrupting
    both records.
    """
    with open(out_path, 'w') as fh:
        for s in seqs:
            fh.write(s)
            if s and not s.endswith("\n"):
                fh.write("\n")
    print(f'Wrote {len(seqs)} sequences to {out_path}')

In [36]:
# Parameters — edit these!

# Contact address; the EBI service requires one with every submission.
email = 'oakley@ucsb.edu'

# Path to the query FASTA file.
input_fasta = 'GR.fasta'

# Number of UniRef90 hits to keep.
max_hits = 150

# Folder that receives every file this notebook writes; created if missing.
out_dir = Path('blast_notebook_results')
out_dir.mkdir(exist_ok=True)
print(f'Output directory: {out_dir.resolve()}')

Output directory: /Users/oakley/Documents/GitHub/cyano_rhodopsins/blast_notebook_results


In [37]:
# --- BLASTP against UniRef90: run, poll, and save raw text output ---

# The first record of the query FASTA is the bait. Passing a default to next()
# turns an empty file into a clear error instead of a bare StopIteration.
record = next(SeqIO.parse(input_fasta, "fasta"), None)
if record is None:
    raise ValueError(f"No FASTA records found in {input_fasta}")
print(f"Submitting BLAST for: {record.id}  (length {len(record.seq)} aa)")

job_id = submit_blast(str(record.seq), email)   # uses the helper in earlier cell
print("Job ID:", job_id)

# Poll until finished — but never forever. Any status that cannot lead to
# FINISHED aborts immediately, and an overall deadline bounds the wait.
poll_interval_s = 5
max_wait_s = 60 * 60
failure_statuses = {"ERROR", "FAILURE", "NOT_FOUND"}
deadline = time.monotonic() + max_wait_s   # monotonic: immune to clock changes
while True:
    status = check_status(job_id)               # helper
    print(f"[{time.strftime('%H:%M:%S')}] status: {status}")
    if status == "FINISHED":
        break
    if status in failure_statuses:
        raise RuntimeError(f"BLAST job failed (status: {status})")
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"BLAST job {job_id} still {status} after {max_wait_s} s"
        )
    time.sleep(poll_interval_s)

# Retrieve raw BLAST text output
text_result = get_results(job_id, result_type="out")  # plain text
print(text_result[:1000])  # preview first 1 000 chars

# Save to disk so the parsing step can be re-run without a new BLAST search
blast_txt_path = out_dir / f"{record.id}_uniref90.blast.out"
blast_txt_path.write_text(text_result)
print(f"Full BLAST output saved ➜  {blast_txt_path}")


Submitting BLAST for: BAC88139.1  (length 298 aa)
Job ID: ncbiblast-R20250619-013645-0617-23440871-p1m
[17:36:49] status: RUNNING
[17:36:56] status: RUNNING
[17:37:03] status: RUNNING
[17:37:10] status: RUNNING
[17:37:17] status: RUNNING
[17:37:24] status: RUNNING
[17:37:31] status: RUNNING
[17:37:39] status: RUNNING
[17:37:45] status: RUNNING
[17:37:52] status: RUNNING
[17:37:59] status: RUNNING
[17:38:07] status: RUNNING
[17:38:14] status: RUNNING
[17:38:23] status: RUNNING
[17:38:30] status: RUNNING
[17:38:37] status: RUNNING
[17:38:44] status: RUNNING
[17:38:51] status: RUNNING
[17:38:58] status: RUNNING
[17:39:05] status: RUNNING
[17:39:12] status: RUNNING
[17:39:19] status: RUNNING
[17:39:27] status: RUNNING
[17:39:34] status: RUNNING
[17:39:41] status: RUNNING
[17:39:48] status: RUNNING
[17:39:55] status: RUNNING
[17:40:02] status: RUNNING
[17:40:09] status: RUNNING
[17:40:17] status: RUNNING
[17:40:24] status: RUNNING
[17:40:31] status: RUNNING
[17:40:38] status: RUNNING
[17:40

In [38]:
# --- Parse saved BLAST output and fetch top UniRef90 FASTA sequences ---

# max_hits is taken from the parameters cell. It is deliberately not
# reassigned here: a second assignment would silently override whatever value
# was configured there.
text_result = blast_txt_path.read_text()   # Load saved output

uids = parse_uniref_ids_from_out(text_result, max_hits)  # Parse UniRef cluster IDs
print(f"{len(uids)} UniRef IDs retrieved")

if uids:
    seqs = []
    for uid in uids:
        seq = fetch_fasta(uid)             # Returns None on 404
        if seq:
            seqs.append(seq)

    if seqs:
        # The file name records the requested limit (max_hits), which can be
        # larger than the number of sequences actually written.
        out_fasta = out_dir / f"{record.id}_top{max_hits}_uniref90.fasta"
        write_fasta(seqs, out_fasta)       # Save valid FASTA sequences
        print(f"FASTA file written ➜  {out_fasta}")
    else:
        print("All UniRef IDs failed to fetch. No FASTA file written.")
else:
    print("No UniRef IDs found. Nothing to download.")


150 UniRef IDs retrieved
⚠️  Skipping invalid UniRef cluster ID: UniRef90_A0A0Q4FQD5
⚠️  Skipping invalid UniRef cluster ID: UniRef90_A0A6J7GMR2
⚠️  Skipping invalid UniRef cluster ID: UniRef90_A0A0J6VIK5
⚠️  Skipping invalid UniRef cluster ID: UniRef90_UPI001A1BE314
Wrote 146 sequences to blast_notebook_results/BAC88139.1_top150_uniref90.fasta
FASTA file written ➜  blast_notebook_results/BAC88139.1_top150_uniref90.fasta


# Next: run MAFFT to align all the sequences

In [1]:
from pathlib import Path
import subprocess

# Unaligned UniRef sequences produced by the previous step.
fasta_in = Path("blast_notebook_results/BAC88139.1_top150_uniref90.fasta")
# Output sits next to the input, with ".fasta" swapped for ".mafft.fasta".
msa_out = fasta_in.with_suffix(".mafft.fasta")

# The alignment is taken from MAFFT's stdout, which is redirected into
# msa_out. check=True raises CalledProcessError on a non-zero exit status.
cmd = ["mafft", "--auto", str(fasta_in)]
print("Running MAFFT...")
with msa_out.open("w") as outfile:
    subprocess.run(cmd, stdout=outfile, check=True)

print(f"Alignment saved to: {msa_out}")


Running MAFFT...


outputhat23=2
treein = 0
compacttree = 0
stacksize: 8192 kb
rescale = 1
All-to-all alignment.
tbfast-pair (aa) Version 7.525
alg=L, model=BLOSUM62, 2.00, -0.10, +0.10, noshift, amax=0.0
0 thread(s)

outputhat23=2
Loading 'hat3.seed' ... 
done.
Writing hat3 for iterative refinement
rescale = 1
Gap Penalty = -1.53, +0.00, +0.00
tbutree = 1, compacttree = 0
Constructing a UPGMA tree ... 
  140 / 146
done.

Progressive alignment ... 
STEP   143 /145 
Reallocating..done. *alloclen = 1816
STEP   145 /145 
done.
tbfast (aa) Version 7.525
alg=A, model=BLOSUM62, 1.53, -0.00, -0.00, noshift, amax=0.0
1 thread(s)

minimumweight = 0.000010
autosubalignment = 0.000000
nthread = 0
randomseed = 0
blosum 62 / kimura 200
poffset = 0
niter = 2
sueff_global = 0.100000
nadd = 2
Loading 'hat3' ... done.
rescale = 1

  140 / 146
Segment   1/  1    1- 483
done 002-001-1  identical.    identical.    identical.    rejected. identical.    rejected. rejected. identical.    identical.    identical.    identical. 

Alignment saved to: blast_notebook_results/BAC88139.1_top150_uniref90.mafft.fasta


In [5]:
from Bio import AlignIO

# Load the MAFFT alignment written by the previous step.
msa_path = "blast_notebook_results/BAC88139.1_top150_uniref90.mafft.fasta"
alignment = AlignIO.read(msa_path, "fasta")

print(f"Alignment length: {alignment.get_alignment_length()}")
print(f"Number of sequences: {len(alignment)}")

# Preview first 10 aligned sequences (trimmed to 100 columns).
# The loop variable is intentionally not named `record`: that name holds the
# query sequence in the BLAST cell, and overwriting it here would make a later
# re-run of the fetch cell build its output file name from the wrong ID.
for aln_record in alignment[:10]:
    print(f"{aln_record.id[:25]:25} {aln_record.seq[:100]}")



Alignment length: 467
Number of sequences: 146
UniRef90_Q7NP59           M------------------------------------------------------------------LMTVFSSAPELALLGSTFAQVD-------PSNL
UniRef90_A0A969T0G4       --------------------------------------------------------------------MIEVSLAPDFTLLGALFVRGD------IADRL
UniRef90_A0A2W7ARY7       ----------------------------------------------------------------------MISSMPDFALLGSLVDQGD------VLDRL
UniRef90_A0A969FEC7       ---------------------------------------------------------------------MSAILLPDLSLLGAV-AQDD------LLDRL
UniRef90_A0A925M2S9       --------------------------------------------------------------------MFDLSPIPDFALLGALLVQDD------SSQRL
UniRef90_UPI000A475A3D    --------------------------------------------------------------------MLVISLVPDGTLLGAL-TTGD------MSDRL
UniRef90_UPI0035946169    MTTIEDSAFFRKLNLQAILSSVVGTFIFFYSIAQFQISHGFNLQQDSSTNVNGSSCRFKPNPPEQILLMIAVSFISDFPLLGAVLATDD------LPSRV
UniRef90_UPI003592F9D2    MET-----------------------------------

In [3]:
import os

# Show the files produced so far, one name per line.
print(*os.listdir("blast_notebook_results"), sep="\n")


BAC88139.1_top150_uniref90.fasta
BAC88139.1_top150_uniref90.mafft.fasta
BAC88139.1_uniref90.blast.out
