<a href="https://colab.research.google.com/github/wiz124/chem169-git/blob/main/Li_Harry_RID_016_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
#Exercise 0
#!pip install -q biopython
from Bio import SeqIO
import requests
from io import StringIO

Prot_IDs=['P04406','P10599','P68871','P04637']
def getFasta(ID):
    url = f"https://rest.uniprot.org/uniprotkb/{ID}.fasta"
    response = requests.get(url)
    response.raise_for_status()
    fasta_data = StringIO(response.text)

    for seq_record in SeqIO.parse(fasta_data, "fasta"):
      print(len(seq_record.seq))
      return str(seq_record.seq)


query_dict={}
for id in Prot_IDs:
  query_dict[id]=getFasta(id)

print(query_dict)


335
sp|P04406|G3P_HUMAN
105
sp|P10599|THIO_HUMAN
147
sp|P68871|HBB_HUMAN
393
sp|P04637|P53_HUMAN
{'P04406': 'MGKVKVGVNGFGRIGRLVTRAAFNSGKVDIVAINDPFIDLNYMVYMFQYDSTHGKFHGTVKAENGKLVINGNPITIFQERDPSKIKWGDAGAEYVVESTGVFTTMEKAGAHLQGGAKRVIISAPSADAPMFVMGVNHEKYDNSLKIISNASCTTNCLAPLAKVIHDNFGIVEGLMTTVHAITATQKTVDGPSGKLWRDGRGALQNIIPASTGAAKAVGKVIPELNGKLTGMAFRVPTANVSVVDLTCRLEKPAKYDDIKKVVKQASEGPLKGILGYTEHQVVSSDFNSDTHSSTFDAGAGIALNDHFVKLISWYDNEFGYSNRVVDLMAHMASKE', 'P10599': 'MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQDVASECEVKCMPTFQFFKKGQKVGEFSGANKEKLEATINELV', 'P68871': 'MVHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPWTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH', 'P04637': 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSG

In [3]:
#Exercise 1
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Blast import NCBIXML


recordlst=[]
for key,value in query_dict.items():
  record=SeqRecord(Seq(value),id=key)
  recordlst.append(record)
with open('query_proteins.fasta', "w") as output:
  queryfasta = SeqIO.write(recordlst,output, "fasta")

with open("blast_results.xml") as f:
    blast_records = list(NCBIXML.parse(f))  # one record per query protein

alignment_dict={}
for record in blast_records:
    print(f"\n{record.query}")
    print(record.alignments[:10])
    for alignment in record.alignments[:10]:  # top 10 hits
        hsp = alignment.hsps[0]
        uniprot_id = alignment.hit_def.split("|")[1]  # â†’ "P0A9B2"
        evalue=hsp.expect
        percentidentity=hsp.identities / hsp.align_length
        hit_dict={}
        hit_dict[uniprot_id]={
            'e-value':evalue,
            'percent identity':percentidentity
        }
        query_id=record.query.split(' ')[0]
        alignment_dict[query_id]=hit_dict
print(alignment_dict)



P04406 <unknown description>
[<Bio.Blast.NCBIXML.Alignment object at 0x798f1d96b9b0>, <Bio.Blast.NCBIXML.Alignment object at 0x798f1de1be90>]

P10599 <unknown description>
[<Bio.Blast.NCBIXML.Alignment object at 0x798f1da30260>, <Bio.Blast.NCBIXML.Alignment object at 0x798f1da30320>, <Bio.Blast.NCBIXML.Alignment object at 0x798f1da303e0>]

P68871 <unknown description>
[]

P04637 <unknown description>
[]
{'P04406': {'P0A9B6': {'e-value': 1.24916e-82, 'percent identity': 0.375}}, 'P10599': {'P77395': {'e-value': 2.36368e-06, 'percent identity': 0.25274725274725274}}}


In [15]:
#Exercise 2
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import h5py

def compute_similarity(query_embeddings,db_embeddings):

  query_matrix=np.array([query_embeddings[id] for id in Prot_IDs])

  db_ids=list(db_embeddings.keys())
  db_matrix=np.array([db_embeddings[id] for id in db_ids])

  comp=cosine_similarity(query_matrix,db_matrix)
  return comp

def getProteinName(hit_id):
  for record in SeqIO.parse('ecoli.fasta', "fasta"):
    if hit_id in str(record.id):
      return str(record.id).split('|')[2]

def tophits(comparison,sequences,db_embeddings):
  query_order=Prot_IDs
  db_ids=list(db_embeddings.keys())

  result=[]

  for query_idx,id in enumerate(query_order):
    hits=np.argsort(comparison[query_idx])[::-1][:10].tolist()

    for i, idx in enumerate(hits):
      hit_id=db_ids[idx]
      similarity=comparison[query_idx][idx]
      sequence=sequences[hit_id]

      name=getProteinName(hit_id)

      result.append({
          'query': id,
          'rank': i,
          'ref_id': hit_id,
          'name': name,
          'similarity': similarity,
          'sequence': sequence
    })
  return result

with h5py.File('query_proteins.h5', 'r') as file:
    query_embeddings={}
    for name in file.keys():
        query_embeddings[name]=file[name][:]
with h5py.File('per-protein.h5', 'r') as file:
  db_embeddings={}
  for name in file.keys():
    db_embeddings[name]=file[name][:]

similarities=compute_similarity(query_embeddings,db_embeddings)

sequences={}
for record in SeqIO.parse('ecoli.fasta', 'fasta'):
  parse=record.id.split('|')
  if len(parse)>=2:
    uniprot_id=parse[1]
  else:
    uniprot_id=parse[0]
  sequences[uniprot_id]=str(record.seq)

results=tophits(similarities,sequences,db_embeddings)
resultdf=pd.DataFrame(results)
display(resultdf)

Unnamed: 0,query,rank,ref_id,name,similarity,sequence
0,P04406,0,P0A9B2,G3P1_ECOLI,0.92705,MTIKVGINGFGRIGRIVFRAAQKRSDIEIVAINDLLDADYMAYMLK...
1,P04406,1,P61889,MDH_ECOLI,0.821558,MKVAVLGAAGGIGQALALLLKTQLPSGSELSLYDIAPVTPGVAVDL...
2,P04406,2,P0CE48,EFTU2_ECOLI,0.808739,MSKEKFERTKPHVNVGTIGHVDHGKTTLTAAITTVLAKTYGGAARA...
3,P04406,3,P0CE47,EFTU1_ECOLI,0.808287,MSKEKFERTKPHVNVGTIGHVDHGKTTLTAAITTVLAKTYGGAARA...
4,P04406,4,P0AGE9,SUCD_ECOLI,0.777293,MSILIDKNTKVICQGFTGSQGTFHSEQAIAYGTKMVGGVTPGKGGT...
5,P04406,5,P0A6P9,ENO_ECOLI,0.75975,MSKIVKIIGREIIDSRGNPTVEAEVHLEGGFVGMAAAPSGASTGSR...
6,P04406,6,P08200,IDH_ECOLI,0.752024,MESKVVVPAQGKKITLQNGKLNVPENPIIPYIEGDGIGVDVTPAML...
7,P04406,7,P0A817,METK_ECOLI,0.74662,MAKHLFTSESVSEGHPDKIADQISDAVLDAILEQDPKARVACETYV...
8,P04406,8,P0A6F5,CH60_ECOLI,0.746416,MAAKDVKFGNDARVKMLRGVNVLADAVKVTLGPKGRNVVLDKSFGA...
9,P04406,9,P0A6A6,LEUC_ECOLI,0.743258,MAKTLYEKLFDAHVVYEAENETPLLYIDRHLVHEVTSPQAFDGLRA...


In [None]:
#Exercise 3
for key,value in alignment_dict.items():
