In [None]:
import pandas as pd
import requests



In [None]:
# Load the tab-separated table; later cells read its OFFICIAL_SYMBOL_A and
# OFFICIAL_SYMBOL_B columns.
general_protein_path = "../data/UniProt/generalProteinBinding.tsv"
general_protein_df = pd.read_csv(general_protein_path, sep="\t")

In [None]:
# Display the loaded table as the cell output for a quick visual check.
general_protein_df

In [None]:
# Down-sample the table: every remaining row later triggers two UniProt lookups.
num_samples = 15
sample_seed = 42  # fixed seed so Restart & Run All selects the same rows
general_protein_df = general_protein_df.sample(
    # sample() raises ValueError when n exceeds the number of rows, so clamp it.
    n=min(num_samples, len(general_protein_df)),
    random_state=sample_seed,
)
general_protein_df

In [None]:
# Value substituted into the `organism_id:` filter of the UniProt query (human).
ORGANISM_HUMAN_ID = 9606


def fetch_uniprot_sequence(gene_symbol, timeout=30):
    """Fetch the amino-acid sequence of the first UniProt hit for a human gene.

    Queries the UniProtKB streaming endpoint for entries whose exact gene name
    is ``gene_symbol`` and whose organism id is ``ORGANISM_HUMAN_ID``, asks for
    FASTA output, and keeps only the first record of the response.

    Args:
        gene_symbol: Gene symbol inserted into the ``gene_exact`` query term.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        str: The sequence of the first FASTA record with all line breaks
        removed. If the response contains no records the string
        ``"No sequences found"`` is returned; for a non-200 HTTP status the
        string ``"Error: <status code>"`` is returned. Callers that need to
        distinguish real sequences from these messages must check for them.

    Raises:
        Whatever ``requests.get`` raises (e.g. on connection failure or
        timeout); such exceptions are not caught here.
    """
    url = "https://rest.uniprot.org/uniprotkb/stream"
    params = {
        "query": f'(gene_exact:"{gene_symbol}" AND organism_id:{ORGANISM_HUMAN_ID})',
        "fields": "sequence",
        "format": "fasta",
    }

    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    # Every FASTA record starts with '>'; splitting on it leaves an empty
    # fragment in front of the first record, which is filtered out.
    entries = [entry for entry in response.text.strip().split(">") if entry]
    if not entries:
        return "No sequences found"

    # Keep only the first entry. splitlines() (rather than split("\n")) also
    # handles "\r\n" line endings, so no stray "\r" ends up in the sequence.
    record_lines = entries[0].splitlines()
    # The first line of a record is its header; the rest is the sequence.
    return "".join(line.strip() for line in record_lines[1:])


# Manual smoke test (performs a real HTTP request); uncomment to run:
# gene_symbol = "MAP2K4"
# sequence = fetch_uniprot_sequence(gene_symbol)
# print(sequence)



In [None]:
# Collects one (sequence A, sequence B) tuple per interaction row.
proteins = list()

In [None]:


# Start from an empty list so that re-running this cell does not append a
# second copy of every pair.
proteins = []

# Look up both interaction partners of each row; zip walks the two symbol
# columns in row order without building a Series per row as iterrows() does.
for symbol_a, symbol_b in zip(
    general_protein_df["OFFICIAL_SYMBOL_A"],
    general_protein_df["OFFICIAL_SYMBOL_B"],
):
    seq1 = fetch_uniprot_sequence(symbol_a)
    seq2 = fetch_uniprot_sequence(symbol_b)
    proteins.append((seq1, seq2))



In [None]:
# Turn the collected (sequence, sequence) tuples into a two-column frame.
sequence_columns = ["Protein 1 AA", "Protein 2 AA"]
proteins_AA_df = pd.DataFrame(data=proteins, columns=sequence_columns)


In [None]:
# Display the fetched sequence pairs as the cell output for inspection.
proteins_AA_df

In [None]:
# Output directory (with trailing slash) and file name; the export cell
# concatenates the two to form the destination path.
to_path, file_name = "../data/GeneralProteinBinding/", "general_proteins.tsv"

In [None]:
# Write the sequence pairs as a real TSV: columns are separated by tabs. Using
# "\n" as separator would put every field on its own line, making the column
# boundary indistinguishable from the row boundary when the file is read back.
proteins_AA_df.to_csv(to_path + file_name, sep="\t", index=False)