In [1]:
import pandas as pd
import requests



In [2]:
# Load the tab-separated table of gene-symbol pairs into a DataFrame.
general_binding_tsv = "./UniProt/generalProteinBinding.tsv"
general_protein_df = pd.read_csv(general_binding_tsv, delimiter="\t")

In [3]:
# Display the loaded table (pandas truncates the rendering of long frames).
general_protein_df

Unnamed: 0,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B
0,MAP2K4,FLNC
1,MYPN,ACTN2
2,ACVR1,FNTA
3,GATA2,PML
4,RPA2,STAT3
...,...,...
1215116,TAZ,POLR2A
1215117,POLR2A,TAZ
1215118,ERBB2,CTNNB1
1215119,CTNNB1,ERBB2


In [4]:
# Draw a small random subset of pairs so the per-gene UniProt lookups stay cheap.
# A fixed seed makes the subset (and everything derived from it) reproducible
# across "Restart Kernel -> Run All".
num_samples = 15
RANDOM_SEED = 42
# NOTE: this rebinds general_protein_df to the sample, so re-running the cell
# samples from the previous sample rather than from the full table.
general_protein_df = general_protein_df.sample(n=num_samples, random_state=RANDOM_SEED)
general_protein_df

Unnamed: 0,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B
521621,APEX1,MAPK7
299027,TRRAP,BRINP1
150179,HCVgp1,NAP1L1
379490,GRAMD1B,FNDC3A
546651,CALCOCO1,CWF19L2
232725,KIR2DS2,IGHA1
660575,CIT,FXR1
513150,MGST1,SCCPDH
191948,STUB1,HSPA1B
703548,CIC,SMARCC1


In [5]:
# Value used for the `organism_id` filter of the UniProt query;
# 9606 is the NCBI Taxonomy identifier for Homo sapiens.
ORGANISM_HUMAN_ID = 9606

import requests

def fetch_uniprot_sequence(gene_symbol, timeout=30):
    """Return the amino-acid sequence of the first UniProtKB hit for a gene symbol.

    Queries the UniProtKB ``stream`` endpoint for entries whose gene name
    matches ``gene_symbol`` exactly and whose organism is ``ORGANISM_HUMAN_ID``,
    requests the result as FASTA, and extracts the sequence of the first record.

    Parameters
    ----------
    gene_symbol : str
        Gene symbol to look up (inserted verbatim into the query string).
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled connection would block
        the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        The sequence of the first FASTA record with line breaks removed, or
        the string ``"No sequences found"`` when the response contains no
        record, or ``"Error: <status>"`` for a non-200 HTTP status. Callers
        must check for these sentinel strings before treating the value as a
        sequence.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    url = "https://rest.uniprot.org/uniprotkb/stream"
    params = {
        "query": f'(gene_exact:"{gene_symbol}" AND organism_id:{ORGANISM_HUMAN_ID})',
        "fields": "sequence",
        "format": "fasta",
    }

    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        return f"Error: {response.status_code}"

    # Parse line by line: a record starts at a line that *begins* with '>'.
    # Splitting the whole text on '>' would cut a header in two whenever its
    # description contains that character (e.g. "A->B") and lose the sequence.
    sequence_lines = []
    in_first_record = False
    for line in response.text.splitlines():
        if line.startswith(">"):
            if in_first_record:
                # Second header reached: keep only the first record.
                break
            in_first_record = True
        elif in_first_record:
            sequence_lines.append(line.strip())

    if not in_first_record:
        return "No sequences found"
    return "".join(sequence_lines)

# Manual smoke test, kept as comments: a triple-quoted string here would be
# evaluated as the cell's last expression and echoed as cell output.
# gene_symbol = "MAP2K4"
# sequence = fetch_uniprot_sequence(gene_symbol)
# print(sequence)



'\n# for testing\ngene_symbol = "MAP2K4"\nsequence = fetch_uniprot_sequence(gene_symbol)\nprint(sequence)  \n'

In [6]:
# Accumulator for (sequence A, sequence B) tuples, one per sampled pair.
proteins = []

In [7]:


# Fetch both sequences for every sampled pair (symbol A first, then symbol B).
# The list is rebuilt from scratch in this cell so that re-running it does not
# append a second copy of every pair to results left over from an earlier run.
# NOTE(review): fetch_uniprot_sequence reports failures as plain strings
# ("No sequences found", "Error: <status>"); those values are stored here as
# if they were sequences -- consider filtering them before saving.
proteins = [
    (fetch_uniprot_sequence(symbol_a), fetch_uniprot_sequence(symbol_b))
    for symbol_a, symbol_b in zip(
        general_protein_df["OFFICIAL_SYMBOL_A"],
        general_protein_df["OFFICIAL_SYMBOL_B"],
    )
]



In [8]:
# Tabulate the fetched sequence pairs: one row per pair, one column per partner.
aa_column_names = ["Protein 1 AA", "Protein 2 AA"]
proteins_AA_df = pd.DataFrame(data=proteins, columns=aa_column_names)


In [9]:
# Display the sequence-pair table for a visual check before saving.
proteins_AA_df

Unnamed: 0,Protein 1 AA,Protein 2 AA
0,MPKRGKKGAVAEDGDELRTEPEAKKSKTAAKKNDKEAAGEGPALYE...,MAEPLKEEDGEDGSAEPPGPVKAEPAHTAASVAAKNLALLKARSFD...
1,MAFVATQGATVVDQTTLMKKYLQFVAALTDVNTPDETKLKMMQEVS...,MNWRFVELLYFLFIWGRISVQPSHQEPAGTDQHVSKEFDWLISDRG...
2,No sequences found,MADIDNKEQSELDQDLDDVEEVEEEETGEETKLKARQLTVQMMQNP...
3,MKGFKLSCTASNSNRSTPACSPILRKRSRSPTPQNQDGDTMVEKGS...,MAEHPPLLDTTQILSSDISLLSAPIVSADGTQQVILVQVNPGEAFT...
4,MEESPLSRAPSRGGVNFLNVARTYIPNTKVECHYTLPPGTMPSASD...,MATSMAAASGRFESAKSIEERKEQTRNARAEVLRQAKANFEKEERR...
5,MSLMVVSMACVGFFLLQGAWPHEGVHRKPSLLAHPGPLVKSEETVI...,ASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQ...
6,MLKFKYGARNPLDAGAAEPIASRASRLNLFFQGKPPFMTQQQMSPL...,MAELTVEVRGSNGAFYKGFIKDVHEDSLTVVFENNWQPERQVPFNE...
7,MVDLTQVMDDEVFMAFASYATIILSKMMLMSTATAFYRLTRKVFAN...,MATEQRPFHLVVFGASGFTGQFVTEEVAREQVDPERSSRLPWAVAG...
8,MKGKEEKEGGARLGAGGGSPEKSPSAQELKEQGNRLFVGRKYPEAA...,MAKAAAIGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFTD...
9,MYSAHRPLMPASSAASRGLGMFVWTNVEPRSVAVFPWHSLVPFLAP...,MAAAAGGGGPGTAVGATGSGIAAAAAGLAVYRRKDGGPATKFWESP...


In [10]:
# Output location; the two parts are concatenated as to_path + file_name,
# so to_path must keep its trailing slash.
to_path = "./GeneralProteinBinding/"
file_name = "general_proteins.tsv"

In [11]:
# Write the pairs as tab-separated values, matching the .tsv file name.
# (A newline separator would put the two sequences of a pair on separate
# lines, making rows indistinguishable from columns when reading back.)
proteins_AA_df.to_csv(to_path+file_name, sep="\t", index=False)