In [20]:
# !pip install biopython pandas requests -q

from io import StringIO
from itertools import islice

import pandas as pd
import requests
from Bio import SeqIO

def fetch_fasta(url: str, label, max_records=25, timeout=60) -> pd.DataFrame:
    """Download a FASTA document and return its first records as a DataFrame.

    Args:
        url: Address that responds with FASTA-formatted text.
        label: Value stored in the ``label`` column for every returned row.
        max_records: Maximum number of FASTA records to keep, counted from
            the start of the response. ``None`` keeps every record. Must not
            be negative.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection fails instead of hanging
            forever.

    Returns:
        A DataFrame with the columns ``uniprot_id`` (the FASTA record id,
        exactly as parsed by ``SeqIO``), ``sequence`` (the sequence as a
        plain string) and ``label``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        ValueError: If ``max_records`` is negative.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # SeqIO.parse is lazy; islice stops it after max_records so the rest of
    # the (potentially very large) response body is never parsed.
    parsed = SeqIO.parse(StringIO(response.text), "fasta")
    records = list(islice(parsed, max_records))

    print(f"✅ Retrieved {len(records)} sequences for label {label}")
    return pd.DataFrame({
        "uniprot_id": [rec.id for rec in records],
        "sequence": [str(rec.seq) for rec in records],
        "label": [label] * len(records),
    })

# UniProtKB streaming endpoints restricted to reviewed entries for organism
# 9606 (human). The two queries differ only in whether the "Membrane" keyword
# is required or excluded; spaces inside the query are pre-encoded as %20.
_QUERY_PREFIX = (
    "https://rest.uniprot.org/uniprotkb/stream?"
    "query=reviewed:true%20AND%20organism_id:9606%20AND%20"
)
_FORMAT_SUFFIX = "&format=fasta"
membrane_url = _QUERY_PREFIX + "keyword:Membrane" + _FORMAT_SUFFIX
soluble_url = _QUERY_PREFIX + "NOT%20keyword:Membrane" + _FORMAT_SUFFIX

# 25 membrane-keyword entries (label 1) plus 25 entries without it (label 0).
df1 = fetch_fasta(membrane_url, 1, max_records=25)
df0 = fetch_fasta(soluble_url, 0, max_records=25)

# Stack both classes under a fresh 0..n-1 index, then prepend a 1-based
# sample identifier column: s1, s2, ...
df = pd.concat([df1, df0], ignore_index=True)
df.insert(0, "id", [f"s{n}" for n in range(1, len(df) + 1)])

# Write the dataset to disk and show it in the notebook.
df.to_csv("train_medium.csv", index=False)
print("\n✅ Saved train_medium.csv with shape:", df.shape)
display(df)



✅ Retrieved 25 sequences for label 1
✅ Retrieved 25 sequences for label 0

✅ Saved train_medium.csv with shape: (50, 4)


Unnamed: 0,id,uniprot_id,sequence,label
0,s1,sp|A0A087X1C5|CP2D7_HUMAN,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,1
1,s2,sp|A0A0B4J2F0|PIOS1_HUMAN,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,1
2,s3,sp|A0A0K2S4Q6|CD3CH_HUMAN,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,1
3,s4,sp|A0A1B0GTW7|CIROP_HUMAN,MLLLLLLLLLLPPLVLRVAASRCLHDETQKSVSLLRPPFSQLPSKS...,1
4,s5,sp|A0AV02|S12A8_HUMAN,MTQMSQVQELFHEAAQQDALAQPQPWWKTQLFMWEPVLFGTWDGVF...,1
5,s6,sp|A0AVI4|TM129_HUMAN,MDSPEVTFTLAYLVFAVCFVFTPNEFHAAGLTVQNLLSGWLGSEDA...,1
6,s7,sp|A0FGR8|ESYT2_HUMAN,MTANRDAALSSHRHPGCAQRPRTPTFASSSQRRSAFGFDDGNFPGL...,1
7,s8,sp|A0FGR9|ESYT3_HUMAN,MRAEEPCAPGAPSALGAQRTPGPELRLSSQLLPELCTFVVRVLFYL...,1
8,s9,sp|A0M8Q6|IGLC7_HUMAN,GQPKAAPSVTLFPPSSEELQANKATLVCLVSDFNPGAVTVAWKADG...,1
9,s10,sp|A0PJK1|SC5AA_HUMAN,MAANSTSDLHTPGTQLSVADIIVITVYFALNVAVGIWSSCRASRNT...,1
