In [1]:
import os
from dataclasses import dataclass
from functools import cached_property
import scanpy as sc
import anndata as ad
import pandas as pd
import requests
import torch
from collections.abc import Callable, Iterable
from esm import FastaBatchedDataset, pretrained
from Bio import ExPASy
from Bio import SwissProt
import requests


try:
    import torch
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer, EsmModel
except ImportError as e:
    torch = None
    DataLoader = None
    AutoTokenizer = None
    EsmModel = None
    raise ImportError(
        "To use gene embedding, please install `transformers` and `torch` \
            e.g. via `pip install cfp['embedding']`."
    ) from e

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
adata = sc.read_h5ad("/lustre/groups/ml01/workspace/ot_perturbation/data/pbmc/adata_hvg2000_LV.h5ad")

In [3]:
cytokines = adata[adata.obs["cytokine"]!="PBS"].obs["cytokine"].unique()
len(cytokines)

90

In [4]:
def get_uniprot_id(cytokine_name, organism="Homo sapiens"):
    """
    Get the UniProt ID for a given cytokine name.

    Parameters:
        cytokine_name (str): Name of the cytokine (e.g., IL-12A).
        organism (str): Organism name (default: Homo sapiens).

    Returns:
        list: A list of UniProt IDs matching the query.
    """
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    query = f'"{cytokine_name}" AND (organism_name:"{organism}")'

    params = {
        "query": query,
        "format": "json",
        "fields": "accession,id,protein_name,organism_name"
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        results = response.json().get("results", [])
        
        if not results:
            return []

        uniprot_ids = [
            {"id": result["primaryAccession"], 
             "name": result.get("proteinDescription", {}).get("recommendedName", {}).get("fullName", "Unknown")}
            for result in results
        ]
        return uniprot_ids

    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from UniProt: {e}")
        return []

# Example: Search for UniProt IDs for IL-12A
cytokine_name = "IL-12A"
uniprot_ids = get_uniprot_id(cytokine_name)

if uniprot_ids:
    for protein in uniprot_ids:
        print(f"UniProt ID: {protein['id']}, Name: {protein['name']}")
else:
    print(f"No UniProt IDs found for {cytokine_name}.")


UniProt ID: P29459, Name: {'value': 'Interleukin-12 subunit alpha'}
UniProt ID: E9PGR3, Name: {'evidences': [{'evidenceCode': 'ECO:0000256', 'source': 'ARBA', 'id': 'ARBA00014463'}, {'evidenceCode': 'ECO:0000256', 'source': 'RuleBase', 'id': 'RU363133'}], 'value': 'Interleukin-12 subunit alpha'}
UniProt ID: E7ENE1, Name: {'evidences': [{'evidenceCode': 'ECO:0000256', 'source': 'ARBA', 'id': 'ARBA00014463'}, {'evidenceCode': 'ECO:0000256', 'source': 'RuleBase', 'id': 'RU363133'}], 'value': 'Interleukin-12 subunit alpha'}
UniProt ID: O60595, Name: {'evidences': [{'evidenceCode': 'ECO:0000256', 'source': 'ARBA', 'id': 'ARBA00014463'}, {'evidenceCode': 'ECO:0000256', 'source': 'RuleBase', 'id': 'RU363133'}], 'value': 'Interleukin-12 subunit alpha'}


In [5]:
res = {}
cyto_not_found = []
for cytokine in cytokines:
    try:
        res[cytokine] = get_uniprot_id(cytokine)[0]['id']
    except IndexError:
        cyto_not_found.append(cytokine)

In [20]:
for k1,v1 in res.items():
    for k2,v2 in res.items():
        if (v1==v2) and k1!=k2:
            print(k1,k2)

IL-12 IL-35
IL-35 IL-12


('Q06643', 'Q06643')

In [6]:
cyto_not_found

['FGF-beta',
 'IFN-alpha1',
 'IFN-lambda2',
 'IFN-lambda3',
 'IL-32-beta',
 'IL-36-alpha',
 'LT-alpha1-beta2',
 'LT-alpha2-beta1']

In [23]:
manual_uniprot = {
    'IL-12': 'P29459',
    'EBI3': 'Q14213',
    'FGF-beta': 'P09038',
    "IL-32-beta": "P24001",
    "IFN-alpha1": "L0N195",
    'IFN-lambda2': "Q8IZJ0",
    'IFN-lambda3': "Q8IZI9",
    'IL-36-alpha': 'Q9JLA2',
    'LT-alpha': 'P01374', #TODO: concatenate with beta to get LT-alpha1-beta2, and LT-alpha2-beta1
    'LT-beta': 'Q06643', #TODO: concatenate with beta to get LT-alpha1-beta2, and LT-alpha2-beta1
}

In [24]:
res.update(manual_uniprot)

In [25]:
len(res)

91

In [26]:
def fetch_amino_acid_sequence(uniprot_id):
    """
    Fetch the amino acid sequence of a protein using its UniProt ID.
    
    Parameters:
        uniprot_id (str): The UniProt ID of the protein.

    Returns:
        str: Amino acid sequence of the protein.
    """
    try:
        # Fetch the record from UniProt
        handle = ExPASy.get_sprot_raw(uniprot_id)
        record = SwissProt.read(handle)
        
        # Extract the amino acid sequence
        sequence = record.sequence
        return sequence
    except Exception as e:
        print(f"Error fetching sequence: {e}")
        return None

In [27]:
amino_acid_seqs = {}
for cyto, uniprot_id in res.items():
    amino_acid_seqs[cyto] = fetch_amino_acid_sequence(uniprot_id)

In [28]:
amino_acid_seqs["LT-alpha1-beta2"] = amino_acid_seqs["LT-alpha"] + ":" + amino_acid_seqs["LT-beta"] + ":" + amino_acid_seqs["LT-beta"]
amino_acid_seqs["LT-alpha2-beta1"] = amino_acid_seqs["LT-alpha"] + ":" + amino_acid_seqs["LT-alpha"] + ":" + amino_acid_seqs["LT-beta"]
amino_acid_seqs["IL-35"] = amino_acid_seqs["IL-12"] + ":" + amino_acid_seqs["EBI3"] 

In [29]:
set(cytokines) - set(amino_acid_seqs.keys()) , set(amino_acid_seqs.keys())-set(cytokines)

(set(), {'EBI3', 'LT-alpha', 'LT-beta'})

In [30]:
out_dir = "/lustre/groups/ml01/workspace/ot_perturbation/data/pbmc"
with open(os.path.join(out_dir, "cytokines.fasta"), "w") as fasta_file:
    for cytokine, seq in amino_acid_seqs.items():
        fasta_file.write(f">{cytokine}\n")
        fasta_file.write(f"{seq}\n")