In [None]:
# Load the autoreload extension and set it to mode 2, so that edited modules
# are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
from patch_gnn.data import load_ghesquire
import janitor

In [None]:
# Load the dataset through the project's loader; the cells below use the
# result as a pandas DataFrame (dropna / explode / column transforms).
data = load_ghesquire()

In [None]:
from functools import singledispatch

@singledispatch
def split_delimiter(x, delimiter=';'):
    """Split a delimited string into a list, passing missing values through.

    Dispatch happens on the type of ``x``:

    - ``str``: returns ``x.split(delimiter)``.
    - ``float`` (such as ``NaN``) and ``None``: returned unchanged.
    - any other type: raises ``NotImplementedError``.

    :param x: Value to split.
    :param delimiter: Separator handed to ``str.split``.
    :returns: A list of substrings for ``str`` input, otherwise ``x`` itself.
    :raises NotImplementedError: If no handler is registered for ``type(x)``.
    """
    raise NotImplementedError("Unsupported type!")


# ``register`` returns the undecorated function, so the two registrations can
# be stacked on a single pass-through handler.
@split_delimiter.register(float)
@split_delimiter.register(type(None))
def _split_delimiter_missing(x, delimiter=";"):
    """Return floats and ``None`` untouched; there is nothing to split."""
    return x


@split_delimiter.register(str)
def _split_delimiter_str(x, delimiter=";"):
    """Split the string ``x`` on ``delimiter``."""
    return x.split(delimiter)


In [None]:
from functools import partial
def _first_if_list(value):
    """Return the first element of a list; leave any other value as-is."""
    if isinstance(value, list):
        return value[0]
    return value


def _strip_spaces_if_str(value):
    """Strip surrounding spaces from a string; leave any other value as-is."""
    if isinstance(value, str):
        return value.strip(" ")
    return value


# Same splitter as before, but cutting on " (" instead of the default ";".
_split_on_open_paren = partial(split_delimiter, delimiter=" (")

processed_data = (
    data
    .dropna(subset=["accession"])
    # Split the isoforms cell into a list, then give every entry its own row.
    .transform_column("isoforms", split_delimiter)
    .explode("isoforms")
    # Keep only the text in front of the first " (" and trim spaces around it.
    .transform_column("isoforms", _split_on_open_paren)
    .transform_column("isoforms", _first_if_list)
    .transform_column("isoforms", _strip_spaces_if_str)
)

## Download sequences from HitHub

We have an internal mirror of UniProt, hosted on HitHub (HH). (Once again, CBTDS does all the right things!) This lets us query HH directly for the UniProt sequences we need.


In [None]:
from psycopg2 import connect
from dotenv import load_dotenv

# load_dotenv() is expected to make settings from a local .env file (such as
# HH_CONNECTION_STRING) visible through the process environment.
load_dotenv()
import os

# Fail early with an actionable message instead of handing `None` to the
# database driver when the variable is missing.
hh_dsn = os.getenv("HH_CONNECTION_STRING")
if hh_dsn is None:
    raise RuntimeError(
        "HH_CONNECTION_STRING is not set; define it in the environment or in a .env file."
    )

con = connect(dsn=hh_dsn)

In [None]:
# All non-missing accessions first, followed by all non-missing isoform IDs.
wanted_accessions = [
    identifier
    for column_name in ("accession", "isoforms")
    for identifier in processed_data[column_name].dropna().tolist()
]

In [None]:
import pandas as pd

# Bind the accessions as a query parameter rather than formatting a Python
# tuple into the SQL text: the formatted version is invalid SQL for zero or
# one accession (`()` / `('X',)`) and leans on repr() for quoting.
# psycopg2 adapts a Python list to a PostgreSQL array, which `= any(...)`
# accepts, including when the list is empty.
accession_data = pd.read_sql(
    "select * from uniprot.ref_proteome where uniprot_accn = any(%(accessions)s)",
    con=con,
    # dict.fromkeys drops repeated accessions while keeping first-seen order.
    params={"accessions": list(dict.fromkeys(wanted_accessions))},
)

In [None]:
import pandas_flavor as pf
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO

@pf.register_dataframe_method
def to_fasta(df, identifier_column_name, sequence_column_name, filename):
    """Write one FASTA record per DataFrame row.

    :param df: DataFrame holding the identifiers and sequences.
    :param identifier_column_name: Column whose values become the record IDs.
    :param sequence_column_name: Column whose values are wrapped in ``Seq``.
    :param filename: Destination handed to ``SeqIO.write``.
    :returns: ``None``; the records are written to ``filename``.
    """
    # Walk the two needed columns in lockstep instead of using ``iterrows``,
    # which builds a full Series for every row just to read two values.
    seq_records = [
        SeqRecord(Seq(sequence), id=identifier, description="", name="")
        for identifier, sequence in zip(
            df[identifier_column_name], df[sequence_column_name]
        )
    ]
    SeqIO.write(seq_records, filename, format="fasta")


In [None]:
from pyprojroot import here
# Export the fetched sequences as FASTA, at a path built from here().
accession_data.to_fasta(
    identifier_column_name="uniprot_accn",
    sequence_column_name="seq",
    filename=here() / "data/ghesquire_2011/protein_sequences.fasta",
)