In [1]:
import ast
import pprint
from os import environ

import backoff
import openai
import pandas as pd
import requests
from openai.error import RateLimitError, InvalidRequestError

In [2]:
# Take the OpenAI credentials from environment variables so no secret is
# stored in the notebook; `environ.get` yields None when a variable is unset.
openai.api_key = environ.get('OPEN_AI_API_KEY')
openai.organization = environ.get('OPEN_AI_ORG')

In [3]:
# Load the stabilizer table from a local pickle; later cells read its 'molecule' column.
# NOTE(review): unpickling can execute arbitrary code - only load a file you produced or trust.
df_stabilizers = pd.read_pickle('./protein_stabilizers.pkl')

In [4]:
@backoff.on_exception(backoff.expo, RateLimitError)
def describe_protein(molecule, model="gpt-4-turbo-preview", max_tokens=4095, temperature=1):
    """Ask an OpenAI chat model to name and classify a chemical compound.

    The call is retried with exponential backoff whenever the API raises
    ``RateLimitError``.

    Args:
        molecule: Compound/substance name or abbreviation, sent as the user message.
        model: Chat model identifier passed to ``openai.ChatCompletion.create``.
        max_tokens: Upper bound on the completion length (default unchanged: 4095).
        temperature: Sampling temperature (default unchanged: 1).

    Returns:
        The raw text of the first choice. The system prompt asks for a reply of
        the form ``{"name": ..., "type of substance": ...}``, but the text is
        returned unparsed and unvalidated.
    """
    # Continuation lines keep their leading indentation as part of the prompt text.
    system_prompt = '''# who you are: you are research scientist, expert in biochemistry.
    # your task: for given name or abbreviation you will respond with type of given chemical compound.
    # you will take input as compound or substance name or abbreviation.
    # you will respond: {"name": <IUPAC name>, "type of substance": <type of substance>}.
    # examples: 1. input: glucose; output: {"name": "glucose", "type of substance": "monosaccharide"}'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": molecule},
    ]

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    chat_message = response['choices'][0]['message']['content']
    # Emit one '*' per completed request as a lightweight progress indicator.
    print('*', end="")
    return chat_message

In [5]:
# Classify every molecule with the chat model (one API request per row).
df_stabilizers['gpt4_desc'] = df_stabilizers['molecule'].map(describe_protein)

***********************************

In [6]:
# Each model reply is expected to be a dict literal; evaluate it once per row
# and pull both fields from the parsed result.
gpt4_parsed = df_stabilizers['gpt4_desc'].map(ast.literal_eval)

df_stabilizers['gpt4_name'] = gpt4_parsed.map(lambda reply: reply.get('name'))

df_stabilizers['gpt4_type_of_substance'] = gpt4_parsed.map(
    lambda reply: reply.get('type of substance')
)

In [7]:
# Relabel 'human serum albumin' rows as 'albumin' (the UniProt search cell
# builds its query from this column).
is_hsa = df_stabilizers['molecule'] == 'human serum albumin'
df_stabilizers.loc[is_hsa, 'molecule'] = 'albumin'

In [8]:
def fetch_human_uniprot_entries(molecule, timeout=30):
    """Search reviewed human (organism 9606) UniProtKB entries by protein name.

    The query goes through ``params`` so that requests URL-encodes the whole
    expression (the previous string concatenation only escaped spaces). A
    timeout keeps a stalled connection from hanging the notebook, and a failed
    HTTP status is surfaced here rather than as a confusing error downstream.

    Args:
        molecule: Protein name to search for; lower-cased before querying.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests.Response`` of the search call.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    query = f'reviewed:true AND protein_name:({molecule.lower()}) AND organism_id:9606'
    response = requests.get(
        'https://rest.uniprot.org/uniprotkb/search',
        params={'query': query},
        timeout=timeout,
    )
    response.raise_for_status()
    return response


df_stabilizers['responses_human_proteins'] = df_stabilizers['molecule'].map(fetch_human_uniprot_entries)

In [9]:
# Keep only molecules whose UniProt search returned at least one entry.
# Truthiness also drops payloads where 'results' is missing or None, which the
# previous `!= []` comparison counted as a hit.
has_hits = df_stabilizers['responses_human_proteins'].map(
    lambda response: bool(response.json().get('results'))
)
df_stabilizers_human_prots = df_stabilizers[has_hits].copy()

In [10]:
# Store the decoded 'results' list of each search response in its own column.
df_stabilizers_human_prots['results'] = (
    df_stabilizers_human_prots['responses_human_proteins']
    .map(lambda response: response.json().get('results'))
)

In [11]:
# Expand each list in 'results' so that every UniProt entry gets its own row.
df_stabilizers_human_prots_exploded = df_stabilizers_human_prots.explode(column='results')

In [12]:
# Peek at the fields available on a UniProt entry (first row shown as a sample).
(
    df_stabilizers_human_prots_exploded['results']
    .map(lambda entry: entry.keys())
    .iloc[0]
)

dict_keys(['entryType', 'primaryAccession', 'secondaryAccessions', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'comments', 'features', 'keywords', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes'])

In [13]:
# Recommended full name, nested under
# proteinDescription -> recommendedName -> fullName -> value.
df_stabilizers_human_prots_exploded['uniprot_name'] = (
    df_stabilizers_human_prots_exploded['results'].map(
        lambda entry: entry.get('proteinDescription')
        .get('recommendedName')
        .get('fullName')
        .get('value')
    )
)

In [14]:
# Primary accession of each UniProt entry.
df_stabilizers_human_prots_exploded['uniprot_id'] = (
    df_stabilizers_human_prots_exploded['results']
    .map(lambda entry: entry.get('primaryAccession'))
)

In [15]:
# Distinct molecules that produced at least one UniProt hit.
pd.unique(df_stabilizers_human_prots_exploded['molecule'])

array(['lysozyme', 'transferrin', 'albumin', 'glutathione peroxidase',
       'insulin', 'keratin', 'sparc', 'psp', 'selenocysteine',
       'reductase protein', 'thioredoxin reductase'], dtype=object)

In [22]:
# UniProt recommended names to keep; matched against the 'uniprot_name'
# column when the selection frame is built.
choosen_entries = [
    'Lysozyme C',
    'Albumin',
    'Serotransferrin',
    'Glutathione peroxidase 1',
    'Insulin',
    'Keratin, type II cytoskeletal 2 epidermal',
    'SPARC',
    'Flavin reductase (NADPH)',
    'Thioredoxin reductase 1, cytoplasmic',
]

In [34]:
# Restrict the exploded hits to the chosen recommended names; work on a copy
# so columns added afterwards do not touch the exploded frame.
is_chosen = df_stabilizers_human_prots_exploded['uniprot_name'].isin(choosen_entries)
df_stabilizers_selected = df_stabilizers_human_prots_exploded[is_chosen].copy()

In [35]:
# Sequence string stored under sequence -> value of each entry.
df_stabilizers_selected['seq'] = (
    df_stabilizers_selected['results']
    .map(lambda entry: entry.get('sequence').get('value'))
)

In [43]:
# Text of the first FUNCTION comment of each entry, or None when the entry has
# no such comment (the previous `[...][0]` raised IndexError in that case).
df_stabilizers_selected['function'] = df_stabilizers_selected['results'].map(
    lambda entry: next(
        (
            comment.get('texts')[0].get('value')
            for comment in entry.get('comments') or []
            if comment.get('commentType') == 'FUNCTION'
        ),
        None,
    )
)

In [45]:
# First text of every MISCELLANEOUS comment, collected as a list per entry
# (empty list when an entry has none).
df_stabilizers_selected['comment_misc'] = df_stabilizers_selected['results'].map(
    lambda entry: [
        comment.get('texts')[0].get('value')
        for comment in entry.get('comments')
        if comment.get('commentType') == 'MISCELLANEOUS'
    ]
)

In [48]:
# First text of every SIMILARITY comment, collected as a list per entry
# (empty list when an entry has none).
df_stabilizers_selected['simmilarity'] = df_stabilizers_selected['results'].map(
    lambda entry: [
        comment.get('texts')[0].get('value')
        for comment in entry.get('comments')
        if comment.get('commentType') == 'SIMILARITY'
    ]
)

In [50]:
# Per-entry feature counts, read from extraAttributes -> countByFeatureType.
df_stabilizers_selected['ctn_by_feature_type'] = df_stabilizers_selected['results'].map(
    lambda entry: entry.get('extraAttributes').get('countByFeatureType')
)

In [55]:
# 'interactions' payload of every INTERACTION comment, as a list per entry.
df_stabilizers_selected['interactions'] = df_stabilizers_selected['results'].map(
    lambda entry: [
        comment.get('interactions')
        for comment in entry.get('comments')
        if comment.get('commentType') == 'INTERACTION'
    ]
)

In [65]:
# URL of the first WEB RESOURCE comment of each entry, or None when the entry
# has no such comment (the previous `[...][0]` raised IndexError in that case).
df_stabilizers_selected['resource_url'] = df_stabilizers_selected['results'].map(
    lambda entry: next(
        (
            comment.get('resourceUrl')
            for comment in entry.get('comments') or []
            if comment.get('commentType') == 'WEB RESOURCE'
        ),
        None,
    )
)

In [60]:
# Pretty-print the raw UniProt record of the first selected entry for inspection.
# `pprint` is imported in the notebook's import cell; the import used to be
# commented out here, so this cell failed with NameError on a fresh kernel.
pprint.pprint(df_stabilizers_selected['results'].iloc[0])

In [70]:
# Preview the extracted columns with one row per similarity statement.
(
    df_stabilizers_selected
    .loc[:, [
        'uniprot_name',
        'uniprot_id',
        'seq',
        'function',
        'simmilarity',
        'ctn_by_feature_type',
        'interactions',
        'resource_url',
    ]]
    .explode('simmilarity')
)

Unnamed: 0,uniprot_name,uniprot_id,seq,function,simmilarity,ctn_by_feature_type,interactions,resource_url
1,Lysozyme C,P61626,MKALIVLGLVLLSVTVQGKVFERCELARTLKRLGMDGYRGISLANW...,Lysozymes have primarily a bacteriolytic funct...,Belongs to the glycosyl hydrolase 22 family,"{'Signal': 1, 'Chain': 1, 'Domain': 1, 'Active...",[[{'interactantOne': {'uniProtKBAccession': 'P...,https://en.wikipedia.org/wiki/Lysozyme
2,Serotransferrin,P02787,MRLAVGALLVCAVLGLCLAVPDKTVRWCAVSEHEATKCQSFRDHMK...,Transferrins are iron binding transport protei...,Belongs to the transferrin family,"{'Signal': 1, 'Chain': 1, 'Domain': 2, 'Bindin...",[[{'interactantOne': {'uniProtKBAccession': 'P...,https://en.wikipedia.org/wiki/Transferrin
4,Albumin,P02768,MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKAL...,"Binds water, Ca(2+), Na(+), K(+), fatty acids,...",Belongs to the ALB/AFP/VDB family,"{'Signal': 1, 'Propeptide': 1, 'Chain': 1, 'Do...",[[{'interactantOne': {'uniProtKBAccession': 'P...,https://albumin.org
7,Glutathione peroxidase 1,P07203,MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVA...,Catalyzes the reduction of hydroperoxides in a...,Belongs to the glutathione peroxidase family,"{'Chain': 1, 'Active site': 1, 'Site': 1, 'Non...",[[{'interactantOne': {'uniProtKBAccession': 'P...,http://egp.gs.washington.edu/data/gpx1/
8,Insulin,P01308,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...,Insulin decreases blood glucose concentration....,Belongs to the insulin family,"{'Signal': 1, 'Peptide': 2, 'Propeptide': 1, '...",[[{'interactantOne': {'uniProtKBAccession': 'P...,https://www.lillydiabetes.com/assets/pdf/pp-ld...
11,"Keratin, type II cytoskeletal 2 epidermal",P35908,MSCQISCKSRGRGGGGGGFRGFSSGSAVVSGGSRRSTSSFSCLSRH...,Probably contributes to terminal cornification...,Belongs to the intermediate filament family,"{'Chain': 1, 'Domain': 1, 'Region': 9, 'Compos...",[[{'interactantOne': {'uniProtKBAccession': 'P...,https://en.wikipedia.org/wiki/Keratin_2A
14,SPARC,P09486,MRAWIFFLLCLAGRALAAPQQEALPDETEVVEETVAEVTEVSVGAN...,Appears to regulate cell growth through intera...,Belongs to the SPARC family,"{'Signal': 1, 'Chain': 1, 'Domain': 3, 'Bindin...",[[{'interactantOne': {'uniProtKBAccession': 'P...,https://en.wikipedia.org/wiki/Osteonectin
23,Flavin reductase (NADPH),P30043,MAVKKIAIFGATGQTGLTTLAQAVQAGYEVTVLVRDSSRLPSEGPR...,Broad specificity oxidoreductase that catalyze...,,"{'Initiator methionine': 1, 'Chain': 1, 'Bindi...",[[{'interactantOne': {'uniProtKBAccession': 'P...,http://egp.gs.washington.edu/data/blvrb/
25,"Thioredoxin reductase 1, cytoplasmic",Q16881,MGCAEGKAVAAAAPTELQTKGKNGDGRRRSAKDHHPGKTLPENPAG...,Reduces disulfideprotein thioredoxin (Trx) to ...,Belongs to the class-I pyridine nucleotide-dis...,"{'Chain': 1, 'Domain': 1, 'Region': 2, 'Compos...",[[{'interactantOne': {'uniProtKBAccession': 'Q...,http://egp.gs.washington.edu/data/txnrd1/


In [71]:
# Test frame: the selected proteins without the raw payload and interaction columns.
df_proteins_for_test_1 = df_stabilizers_selected.loc[:, [
    'uniprot_name',
    'uniprot_id',
    'seq',
    'function',
    'simmilarity',
    'ctn_by_feature_type',
    'resource_url',
]]

In [94]:
# Confirm which columns ended up in the test frame.
df_proteins_for_test_1.columns

Index(['uniprot_name', 'uniprot_id', 'seq', 'function', 'simmilarity',
       'ctn_by_feature_type', 'resource_url'],
      dtype='object')

In [91]:
# Top-level keys of the dict export when the old index is kept as an 'index' column.
(
    df_proteins_for_test_1
    .reset_index()
    .to_dict()
    .keys()
)

dict_keys(['index', 'uniprot_name', 'uniprot_id', 'seq', 'function', 'simmilarity', 'ctn_by_feature_type', 'resource_url'])

In [93]:
# Dict export ({column: {row position: value}}) with the index renumbered from 0.
(
    df_proteins_for_test_1
    .reset_index(drop=True)
    .to_dict(orient='dict')
)

{'uniprot_name': {0: 'Lysozyme C', 1: 'Serotransferrin', 2: 'Albumin', 3: 'Glutathione peroxidase 1', 4: 'Insulin', 5: 'Keratin, type II cytoskeletal 2 epidermal', 6: 'SPARC', 7: 'Flavin reductase (NADPH)', 8: 'Thioredoxin reductase 1, cytoplasmic'}, 'uniprot_id': {0: 'P61626', 1: 'P02787', 2: 'P02768', 3: 'P07203', 4: 'P01308', 5: 'P35908', 6: 'P09486', 7: 'P30043', 8: 'Q16881'}, 'seq': {0: 'MKALIVLGLVLLSVTVQGKVFERCELARTLKRLGMDGYRGISLANWMCLAKWESGYNTRATNYNAGDRSTDYGIFQINSRYWCNDGKTPGAVNACHLSCSALLQDNIADAVACAKRVVRDPQGIRAWVAWRNRCQNRDVRQYVQGCGV', 1: 'MRLAVGALLVCAVLGLCLAVPDKTVRWCAVSEHEATKCQSFRDHMKSVIPSDGPSVACVKKASYLDCIRAIAANEADAVTLDAGLVYDAYLAPNNLKPVVAEFYGSKEDPQTFYYAVAVVKKDSGFQMNQLRGKKSCHTGLGRSAGWNIPIGLLYCDLPEPRKPLEKAVANFFSGSCAPCADGTDFPQLCQLCPGCGCSTLNQYFGYSGAFKCLKDGAGDVAFVKHSTIFENLANKADRDQYELLCLDNTRKPVDEYKDCHLAQVPSHTVVARSMGGKEDLIWELLNQAQEHFGKDKSKEFQLFSSPHGKDLLFKDSAHGFLKVPPRMDAKMYLGYEYVTAIRNLREGTCPEAPTDECKPVKWCALSHHERLKCDEWSVNSVGKIECVSAETTEDCIAKIMNGEADAMSLDGGFVYIAGKCGLVPVLAENYNKSDNCEDTPEAGYFAVA

In [95]:
# Display the final test frame.
df_proteins_for_test_1

Unnamed: 0,uniprot_name,uniprot_id,seq,function,simmilarity,ctn_by_feature_type,resource_url
1,Lysozyme C,P61626,MKALIVLGLVLLSVTVQGKVFERCELARTLKRLGMDGYRGISLANW...,Lysozymes have primarily a bacteriolytic funct...,[Belongs to the glycosyl hydrolase 22 family],"{'Signal': 1, 'Chain': 1, 'Domain': 1, 'Active...",https://en.wikipedia.org/wiki/Lysozyme
2,Serotransferrin,P02787,MRLAVGALLVCAVLGLCLAVPDKTVRWCAVSEHEATKCQSFRDHMK...,Transferrins are iron binding transport protei...,[Belongs to the transferrin family],"{'Signal': 1, 'Chain': 1, 'Domain': 2, 'Bindin...",https://en.wikipedia.org/wiki/Transferrin
4,Albumin,P02768,MKWVTFISLLFLFSSAYSRGVFRRDAHKSEVAHRFKDLGEENFKAL...,"Binds water, Ca(2+), Na(+), K(+), fatty acids,...",[Belongs to the ALB/AFP/VDB family],"{'Signal': 1, 'Propeptide': 1, 'Chain': 1, 'Do...",https://albumin.org
7,Glutathione peroxidase 1,P07203,MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVA...,Catalyzes the reduction of hydroperoxides in a...,[Belongs to the glutathione peroxidase family],"{'Chain': 1, 'Active site': 1, 'Site': 1, 'Non...",http://egp.gs.washington.edu/data/gpx1/
8,Insulin,P01308,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...,Insulin decreases blood glucose concentration....,[Belongs to the insulin family],"{'Signal': 1, 'Peptide': 2, 'Propeptide': 1, '...",https://www.lillydiabetes.com/assets/pdf/pp-ld...
11,"Keratin, type II cytoskeletal 2 epidermal",P35908,MSCQISCKSRGRGGGGGGFRGFSSGSAVVSGGSRRSTSSFSCLSRH...,Probably contributes to terminal cornification...,[Belongs to the intermediate filament family],"{'Chain': 1, 'Domain': 1, 'Region': 9, 'Compos...",https://en.wikipedia.org/wiki/Keratin_2A
14,SPARC,P09486,MRAWIFFLLCLAGRALAAPQQEALPDETEVVEETVAEVTEVSVGAN...,Appears to regulate cell growth through intera...,[Belongs to the SPARC family],"{'Signal': 1, 'Chain': 1, 'Domain': 3, 'Bindin...",https://en.wikipedia.org/wiki/Osteonectin
23,Flavin reductase (NADPH),P30043,MAVKKIAIFGATGQTGLTTLAQAVQAGYEVTVLVRDSSRLPSEGPR...,Broad specificity oxidoreductase that catalyze...,[],"{'Initiator methionine': 1, 'Chain': 1, 'Bindi...",http://egp.gs.washington.edu/data/blvrb/
25,"Thioredoxin reductase 1, cytoplasmic",Q16881,MGCAEGKAVAAAAPTELQTKGKNGDGRRRSAKDHHPGKTLPENPAG...,Reduces disulfideprotein thioredoxin (Trx) to ...,[Belongs to the class-I pyridine nucleotide-di...,"{'Chain': 1, 'Domain': 1, 'Region': 2, 'Compos...",http://egp.gs.washington.edu/data/txnrd1/
