In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the UniProt query export and flatten every record stored under "results"
# into one row; nested keys become dotted column names.
df = pd.json_normalize(pd.read_json("./uniprot-query.json")["results"])

In [3]:
def extract_function(data):
    """Return the first text value of the first FUNCTION comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no FUNCTION comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'FUNCTION'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_biotech(data):
    """Return the first text value of the first BIOTECHNOLOGY comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no BIOTECHNOLOGY comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'BIOTECHNOLOGY'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_caution(data):
    """Return the first text value of the first CAUTION comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no CAUTION comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'CAUTION'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_cofactor(data):
    """Return the name of the first cofactor listed in a COFACTOR comment.

    Args:
        data: Iterable of comment dicts (each with 'commentType'), or
            None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'name' of the first cofactor of the first usable COFACTOR
        comment, or NaN when `data` is missing or nothing usable is found.

    Notes:
        A COFACTOR comment without a usable 'cofactors' list (key missing,
        value not iterable, or first cofactor without a 'name') is skipped
        and the scan continues with the next comment. Only those expected
        data-shape problems are tolerated; any other exception propagates
        instead of being silently swallowed.
    """
    if data is None or isinstance(data, float):
        return np.nan
    for item in data:
        if item['commentType'] != 'COFACTOR':
            continue
        try:
            for cofactor in item['cofactors']:
                return cofactor['name']
        except (KeyError, TypeError):
            # Best-effort extraction: malformed/absent cofactor data in this
            # comment is not fatal, move on to the next comment.
            continue
    return np.nan

def extract_disease(data):
    """Return the description of the first DISEASE comment carrying a 'disease' entry.

    Args:
        data: Iterable of comment dicts (each with 'commentType'), or
            None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The disease 'description' of the first DISEASE comment that has a
        'disease' key, or NaN when `data` is missing or none qualifies.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # DISEASE comments without a 'disease' key are passed over.
    descriptions = (
        comment['disease']["description"]
        for comment in data
        if comment['commentType'] == 'DISEASE' and 'disease' in comment
    )
    return next(descriptions, np.nan)

def extract_domain(data):
    """Return the first text value of the first DOMAIN comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no DOMAIN comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'DOMAIN'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_induction(data):
    """Return the first text value of the first INDUCTION comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no INDUCTION comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'INDUCTION'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_gene_name(data):
    """Return the primary gene name of the first gene record.

    Args:
        data: Sequence of gene dicts, or None / a float (e.g. NaN) when the
            entry has no gene information.

    Returns:
        ``data[0]['geneName']['value']`` when the first gene record has a
        'geneName' key; NaN when `data` is missing, empty, or the first
        record has no 'geneName'. Only the first record is inspected.
    """
    # An empty sequence is treated like missing data instead of raising IndexError.
    if data is None or isinstance(data, float) or len(data) == 0:
        return np.nan
    first_gene = data[0]
    if "geneName" in first_gene:
        return first_gene['geneName']['value']
    return np.nan

def extract_miscellaneous(data):
    """Return the first text value of the first MISCELLANEOUS comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no MISCELLANEOUS comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'MISCELLANEOUS'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_pathway(data):
    """Return the first text value of the first PATHWAY comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no PATHWAY comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'PATHWAY'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_pharmaceutical(data):
    """Return the first text value of the first PHARMACEUTICAL comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no PHARMACEUTICAL comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'PHARMACEUTICAL'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_polymorphism(data):
    """Return the first text value of the first POLYMORPHISM comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no POLYMORPHISM comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'POLYMORPHISM'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_ptm(data):
    """Return the first text value of the first PTM comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no PTM comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'PTM'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_similarity(data):
    """Return the first text value of the first SIMILARITY comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no SIMILARITY comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'SIMILARITY'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_subunit(data):
    """Return the first text value of the first SUBUNIT comment that has texts.

    Args:
        data: Iterable of comment dicts (each with 'commentType' and 'texts'),
            or None / a float (e.g. NaN) when the entry has no comments.

    Returns:
        The 'value' of the first text entry found, or NaN when `data` is
        missing or no SUBUNIT comment provides a text.
    """
    if isinstance(data, float) or data is None:
        return np.nan
    # Lazy scan: stops at the first text of the first matching comment.
    candidates = (
        entry['value']
        for comment in data
        if comment['commentType'] == 'SUBUNIT'
        for entry in comment['texts']
    )
    return next(candidates, np.nan)

def extract_gene_encoding_type(data):
    """Return the 'geneEncodingType' of the first gene-location record.

    Args:
        data: Sequence of gene-location dicts, or None / a float (e.g. NaN)
            when the entry has no gene-location information.

    Returns:
        ``data[0]['geneEncodingType']`` when available; NaN when `data` is
        missing, empty, or the first record has no 'geneEncodingType' key.
        Only the first record is inspected.
    """
    # Empty sequences and records without the key yield NaN rather than
    # raising IndexError / KeyError in the middle of a column-wide apply.
    if data is None or isinstance(data, float) or len(data) == 0:
        return np.nan
    return data[0].get("geneEncodingType", np.nan)

def find_go_terms(data):
    """Collect GO cross-reference ids, grouped by the prefix of their property values.

    Args:
        data: Iterable of cross-reference dicts, or None / a float (e.g. NaN)
            when the entry has no cross-references.

    Returns:
        A tuple ``(CC, BP, MF)`` of lists. For every cross-reference whose
        'database' is 'GO', each property value is inspected: a value
        starting with 'C' adds the reference 'id' to CC, 'P' to BP and 'F'
        to MF. All three lists are empty when `data` is missing.

    Notes:
        Every property of a GO reference is checked, so an id is appended
        once per matching property value.
        NOTE(review): presumably C/P/F stand for cellular component,
        biological process and molecular function - confirm against the
        UniProt cross-reference format.
    """
    CC = []
    BP = []
    MF = []

    # Missing cross-references (None / NaN cell) yield empty lists, mirroring
    # how the other extractors treat missing data instead of raising TypeError.
    if data is None or isinstance(data, float):
        return CC, BP, MF

    for item in data:
        if item.get('database') != 'GO':
            continue
        go_id = item.get('id')
        for prop in item.get('properties', []):
            value = prop.get('value', '')
            if value.startswith('C'):
                CC.append(go_id)
            elif value.startswith('P'):
                BP.append(go_id)
            elif value.startswith('F'):
                MF.append(go_id)

    return CC, BP, MF

In [4]:
# Inspect the column names of the normalized frame before deriving new columns.
df.columns

Index(['entryType', 'primaryAccession', 'secondaryAccessions', 'uniProtkbId',
       'annotationScore', 'proteinExistence', 'genes', 'comments', 'features',
       'keywords',
       ...
       'proteinDescription.includes',
       'extraAttributes.countByCommentType.BIOTECHNOLOGY',
       'extraAttributes.countByCommentType.PHARMACEUTICAL',
       'proteinDescription.allergenName.value',
       'extraAttributes.countByCommentType.ALLERGEN',
       'extraAttributes.countByFeatureType.Non-standard residue',
       'proteinDescription.innNames',
       'extraAttributes.countByFeatureType.Non-terminal residue',
       'proteinDescription.allergenName.evidences',
       'extraAttributes.countByFeatureType.Non-adjacent residues'],
      dtype='object', length=103)

In [5]:
# Target column -> extractor applied to each entry's 'comments' value.
# Dict insertion order is relied upon so the columns are created in the same
# order as before (later cells derive the output column order from df.columns).
comment_extractors = {
    'function': extract_function,
    'biotech': extract_biotech,
    'caution': extract_caution,
    'cofactor': extract_cofactor,
    'disease': extract_disease,
    'domain': extract_domain,
    'induction': extract_induction,
    'miscellaneous': extract_miscellaneous,
    'pathway': extract_pathway,
    'pharmaceutical': extract_pharmaceutical,
    'polymorphism': extract_polymorphism,
    'ptm': extract_ptm,
    'similarity': extract_similarity,
    'subunit': extract_subunit,
}

# Gene name comes from the 'genes' column; the column is named 'gen_name' (sic).
df['gen_name'] = df['genes'].apply(extract_gene_name)

for column_name, extractor in comment_extractors.items():
    df[column_name] = df['comments'].apply(extractor)

df['gene_encoding_type'] = df["geneLocations"].apply(extract_gene_encoding_type)

# find_go_terms returns three lists per entry; spread them over three columns.
go_columns = ['GO_terms_CC', 'GO_terms_BP', 'GO_terms_MF']
df[go_columns] = df['uniProtKBCrossReferences'].apply(
    lambda cross_refs: pd.Series(find_go_terms(cross_refs))
)

In [6]:
# Columns whose name contains any of these fragments are dropped.
DROP_IF_NAME_CONTAINS = (
    "countByCommentType",
    "entryAudit",
    "countByFeatureType",
    "recommendedName",
)

# Columns dropped by exact name. The set of normalized columns depends on which
# keys occur in the queried records, so a name listed here may be absent from
# df; absent names are simply ignored instead of raising ValueError.
DROP_EXACT = (
    "entryType",
    "secondaryAccessions",
    "uniProtkbId",
    "annotationScore",
    "organism.scientificName",
    "organism.commonName",
    "organism.taxonId",
    "organism.lineage",
    "sequence.length",
    "sequence.molWeight",
    "genes",
    "comments",
    "features",
    "keywords",
    "references",
    "sequence.crc64",
    "sequence.md5",
    "extraAttributes.uniParcId",
    "proteinDescription.alternativeNames",
    "geneLocations",
    "organism.evidences",
    "proteinDescription.flag",
    "proteinDescription.contains",
    "proteinDescription.cdAntigenNames",
    "proteinDescription.includes",
    "proteinDescription.allergenName.value",
    "proteinDescription.innNames",
    "proteinDescription.allergenName.evidences",
    "uniProtKBCrossReferences",
)

# Keep the remaining columns in their original df order.
filtered_columns = [
    column
    for column in df.columns.to_list()
    if column not in DROP_EXACT
    and not any(fragment in column for fragment in DROP_IF_NAME_CONTAINS)
]

In [7]:
# Select the retained columns and give the sequence column a flat name.
df_cleaned = (
    df
    .loc[:, filtered_columns]
    .rename(columns={'sequence.value': 'sequence'})
)
df_cleaned.head()

Unnamed: 0,primaryAccession,proteinExistence,sequence,gen_name,function,biotech,caution,cofactor,disease,domain,...,pathway,pharmaceutical,polymorphism,ptm,similarity,subunit,gene_encoding_type,GO_terms_CC,GO_terms_BP,GO_terms_MF
0,A0A087X1C5,5: Uncertain,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,CYP2D7,May be responsible for the metabolism of many ...,,Could be the product of a pseudogene,heme,,,...,,,A rare double polymorphism may allow the expre...,,Belongs to the cytochrome P450 family,,,"[GO:0005737, GO:0043231, GO:0016020, GO:0005739]","[GO:0019369, GO:0042178, GO:0006805]","[GO:0070330, GO:0020037, GO:0005506, GO:0016712]"
1,A0A0B4J2F0,1: Evidence at protein level,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,PIGBOS1,Plays a role in regulation of the unfolded pro...,,,,,,...,,,,,,Homooligomer (PubMed:31653868). Interacts (via...,,[GO:0005741],"[GO:1900101, GO:0006986]",[]
2,A0A0B4J2F2,5: Uncertain,MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH...,SIK1B,Probable serine/threonine-protein kinase,,Product of a dubious gene prediction. The corr...,Mg(2+),,,...,,,,,Belongs to the protein kinase superfamily. CAM...,,,"[GO:0005737, GO:0005634]","[GO:0035556, GO:0006468]","[GO:0005524, GO:0000287, GO:0106310, GO:0004674]"
3,A0A0C5B5G6,1: Evidence at protein level,MRWQEMGYIFYPRKLR,MT-RNR1,Regulates insulin sensitivity and metabolic ho...,,This peptide has been shown to be biologically...,,,,...,,,,,,Interacts with transcription factors ATF1 and ...,Mitochondrion,"[GO:0005615, GO:0005739, GO:0005634]","[GO:0032147, GO:2001145, GO:0001649, GO:003368...","[GO:0003677, GO:0140297]"
4,A0A0K2S4Q6,1: Evidence at protein level,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,CD300H,May play an important role in innate immunity ...,,The reference genome assembly (GRCh38.p12) rep...,,,,...,,,Some individuals have a mutation (NM_001324073...,,Belongs to the CD300 family,Interacts with TYROBP and HCST,,"[GO:0005576, GO:0005886]",[GO:0030593],[GO:0004888]


In [9]:
# Persist the cleaned table as CSV. With the default arguments the row index is
# written as an unnamed first column.
# NOTE(review): the GO_terms_* columns hold Python lists; presumably they are
# written as their string representation - confirm downstream readers parse them.
df_cleaned.to_csv("Uniprot-query-extracted_comments.csv")

In [14]:
# Sanity check: (rows, columns) of the cleaned table.
df_cleaned.shape

(20422, 22)

In [15]:
# Display the cleaned table; the rendering of a frame this large is truncated.
df_cleaned

Unnamed: 0,primaryAccession,proteinExistence,sequence,gen_name,function,biotech,caution,cofactor,disease,domain,...,pathway,pharmaceutical,polymorphism,ptm,similarity,subunit,gene_encoding_type,GO_terms_CC,GO_terms_BP,GO_terms_MF
0,A0A087X1C5,5: Uncertain,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,CYP2D7,May be responsible for the metabolism of many ...,,Could be the product of a pseudogene,heme,,,...,,,A rare double polymorphism may allow the expre...,,Belongs to the cytochrome P450 family,,,"[GO:0005737, GO:0043231, GO:0016020, GO:0005739]","[GO:0019369, GO:0042178, GO:0006805]","[GO:0070330, GO:0020037, GO:0005506, GO:0016712]"
1,A0A0B4J2F0,1: Evidence at protein level,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,PIGBOS1,Plays a role in regulation of the unfolded pro...,,,,,,...,,,,,,Homooligomer (PubMed:31653868). Interacts (via...,,[GO:0005741],"[GO:1900101, GO:0006986]",[]
2,A0A0B4J2F2,5: Uncertain,MVIMSEFSADPAGQGQGQQKPLRVGFYDIERTLGKGNFAVVKLARH...,SIK1B,Probable serine/threonine-protein kinase,,Product of a dubious gene prediction. The corr...,Mg(2+),,,...,,,,,Belongs to the protein kinase superfamily. CAM...,,,"[GO:0005737, GO:0005634]","[GO:0035556, GO:0006468]","[GO:0005524, GO:0000287, GO:0106310, GO:0004674]"
3,A0A0C5B5G6,1: Evidence at protein level,MRWQEMGYIFYPRKLR,MT-RNR1,Regulates insulin sensitivity and metabolic ho...,,This peptide has been shown to be biologically...,,,,...,,,,,,Interacts with transcription factors ATF1 and ...,Mitochondrion,"[GO:0005615, GO:0005739, GO:0005634]","[GO:0032147, GO:2001145, GO:0001649, GO:003368...","[GO:0003677, GO:0140297]"
4,A0A0K2S4Q6,1: Evidence at protein level,MTQRAGAAMLPSALLLLCVPGCLTVSGPSTVMGAVGESLSVQCRYE...,CD300H,May play an important role in innate immunity ...,,The reference genome assembly (GRCh38.p12) rep...,,,,...,,,Some individuals have a mutation (NM_001324073...,,Belongs to the CD300 family,Interacts with TYROBP and HCST,,"[GO:0005576, GO:0005886]",[GO:0030593],[GO:0004888]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20417,Q9Y3F1,5: Uncertain,MSLLWTPQILTISFVSYILSLFPSPFPSCYTSCWFETSITTEKELN...,,May be associated with TAP2 isoform activity,,Product of a dubious gene prediction,,,,...,,,,,,,,[],[],[]
20418,Q9Y6C7,5: Uncertain,MAHHSLNTFYIWHNNVLHTHLVFFLPHLLNQPFSRGSFLIWLLLCW...,LINC00312,,,Product of a dubious CDS prediction. May be a ...,,,,...,,,,,,,,[],[],[]
20419,Q9Y6Z2,4: Predicted,MGTAVGPHHSPAPHDSALPARLLTSDFPYGRSCQIEQVKYSVPDTG...,LINC01558,,,,,,,...,,,,,,,,[],[],[]
20420,X6R8D5,5: Uncertain,MGRKEHESPSQPHMCGWEDSQKPSVPSHGPKTPSCKGVKAPHSSRP...,GUCA1ANB,,,Product of a dubious gene prediction,,,,...,,,,,,,,[],[],[]
