From 7c98c33b094abfeb48c7e4f7220c116c2843b26e Mon Sep 17 00:00:00 2001 From: dvklopfenstein Date: Wed, 30 Nov 2022 11:55:52 -0500 Subject: [PATCH] Added notebook to address https://github.com/tanghaibao/goatools/issues/251 --- notebooks/annotations_gpad_file.ipynb | 255 ++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100755 notebooks/annotations_gpad_file.ipynb diff --git a/notebooks/annotations_gpad_file.ipynb b/notebooks/annotations_gpad_file.ipynb new file mode 100755 index 0000000..c206335 --- /dev/null +++ b/notebooks/annotations_gpad_file.ipynb @@ -0,0 +1,255 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reading a _Gene Product Association Data_ (GPAD) file" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Download a GPAD file\n", + "2. Load the GPAD file into the GpadReader\n", + "3. Get Annotations\n", + "\n", + "**Bonus: Each line in the GPAD file is stored in a namedtuple**:\n", + " * Namedtuple fields\n", + " * Print a subset of the namedtuple fields" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1) Download a GPAD file and GO Dag file" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from os.path import exists\n", + "if not exists('goa_human.gpad'):\n", + " !rm -f goa_human.gpad.gz\n", + " !wget http://current.geneontology.org/annotations/goa_human.gpad.gz\n", + " !gunzip goa_human.gpad.gz" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "if not exists('go-basic.obo'):\n", + " !wget http://geneontology.org/ontology/go-basic.obo" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2) Load the GPAD file into the GpadReader" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " EXISTS: go-basic.obo\n", + "go-basic.obo: fmt(1.2) rel(2022-11-03) 46,800 Terms\n", + "HMS:0:00:16.880690 635,765 annotations READ: goa_human.gpad \n" + ] + } + ], + "source": [ + "from goatools.base import get_godag\n", + "from goatools.anno.gpad_reader import GpadReader\n", + "\n", + "godag = get_godag(\"go-basic.obo\")\n", + "anno = GpadReader(\"goa_human.gpad\", godag=godag)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3) Get Annotations\n", + "The annotations will be stored in three dicts, one for each GODAG branch, where:\n", + " * the key is the protein ID and \n", + " * the value is a list of GO IDs associated with the protein." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "ns2assc = anno.get_ns2assc()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MF A0A024RBG1 : GO:0000298 GO:0008486 GO:0034431 GO:0034432 GO:0046872 GO:0050072\n", + "MF A0A075B6Q5 : GO:0003823 GO:0034987\n", + "MF A0A075B6R2 : GO:0003823 GO:0034987\n", + "CC A0A024RBG1 : GO:0005634 GO:0005737 GO:0005829\n", + "CC A0A075B6H5 : GO:0005886\n", + "CC A0A075B6H7 : GO:0005615 GO:0005886 GO:0019814\n", + "BP A0A024RBG1 : GO:0071543 GO:1901907 GO:1901909 GO:1901911\n", + "BP A0A075B6H5 : GO:0007166\n", + "BP A0A075B6H7 : GO:0002250 GO:0002377 GO:0006955\n" + ] + } + ], + "source": [ + "for namespace, associations in ns2assc.items():\n", + " for protein_id, go_ids in sorted(associations.items())[:3]:\n", + " print(\"{NS} {PROT:7} : {GOs}\".format(\n", + " NS=namespace,\n", + " PROT=protein_id,\n", + " GOs=' '.join(sorted(go_ids))))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Bonus: The GPAD is stored as a list of named tuples\n", + "The list of namedtuples is stored in the **GpadReader** data member named **_associations_**.\n", + "\n", + "Each namedtuple stores data for one line in the GPAD file." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ntgpadobj(DB='UniProtKB', DB_ID='A0A024RBG1', Qualifier={'enables'}, GO_ID='GO:0046872', DB_Reference={'GO_REF:0000043'}, ECO='ECO:0000501', Evidence_Code='IEA', With_From={'UniProtKB-KW:KW-0479'}, Taxon=None, Date=datetime.date(2022, 9, 7), Assigned_By='UniProt', Extension=None, Properties={}, NS='MF')\n" + ] + } + ], + "source": [ + "# Sort the list of GPAD namedtuples by ID\n", + "nts = sorted(anno.associations, key=lambda nt:nt.DB_ID)\n", + "\n", + "# Print one namedtuple\n", + "print(nts[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Namedtuple fields\n", + "```\n", + "DB # 0 required 1 UniProtKB\n", + "DB_ID # 1 required 1 P12345\n", + "DB_Symbol # 2 required 1 PHO3\n", + "Qualifier # 3 optional 0 or greater NOT\n", + "GO_ID # 4 required 1 GO:0003993\n", + "DB_Reference # 5 required 1 or greater PMID:2676709\n", + "Evidence_Code # 6 required 1 IMP\n", + "With_From # 7 optional 0 or greater GO:0000346\n", + "Aspect # 8 required 1 F\n", + "DB_Name # 9 optional 0 or 1 Toll-like receptor 4\n", + "DB_Synonym # 10 optional 0 or greater hToll|Tollbooth\n", + "DB_Type # 11 required 1 protein\n", + "Taxon # 12 required 1 or 2 taxon:9606\n", + "Date # 13 required 1 20090118\n", + "Assigned_By # 14 required 1 SGD\n", + "Annotation_Extension # 15 optional 0 or greater part_of(CL:0000576)\n", + "Gene_Product_Form_ID # 16 optional 0 or 1 UniProtKB:P12345-2\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print a subset of the namedtuple fields" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MF A0A024RBG1 GO:0046872 IEA 2022-09-07 UniProt\n", + "CC A0A024RBG1 GO:0005829 IDA 2016-12-04 HPA\n", + "BP A0A024RBG1 GO:1901907 IBA 2017-02-28 GO_Central\n", + "MF A0A024RBG1 GO:0000298 IBA 2017-02-28 GO_Central\n", + "MF A0A024RBG1 GO:0008486 IBA 2021-05-30 GO_Central\n", + "CC A0A024RBG1 GO:0005634 IBA 2017-02-28 GO_Central\n", + "BP A0A024RBG1 GO:1901911 IBA 2017-02-28 GO_Central\n", + "BP A0A024RBG1 GO:1901909 IBA 2017-02-28 GO_Central\n", + "MF A0A024RBG1 GO:0034432 IBA 2017-02-28 GO_Central\n", + "CC A0A024RBG1 GO:0005737 IBA 2017-02-28 GO_Central\n" + ] + } + ], + "source": [ + "fmtpat = '{NS} {DB_ID} {GO_ID} {Evidence_Code} {Date} {Assigned_By}'\n", + "for nt_line in nts[:10]:\n", + " print(fmtpat.format(**nt_line._asdict()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (C) 2010-present, DV Klopfenstein PhD, Haibao Tang. All rights reserved." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}