In [None]:
# Basic Python Notebook showing how you can quickly download and start working with a datafile and make simple plots.
# Bioinformatics 1 (2022-23) - Week 6 - Working with Biological Databases
# ian.simpson@ed.ac.uk

# Activity 2 - Protein-Protein Interactions & Pathways

#load in modules
import urllib as ul
import urllib.request  # `import urllib` alone does not load the `request` submodule that is used below as `ul.request`

import numpy as np
import pandas as pd

In [None]:
# Fetching KEGG pathway data
#
# The KEGG "list" endpoint returns plain tab-separated text (id <TAB> name) with
# no header line. We therefore pass header=None: with header=0 pandas treats the
# first pathway as a header row and silently drops it when names= is supplied.
# The with-block makes sure the HTTP response is closed once the table is read.
with ul.request.urlopen('http://rest.kegg.jp/list/pathway/hsa') as response:
    human_pathways = pd.read_csv(response,sep='\t',header=None,names=['kegg_id','pathway_name'])

# we're looking for "Dopaminergic Synapse"

human_pathways.head()

In [None]:
# Flag the rows whose pathway name begins with the text we are after
# (Series.str.match anchors the pattern at the start of each name).
is_dopaminergic = human_pathways['pathway_name'].str.match('Dopaminergic synapse')

# Keep only the KEGG identifiers of the flagged rows.
pathway_id = human_pathways.loc[is_dopaminergic, 'kegg_id']

print(pathway_id.values)

In [None]:

# pull the pathway entry from KEGG, note we are saving this to a file that we will use later
ul.request.urlretrieve('http://rest.kegg.jp/get/'+pathway_id.to_numpy()[0],'dop_synapse.txt')

# why not open this file and look at the contents. You will see the full pathway details including the gene names

# I wanted to show you some basic python parsing and a simple for loop with a conditional in to demonstrate how you can quickly build simple parsers. There are quicker ways to do this, but this is a good learning example.

# collect one [gene_id, description] record per gene line in a plain list and
# build the dataframe once at the end; growing a dataframe row-by-row is slow
# and DataFrame.append no longer exists in pandas 2.x
gene_records = []

# set a flag for our parser: 0 = not yet in the GENE section, 1 = inside it
flag = 0

# open the file; the with-block closes it for us, even if parsing fails
with open('dop_synapse.txt','r') as dop_file:
    # work through the text file one line at a time
    for line in dop_file:
        if line.startswith('GENE'):
            # find the start of the gene entries: the section keyword sits at the
            # very start of the line, and the first gene follows it on the same line
            flag = 1
            entry = line[len('GENE'):]
        elif flag == 0:
            # still in the sections before GENE, nothing to collect yet
            continue
        elif line[:1].isspace():
            # indented continuation line: another gene in the same section
            entry = line
        else:
            # a non-indented line starts the next section (COMPOUND for this
            # pathway), so we have reached the end of the genes; escape the file
            break

        # split into the id and the rest of the line; maxsplit=1 keeps any
        # spaces inside the description intact
        fields = entry.strip().split(None, 1)
        if not fields:
            # ignore blank lines
            continue
        gene_id = fields[0]
        description = fields[1] if len(fields) > 1 else ''
        gene_records.append([gene_id, description])

# build the dataframe and name the columns
dop_df = pd.DataFrame(gene_records, columns=['gene_id','description'])

# you now have the gene_ids (NCBI EntrezIDs for the genes in the pathway)
print('The Dopaminergic Synapse pathway has '+str(dop_df.shape[0])+' genes in it.\n')

# show the gene_ids
print(dop_df['gene_id'].to_numpy())

In [None]:
# preview the first five parsed genes (id and description)
dop_df.head(n=5)

In [None]:
# let's practice writing out a simple gene_id file

# write one gene_id per line; the with-block closes the file for us,
# even if one of the writes fails part-way through
with open('dop_geneids.txt','w') as f:
    for i in dop_df['gene_id']:
        f.write(i+'\n')

# now open this file and continue with the exercise.

# It is possible to download protein-protein interaction data from BioGRID and IntAct and, using the methods learned in this and the previous notebook, create a dataframe that you can merge with the gene_ids in this pathway file to extract all of the interactions between these proteins. It is also possible to write code to do this directly using an API provided by BioGRID, but this is beyond the scope of this course.

# you should now paste these gene_ids into either:
#   STRING - https://string-db.org/cgi/input?sessionId=bmMdkG6HNIwf&input_page_show_search=on
#   or
#   iRefWeb - http://wodaklab.org/iRefWeb/search/index
# to find the answers. I will show you how to do this in the video as well.