## KEGG Pathways Gene Selection

### Connecting to KEGG Pathways Database with BioPython 

In [3]:
from Bio.KEGG import REST
import pandas as pd
# Ask KEGG for the list of all human ("hsa") pathways and keep the raw text reply.
kegg_listing = REST.kegg_list("pathway", "hsa")
human_pathways = kegg_listing.read()
human_pathways

'path:hsa00010\tGlycolysis / Gluconeogenesis - Homo sapiens (human)\npath:hsa00020\tCitrate cycle (TCA cycle) - Homo sapiens (human)\npath:hsa00030\tPentose phosphate pathway - Homo sapiens (human)\npath:hsa00040\tPentose and glucuronate interconversions - Homo sapiens (human)\npath:hsa00051\tFructose and mannose metabolism - Homo sapiens (human)\npath:hsa00052\tGalactose metabolism - Homo sapiens (human)\npath:hsa00053\tAscorbate and aldarate metabolism - Homo sapiens (human)\npath:hsa00061\tFatty acid biosynthesis - Homo sapiens (human)\npath:hsa00062\tFatty acid elongation - Homo sapiens (human)\npath:hsa00071\tFatty acid degradation - Homo sapiens (human)\npath:hsa00072\tSynthesis and degradation of ketone bodies - Homo sapiens (human)\npath:hsa00100\tSteroid biosynthesis - Homo sapiens (human)\npath:hsa00120\tPrimary bile acid biosynthesis - Homo sapiens (human)\npath:hsa00130\tUbiquinone and other terpenoid-quinone biosynthesis - Homo sapiens (human)\npath:hsa00140\tSteroid hormo

The following code extracts each human (hsa) pathway identifier from the text output above and stores the identifiers in a list, `pathways`.

In [7]:
# Collect the identifier (first tab-separated field) of every human pathway.
# No description filter is applied here, so every listed pathway is kept.
pathways = []
for record in human_pathways.rstrip().split("\n"):
    # Each record is "<identifier>\t<description>".
    entry, description = record.split("\t")
    pathways += [entry]

Creates a dataframe with one row per pathway and 1000 placeholder gene columns (`gene1` … `gene1000`) initialised to 0. In the following steps, each pathway's row is filled with the genes, present in our gene expression matrix, that are active in that pathway.

In [8]:
# Build the pathway x gene-slot table in one step.
# Creating all placeholder columns at once avoids inserting 1000 columns one
# by one, which fragments the frame and is slow.
N_GENE_COLUMNS = 1000  # number of gene slots available per pathway

se = pd.Series(pathways)
list_name = ['gene' + str(i + 1) for i in range(N_GENE_COLUMNS)]

# Placeholders are the integer 0 stored in object-dtype columns: the slots are
# later overwritten with string gene IDs, and writing strings into int64
# columns depends on implicit upcasting that newer pandas versions reject.
df = pd.DataFrame(0, index=se.index, columns=list_name, dtype=object)
df.insert(0, 'pathways', se.values)
df.head(1)

Unnamed: 0,pathways,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,...,gene991,gene992,gene993,gene994,gene995,gene996,gene997,gene998,gene999,gene1000
0,path:hsa00010,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


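Defines a function that fills each pathway's row with its active genes. Genes are stored by their numeric KEGG gene ID (the first field of each GENE entry), not by HGNC symbol.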

In [9]:
def fill_gene(df, genes_total, start):
    """Write each gene list into its own row of ``df``.

    Row ``start + j`` receives ``genes_total[j]``. Values go into consecutive
    columns starting at positional column 1; column 0 is never written.
    ``df`` is modified in place and also returned.

    Args:
        df: DataFrame to fill.
        genes_total: sequence of gene lists, one per consecutive row.
        start: positional index of the row that receives ``genes_total[0]``.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        IndexError: if a non-empty gene list targets a row that does not
            exist, or is longer than the number of columns after column 0.
            Every list is checked before anything is written, so a failed
            call leaves ``df`` unchanged.
    """
    n_rows, n_cols = df.shape
    capacity = max(n_cols - 1, 0)  # columns available after column 0

    # Validate everything first so an oversized list cannot leave the frame
    # half-filled.
    for j, genes in enumerate(genes_total):
        if len(genes) == 0:
            continue  # nothing to write for this row
        row = start + j
        if not -n_rows <= row < n_rows:
            raise IndexError(
                "row %d is out of bounds for a frame with %d rows" % (row, n_rows)
            )
        if len(genes) > capacity:
            raise IndexError(
                "row %d has %d genes but only %d gene columns are available"
                % (row, len(genes), capacity)
            )

    # One slice assignment per row instead of one scalar write per cell.
    for j, genes in enumerate(genes_total):
        if len(genes):
            df.iloc[start + j, 1:1 + len(genes)] = list(genes)
    return df

In [11]:
# Read the GENE section of pathways[0:10] from KEGG and fill df rows 0 to 9.
first_row, stop_row = 0, 10
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)
df.head(1)

Unnamed: 0,pathways,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,...,gene991,gene992,gene993,gene994,gene995,gene996,gene997,gene998,gene999,gene1000
0,path:hsa00010,3101,3098,3099,80201,2645,2821,5213,5214,5211,...,0,0,0,0,0,0,0,0,0,0


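The following cells fill the remaining pathways with their respective genes, one batch of rows per cell. Note that a few pathway indices (for example 17 and 26) are not covered by any batch.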

In [12]:
# Read the GENE section of pathways[10:17] from KEGG and fill df rows 10 to 16.
# NOTE(review): pathway index 17 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 10, 17
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [13]:
# Read the GENE section of pathways[18:25] from KEGG and fill df rows 18 to 24.
first_row, stop_row = 18, 25
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [14]:
# Read the GENE section of pathways[25] from KEGG and fill df row 25.
# NOTE(review): pathway index 26 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 25, 26
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)


In [15]:
# Read the GENE section of pathways[27:37] from KEGG and fill df rows 27 to 36.
first_row, stop_row = 27, 37
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [16]:
# Read the GENE section of pathways[37:47] from KEGG and fill df rows 37 to 46.
first_row, stop_row = 37, 47
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [17]:
# Read the GENE section of pathways[47:60] from KEGG and fill df rows 47 to 59.
first_row, stop_row = 47, 60
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [18]:
# Read the GENE section of pathways[60:65] from KEGG and fill df rows 60 to 64.
first_row, stop_row = 60, 65
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)


In [19]:
# Read the GENE section of pathways[65:68] from KEGG and fill df rows 65 to 67.
# NOTE(review): pathway index 68 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 65, 68
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [20]:
# Read the GENE section of pathways[69:80] from KEGG and fill df rows 69 to 79.
first_row, stop_row = 69, 80
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [21]:
# Read the GENE section of pathways[80:82] from KEGG and fill df rows 80 to 81.
# NOTE(review): pathway indices 82 and 83 are not covered by this cell or the
# next one; confirm they are skipped on purpose.
first_row, stop_row = 80, 82
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [22]:
# Read the GENE section of pathways[84:90] from KEGG and fill df rows 84 to 89.
first_row, stop_row = 84, 90
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [23]:
# Read the GENE section of pathways[90:96] from KEGG and fill df rows 90 to 95.
# NOTE(review): pathway index 96 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 90, 96
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [24]:
# Read the GENE section of pathways[97:107] from KEGG and fill df rows 97 to 106.
first_row, stop_row = 97, 107
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [25]:
# Read the GENE section of pathways[107:117] from KEGG and fill df rows 107 to 116.
first_row, stop_row = 107, 117
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [26]:
# Read the GENE section of pathways[117:130] from KEGG and fill df rows 117 to 129.
first_row, stop_row = 117, 130
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [27]:
# Read the GENE section of pathways[130:140] from KEGG and fill df rows 130 to 139.
first_row, stop_row = 130, 140
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [28]:
# Read the GENE section of pathways[140:144] from KEGG and fill df rows 140 to 143.
# NOTE(review): pathway index 144 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 140, 144
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [29]:
# Read the GENE section of pathways[145:150] from KEGG and fill df rows 145 to 149.
first_row, stop_row = 145, 150
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [30]:
# Read the GENE section of pathways[150:160] from KEGG and fill df rows 150 to 159.
first_row, stop_row = 150, 160
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [31]:
# Read the GENE section of pathways[160:164] from KEGG and fill df rows 160 to 163.
# NOTE(review): pathway index 164 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 160, 164
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [32]:
# Read the GENE section of pathways[165:169] from KEGG and fill df rows 165 to 168.
# NOTE(review): pathway index 169 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 165, 169
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [33]:
# Read the GENE section of pathways[170:180] from KEGG and fill df rows 170 to 179.
first_row, stop_row = 170, 180
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [34]:
# Read the GENE section of pathways[180:190] from KEGG and fill df rows 180 to 189.
first_row, stop_row = 180, 190
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [35]:
# Read the GENE section of pathways[190:195] from KEGG and fill df rows 190 to 194.
# NOTE(review): pathway index 195 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 190, 195
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [36]:
# Read the GENE section of pathways[196:200] from KEGG and fill df rows 196 to 199.
first_row, stop_row = 196, 200
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [37]:
# Read the GENE section of pathways[200:205] from KEGG and fill df rows 200 to 204.
# NOTE(review): pathway index 205 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 200, 205
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [38]:
# Read the GENE section of pathways[206:209] from KEGG and fill df rows 206 to 208.
# NOTE(review): pathway index 209 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 206, 209
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [39]:
# Read the GENE section of pathways[210:215] from KEGG and fill df rows 210 to 214.
# NOTE(review): pathway index 215 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 210, 215
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [40]:
# Read the GENE section of pathways[216:219] from KEGG and fill df rows 216 to 218.
# NOTE(review): pathway index 219 is not covered by this cell or the next one;
# confirm it is skipped on purpose.
first_row, stop_row = 216, 219
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [41]:
# Read the GENE section of pathways[220:230] from KEGG and fill df rows 220 to 229.
first_row, stop_row = 220, 230
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [42]:
# Read the GENE section of pathways[230:240] from KEGG and fill df rows 230 to 239.
first_row, stop_row = 230, 240
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [43]:
# Read the GENE section of pathways[240:250] from KEGG and fill df rows 240 to 249.
first_row, stop_row = 240, 250
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [44]:
# Read the GENE section of pathways[250:260] from KEGG and fill df rows 250 to 259.
first_row, stop_row = 250, 260
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [45]:
# Read the GENE section of pathways[260:270] from KEGG and fill df rows 260 to 269.
first_row, stop_row = 260, 270
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [46]:
# Read the GENE section of pathways[270:280] from KEGG and fill df rows 270 to 279.
first_row, stop_row = 270, 280
genes_total = []
for i in range(first_row, stop_row):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    current_section = None  # section keyword that applies to the current line
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section:
            current_section = section
        if current_section != "GENE":
            continue
        # The gene ID is the first token before "; "; reading it by position
        # tolerates entries that are not exactly "<id> <symbol>".
        fields = line[12:].split("; ")[0].split()
        # De-duplicate on the ID itself, since IDs are what the list stores.
        if fields and fields[0] not in genes:
            genes.append(fields[0])
    genes_total.append(genes)

df = fill_gene(df, genes_total, first_row)

In [47]:
# Collect the gene IDs listed in the GENE section of pathways[280]..pathways[289]
# and pass them to fill_gene (defined elsewhere in this notebook) together with
# the index of the first pathway in the batch.
batch_start, batch_stop = 280, 290  # single source of truth for range and offset
genes_total = []
for i in range(batch_start, batch_stop):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section != "":
            current_section = section

        if current_section == "GENE":
            # The text before the first "; " is expected to be "<gene id> <symbol>".
            # Only the leading token (the ID) is kept, so entries with a missing
            # or multi-word symbol no longer raise ValueError on unpacking.
            fields = line[12:].split("; ")[0].split()
            if not fields:
                continue  # nothing after the section column on this line
            gene_id = fields[0]

            # De-duplicate on the value actually stored (the ID); the previous
            # check compared the symbol against a list of IDs and never matched.
            if gene_id not in genes:
                genes.append(gene_id)
    genes_total.append(genes)

df = fill_gene(df, genes_total, batch_start)

In [48]:
# Collect the gene IDs listed in the GENE section of pathways[290]..pathways[299]
# and pass them to fill_gene (defined elsewhere in this notebook) together with
# the index of the first pathway in the batch.
batch_start, batch_stop = 290, 300  # single source of truth for range and offset
genes_total = []
for i in range(batch_start, batch_stop):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section != "":
            current_section = section

        if current_section == "GENE":
            # The text before the first "; " is expected to be "<gene id> <symbol>".
            # Only the leading token (the ID) is kept, so entries with a missing
            # or multi-word symbol no longer raise ValueError on unpacking.
            fields = line[12:].split("; ")[0].split()
            if not fields:
                continue  # nothing after the section column on this line
            gene_id = fields[0]

            # De-duplicate on the value actually stored (the ID); the previous
            # check compared the symbol against a list of IDs and never matched.
            if gene_id not in genes:
                genes.append(gene_id)
    genes_total.append(genes)

df = fill_gene(df, genes_total, batch_start)

In [49]:
# Collect the gene IDs listed in the GENE section of pathways[300]..pathways[309]
# and pass them to fill_gene (defined elsewhere in this notebook) together with
# the index of the first pathway in the batch.
batch_start, batch_stop = 300, 310  # single source of truth for range and offset
genes_total = []
for i in range(batch_start, batch_stop):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section != "":
            current_section = section

        if current_section == "GENE":
            # The text before the first "; " is expected to be "<gene id> <symbol>".
            # Only the leading token (the ID) is kept, so entries with a missing
            # or multi-word symbol no longer raise ValueError on unpacking.
            fields = line[12:].split("; ")[0].split()
            if not fields:
                continue  # nothing after the section column on this line
            gene_id = fields[0]

            # De-duplicate on the value actually stored (the ID); the previous
            # check compared the symbol against a list of IDs and never matched.
            if gene_id not in genes:
                genes.append(gene_id)
    genes_total.append(genes)

df = fill_gene(df, genes_total, batch_start)

In [50]:
# Collect the gene IDs listed in the GENE section of pathways[310]..pathways[319]
# and pass them to fill_gene (defined elsewhere in this notebook) together with
# the index of the first pathway in the batch.
batch_start, batch_stop = 310, 320  # single source of truth for range and offset
genes_total = []
for i in range(batch_start, batch_stop):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section != "":
            current_section = section

        if current_section == "GENE":
            # The text before the first "; " is expected to be "<gene id> <symbol>".
            # Only the leading token (the ID) is kept, so entries with a missing
            # or multi-word symbol no longer raise ValueError on unpacking.
            fields = line[12:].split("; ")[0].split()
            if not fields:
                continue  # nothing after the section column on this line
            gene_id = fields[0]

            # De-duplicate on the value actually stored (the ID); the previous
            # check compared the symbol against a list of IDs and never matched.
            if gene_id not in genes:
                genes.append(gene_id)
    genes_total.append(genes)

df = fill_gene(df, genes_total, batch_start)

In [51]:
# Collect the gene IDs listed in the GENE section of pathways[320]..pathways[329]
# and pass them to fill_gene (defined elsewhere in this notebook) together with
# the index of the first pathway in the batch.
batch_start, batch_stop = 320, 330  # single source of truth for range and offset
genes_total = []
for i in range(batch_start, batch_stop):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section != "":
            current_section = section

        if current_section == "GENE":
            # The text before the first "; " is expected to be "<gene id> <symbol>".
            # Only the leading token (the ID) is kept, so entries with a missing
            # or multi-word symbol no longer raise ValueError on unpacking.
            fields = line[12:].split("; ")[0].split()
            if not fields:
                continue  # nothing after the section column on this line
            gene_id = fields[0]

            # De-duplicate on the value actually stored (the ID); the previous
            # check compared the symbol against a list of IDs and never matched.
            if gene_id not in genes:
                genes.append(gene_id)
    genes_total.append(genes)

df = fill_gene(df, genes_total, batch_start)

In [52]:
# Collect the gene IDs listed in the GENE section of pathways[86]..pathways[94]
# and pass them to fill_gene (defined elsewhere in this notebook) together with
# the index of the first pathway in the batch.
# NOTE(review): this batch holds 9 pathways and runs after the 320-329 batch;
# presumably it back-fills a range skipped earlier -- confirm the bounds.
batch_start, batch_stop = 86, 95  # single source of truth for range and offset
genes_total = []
for i in range(batch_start, batch_stop):
    genes = []
    pathway_file = REST.kegg_get(pathways[i]).read()  # query and read each pathway
    # iterate through each KEGG pathway file, keeping track of which section
    # of the file we're in, only read the gene in each pathway
    current_section = None
    for line in pathway_file.rstrip().split("\n"):
        section = line[:12].strip()  # section names are within 12 columns
        if section != "":
            current_section = section

        if current_section == "GENE":
            # The text before the first "; " is expected to be "<gene id> <symbol>".
            # Only the leading token (the ID) is kept, so entries with a missing
            # or multi-word symbol no longer raise ValueError on unpacking.
            fields = line[12:].split("; ")[0].split()
            if not fields:
                continue  # nothing after the section column on this line
            gene_id = fields[0]

            # De-duplicate on the value actually stored (the ID); the previous
            # check compared the symbol against a list of IDs and never matched.
            if gene_id not in genes:
                genes.append(gene_id)
    genes_total.append(genes)

df = fill_gene(df, genes_total, batch_start)

In [53]:
# Persist the pathway/gene-ID table built above to a CSV file; the path is
# relative, so the file lands in the notebook's current working directory.
df.to_csv('df-geneid1.csv')