In [None]:
from bioservices.kegg import KEGG
import numpy
import pandas
from statsmodels.stats import proportion
import pickle
import os
k = KEGG()
from scipy import stats

from plotly import offline as plotly
from plotly import graph_objects
from plotly.subplots import make_subplots
from statsmodels.stats import multitest

import matplotlib
from matplotlib import pyplot

from capblood_seq import common
from capblood_seq import resources

In [None]:
# Number of genes, counted from the first row of each gene table, that are
# placed in the "top" group (individual-specific / diurnal). All later rows
# form the comparison group.
NUM_TOP_GENES = 250

# Whether to include pathways that have 0 counts in one of the groups -
# diurnal, not diurnal, individual-specific, individual non-specific.
# If set to False, pathways that have 0 in any of the groups will
# be skipped
INCLUDE_ZEROS = False

In [None]:
# Read the per-gene diurnality and individuality tables from the local "data"
# directory. The first CSV column becomes the index; later cells read gene
# names from that index and rely on the row order of both tables.
gene_diurnality = pandas.read_csv(os.path.join("data", "gene_diurnality.csv"), index_col=0)
gene_individuality = pandas.read_csv(os.path.join("data", "gene_individuality.csv"), index_col=0)

In [None]:
# Genes to analyse: the index of the diurnality table, in its row order.
gene_list = gene_diurnality.index.values

In [None]:
# Load a previously saved gene -> pathways lookup that ships as a package
# resource. The file is opened in a `with` block so the handle is closed as
# soon as the pickle has been read.
# NOTE(review): unpickling can execute arbitrary code, so this file must only
# ever come from a trusted source.
with open(
    common.get_resource_path(resources, "gene_pathway_dict.pickle"), "rb"
) as gene_pathway_file:
    gene_pathway_dict = pickle.load(gene_pathway_file)

# Ask KEGG (organism code "hsa") only about genes that are not cached yet.
# The stored value is whatever the KEGG client returns; later cells treat a
# value of None as "no pathway annotation".
for gene in gene_list:
    if gene not in gene_pathway_dict:
        gene_pathway_dict[gene] = k.get_pathway_by_gene(gene, "hsa")

In [None]:
# Table of pathway classes: the index holds the pathway name, the first data
# column its top-level class and the second data column its second-level class.
pathway_class_df = pandas.read_csv(
    common.get_resource_path(resources, "pathway_classes.csv"), index_col=0, header=0
)

# Columns are selected by position with .iloc. The previous row[1][0] /
# row[1][1] lookups passed integer keys to a label-indexed Series, which
# pandas 2.x deprecates as a positional access.
KEGG_top_level_classes = pathway_class_df.iloc[:, 0].to_dict()
KEGG_second_level_classes = pathway_class_df.iloc[:, 1].to_dict()

In [None]:
# Coarse pathway classes used for colouring and for the large markers in the
# plot. They are written here in one order and stored reversed.
pathway_class_list = list(reversed([
    "Diseases – Immune",
    "Immune system",
    "Diseases – Infectious",
    "Signaling",
    "Diseases – Other",
    "Cellular Processes",
    "Organismal Systems",
    "Genetic Information Processing",
    "Metabolism"
]))

# Maps the first tab-separated column of pathway_class_labels.tsv to the
# second one. Later cells look the keys up with KEGG second-level class names.
pathway_class_labels = {}

with open(common.get_resource_path(resources, "pathway_class_labels.tsv"), "r") as pathway_class_labels_file:

    # The first line is discarded (treated as a header row).
    pathway_class_labels_file.readline()

    for raw_line in pathway_class_labels_file:
        fields = raw_line.strip().split("\t")
        pathway_class_labels[fields[0]] = fields[1]

In [None]:
# All pathways that have a class assignment.
pathway_set = set(KEGG_second_level_classes)

# pathway -> group label, at four granularities:
#   0: coarse class label (second-level class translated via pathway_class_labels)
#   1: KEGG top-level class
#   2: KEGG second-level class
#   3: the pathway itself
pathway_groupings_by_level = {}
pathway_groupings_by_level[0] = {
    member: pathway_class_labels[KEGG_second_level_classes[member]]
    for member in pathway_set
}
pathway_groupings_by_level[1] = KEGG_top_level_classes
pathway_groupings_by_level[2] = KEGG_second_level_classes
pathway_groupings_by_level[3] = dict(zip(pathway_set, pathway_set))

In [None]:
# Fractional gene counts per pathway label, stored as counts[level][label].
# A gene annotated with several pathways contributes 1/len(pathways) to the
# label of each of them, so an annotated gene adds a total weight of 1 per
# level.
individual_specific_pathway_counts = {}
individual_nonspecific_pathway_counts = {}

diurnal_pathway_counts = {}
nondiurnal_pathway_counts = {}

pathway_num_genes = {}

# Start every label of every level at zero so that labels that receive no
# gene are still present in the tables.
for level, grouping in pathway_groupings_by_level.items():

    level_labels = set(grouping.values())

    individual_specific_pathway_counts[level] = dict.fromkeys(level_labels, 0)
    individual_nonspecific_pathway_counts[level] = dict.fromkeys(level_labels, 0)
    diurnal_pathway_counts[level] = dict.fromkeys(level_labels, 0)
    nondiurnal_pathway_counts[level] = dict.fromkeys(level_labels, 0)
    pathway_num_genes[level] = dict.fromkeys(level_labels, 0)


def _gene_label_weights(gene):
    """Yield ``(level, label, weight)`` for each pathway of ``gene`` at each level.

    ``weight`` is ``1 / len(pathways)`` for the gene's pathways in
    ``gene_pathway_dict``. Genes whose entry is ``None`` (or empty) yield
    nothing.

    Raises:
        KeyError: if ``gene`` is missing from ``gene_pathway_dict`` or one of
            its pathways is missing from a grouping in
            ``pathway_groupings_by_level``.
    """
    pathways = gene_pathway_dict[gene]

    if not pathways:
        return

    weight = 1.0 / len(pathways)

    for pathway in pathways.values():
        for level, grouping in pathway_groupings_by_level.items():
            yield level, grouping[pathway], weight


def _tally_ranked_genes(ranked_genes, top_counts, rest_counts):
    """Add the pathway weights of each gene to one of two count tables.

    Genes are numbered from 1 in iteration order; genes without pathways still
    consume a rank. The first ``NUM_TOP_GENES`` genes are added to
    ``top_counts``, all later ones to ``rest_counts``. Both tables are
    modified in place and are indexed as ``counts[level][label]``.
    """
    for rank, gene in enumerate(ranked_genes, start=1):

        target_counts = top_counts if rank <= NUM_TOP_GENES else rest_counts

        for level, label, weight in _gene_label_weights(gene):
            # Diagnostic output carried over unchanged: lists every gene (with
            # its rank) that contributes to the "Diseases – Immune" label.
            if label == "Diseases – Immune":
                print(gene, rank)
            target_counts[level][label] += weight


# Weighted number of genes per label over the whole gene list.
for gene in gene_list:
    for level, label, weight in _gene_label_weights(gene):
        pathway_num_genes[level][label] += weight

# Split each table by row order into top / rest and tally both groups.
# NOTE(review): this presumes both tables are already sorted with the most
# individual-specific / most diurnal genes first - confirm against the step
# that writes the CSV files.
_tally_ranked_genes(
    gene_individuality.index,
    individual_specific_pathway_counts,
    individual_nonspecific_pathway_counts,
)
_tally_ranked_genes(
    gene_diurnality.index,
    diurnal_pathway_counts,
    nondiurnal_pathway_counts,
)

In [None]:
# Result containers keyed by hierarchy level. Within a level all containers
# are filled in parallel: position i always refers to pathway_list[level][i].
pathway_list = {level: [] for level in pathway_groupings_by_level}
pathway_individual_enrichments = {level: [] for level in pathway_groupings_by_level}
pathway_diurnal_enrichments = {level: [] for level in pathway_groupings_by_level}
pathway_num_genes_list = {level: [] for level in pathway_groupings_by_level}
pathway_individual_ps = {level: [] for level in pathway_groupings_by_level}
pathway_diurnal_ps = {level: [] for level in pathway_groupings_by_level}

individual_p = 0
diurnal_p = 0
num_skipped = 0
num_kept = 0

for level, grouping in pathway_groupings_by_level.items():

    top_individual = individual_specific_pathway_counts[level]
    rest_individual = individual_nonspecific_pathway_counts[level]
    top_diurnal = diurnal_pathway_counts[level]
    rest_diurnal = nondiurnal_pathway_counts[level]

    # Group totals over all labels of this level: the "number of
    # observations" passed to each two-sample proportion z-test.
    individual_totals = [sum(top_individual.values()), sum(rest_individual.values())]
    diurnal_totals = [sum(top_diurnal.values()), sum(rest_diurnal.values())]

    level_pathways = set(grouping.values())

    for pathway in level_pathways:

        group_counts = (
            top_individual[pathway],
            rest_individual[pathway],
            top_diurnal[pathway],
            rest_diurnal[pathway],
        )

        # Unless zeros are explicitly allowed, drop labels that are empty in
        # any of the four groups.
        if not INCLUDE_ZEROS and 0 in group_counts:
            continue

        # z statistic and p-value for the label's share: top vs. remaining genes.
        individual_enrichment, individual_p = proportion.proportions_ztest(
            [group_counts[0], group_counts[1]],
            individual_totals
        )
        diurnal_enrichment, diurnal_p = proportion.proportions_ztest(
            [group_counts[2], group_counts[3]],
            diurnal_totals
        )

        pathway_list[level].append(pathway)
        pathway_num_genes_list[level].append(pathway_num_genes[level][pathway])

        pathway_individual_enrichments[level].append(individual_enrichment)
        pathway_individual_ps[level].append(individual_p)
        pathway_diurnal_enrichments[level].append(diurnal_enrichment)
        pathway_diurnal_ps[level].append(diurnal_p)

    # Statistics become arrays so later cells can use boolean masks on them;
    # pathway_list and pathway_num_genes_list stay plain lists.
    for results in (
        pathway_individual_enrichments,
        pathway_diurnal_enrichments,
        pathway_individual_ps,
        pathway_diurnal_ps,
    ):
        results[level] = numpy.array(results[level])

In [None]:
# Hierarchy level drawn as the small markers: 2 = KEGG second-level classes,
# 3 = individual pathways. The large markers are always the level-0 classes.
PATHWAY_LEVEL = 3

if PATHWAY_LEVEL not in (2, 3):
    raise ValueError("PATHWAY_LEVEL must be 2 or 3, got %r" % (PATHWAY_LEVEL,))

pathway_class_colors = {}

# pyplot.get_cmap is used because matplotlib.cm.get_cmap (reached before via
# pyplot.cm.get_cmap) was removed in matplotlib 3.9.
cmap = pyplot.get_cmap("inferno")
color_list = cmap(numpy.linspace(0, 1, len(set(pathway_class_labels.values()))))

for class_index, class_label in enumerate(pathway_class_list):
    red, green, blue, alpha = color_list[class_index]
    # Matplotlib returns all channels in [0, 1], while CSS-style rgba()
    # strings as consumed by Plotly use 0-255 for red/green/blue and 0-1 for
    # alpha. Writing the raw fractions made every class render almost black.
    pathway_class_colors[class_label] = "rgba(%d, %d, %d, %.2f)" % (
        round(red * 255), round(green * 255), round(blue * 255), alpha
    )

# Marker sizes grow with the weighted gene count of the plotted level. The
# offset of 25 compresses the range so small groups stay visible; the largest
# group gets size 25.
level_num_genes = numpy.array(pathway_num_genes_list[PATHWAY_LEVEL])
sizes = (level_num_genes + 25) / (level_num_genes.max() + 25) * 25


def _padded_extent(first_values, second_values, padding=0.1):
    """Return (low, high) spanning both arrays, widened by ``padding`` of the span on each side."""
    low = min(first_values.min(), second_values.min())
    high = max(first_values.max(), second_values.max())
    margin = (high - low) * padding
    return low - margin, high + margin


# Extents of the two axis lines; they cover the plotted level and the class
# level (previously the plotted level was hard-coded to 3).
min_x, max_x = _padded_extent(
    pathway_individual_enrichments[PATHWAY_LEVEL], pathway_individual_enrichments[0]
)
min_y, max_y = _padded_extent(
    pathway_diurnal_enrichments[PATHWAY_LEVEL], pathway_diurnal_enrichments[0]
)

# Lookup from a plotted group to its coarse class: pathways (level 3) go
# through the level-0 grouping, second-level classes (level 2) through the
# label table.
if PATHWAY_LEVEL == 3:
    class_of_group = pathway_groupings_by_level[0]
else:
    class_of_group = pathway_class_labels

plotted_groups = numpy.array(pathway_list[PATHWAY_LEVEL])

data = []

for class_index, class_label in enumerate(pathway_class_list):

    # First the class itself - level 0 of the hierarchy - as one large marker.
    # Raises ValueError if the class was filtered out of pathway_list[0].
    class_position = pathway_list[0].index(class_label)
    text_labels = [class_label]
    size = [50]
    x = [pathway_individual_enrichments[0][class_position]]
    y = [pathway_diurnal_enrichments[0][class_position]]

    # Then every plotted group that belongs to this class. An explicit boolean
    # array keeps the mask valid even when no group is plotted at all.
    class_label_pathways = numpy.array(
        [class_of_group[group] == class_label for group in pathway_list[PATHWAY_LEVEL]],
        dtype=bool
    )

    x.extend(pathway_individual_enrichments[PATHWAY_LEVEL][class_label_pathways])
    y.extend(pathway_diurnal_enrichments[PATHWAY_LEVEL][class_label_pathways])
    size.extend(sizes[class_label_pathways])
    text_labels.extend(plotted_groups[class_label_pathways])

    data.append(
        graph_objects.Scatter(
            x=x,
            y=y,
            mode="markers",
            text=text_labels,
            marker={
                "size": size,
                "line": {
                    "width": 1,
                    "color": "black"
                },
                "color": pathway_class_colors[class_label]
            },
            opacity=0.8,
            name=class_label
        )
    )

layout = graph_objects.Layout(
    title= "Pathway Enrichments",
    showlegend=True,
    autosize=False,
    width=850,
    height=850,
    xaxis=dict(
        title="Individual Enrichment"),
    yaxis=dict(
        title="Diurnal Enrichment"
    ),
    margin=dict(
        l=20,
        t=100),
    hovermode="closest",
    plot_bgcolor="rgba(255, 255, 255, 0)",
    paper_bgcolor="rgba(255, 255, 255, 0)",
    legend={
        "itemsizing":"constant"
    }
)

figure = graph_objects.Figure(data=data, layout=layout)

# Black lines through the origin: a vertical one at x = 0 and a horizontal
# one at y = 0, each spanning the padded data extent.
axis_lines = [
    dict(x0=0, y0=min_y, x1=0, y1=max_y),
    dict(x0=min_x, y0=0, x1=max_x, y1=0),
]

for endpoints in axis_lines:
    figure.add_shape(
        dict(
            type="line",
            line=dict(
                color="black",
                width=2
            ),
            **endpoints
        )
    )

plotly.iplot(figure)

In [None]:
# Pool the level-0 p-values of both comparisons (individual first, diurnal
# second) and adjust them together with the "fdr_bh" method.
all_ps = numpy.concatenate([pathway_individual_ps[0], pathway_diurnal_ps[0]])
rejected, corrected_p_values, _, _ = multitest.multipletests(pvals=all_ps, method="fdr_bh")

In [None]:
# Export the figure as interactive HTML and as a static SVG. The output
# directory is created first so that a fresh checkout without a "figures"
# folder does not fail on the first write.
figures_dir = "figures"
os.makedirs(figures_dir, exist_ok=True)

figure.write_html(os.path.join(figures_dir, "diurnal_vs_individual_pathway_enrichments.html"))
# NOTE(review): static image export needs a Plotly image engine (e.g. kaleido)
# installed in the environment - confirm it is listed in the requirements.
figure.write_image(os.path.join(figures_dir, "diurnal_vs_individual_pathway_enrichments.svg"))

In [None]:
# Table of adjusted p-values per level-0 class. The pooled vector holds the
# individual p-values first and the diurnal ones second, so it is split at
# the number of classes and the halves become the two columns.
num_classes = len(pathway_list[0])

pandas.DataFrame(
    numpy.column_stack((
        corrected_p_values[:num_classes],
        corrected_p_values[num_classes:]
    )),
    index=numpy.array(pathway_list[0]),
    columns=["Individual", "Diurnal"]
)