In [None]:
import os
import ipywidgets as widgets
from IPython.display import display

# PanCan TRIBE2 analysis

## Preliminary work
A baseline for pathway mutation averages needed to be established. In order to do this, a parser for files shipped with [PathwayMapper](http://www.pathwaymapper.org/) was hand-rolled along with a custom data structure describing the pathway contents and hierarchy. Here we show an example of a parsed pathway.

In [None]:
import pathways as lpw

# os.listdir returns entries in arbitrary order; sorting keeps the dropdown
# order (and its default selection) the same on every run.
@widgets.interact(pathway=sorted(os.listdir('./pathways')))
def show_pathway(pathway):
    """Parse the selected pathway file and print its name and contents.

    `pathway` is the name of a file inside ./pathways, chosen in the dropdown.
    NOTE: a later cell defines another function with this same name, so
    running that cell rebinds `show_pathway`.
    """
    # The parsed result is indexed as [0] = name, [1] = contents.
    pw = lpw.parse_pathway(os.path.join('./pathways', pathway))
    print(f"Name: {pw[0]}, Contents:\n{pw[1]}")

The average mutation is calculated for any given pathway and patient by only considering pathogenic mutations and the maximum mutation percentage detected. Complexes or families count as a single gene towards the overall average mutation (no weights applied).

The average mutation on each pathway for one example patient (CB224) follows:

In [None]:
import pandas
from analysis import calculate_patient_mutations

# os.listdir returns entries in arbitrary order; sorting keeps the pathway
# order (and the column order derived from it in later cells) reproducible.
pathways = [
    lpw.parse_pathway(os.path.join('./pathways', filename))
    for filename in sorted(os.listdir('./pathways'))
]

patients_log = pandas.read_csv('TRIBE2_db.csv')
mutations_data = pandas.read_csv('TRIBE2_seq_res.csv')

# Mutation result over all parsed pathways for a single, hard-coded patient.
result = calculate_patient_mutations('CB224', mutations_data, pathways)
print(result)

Patients have been split into two groups according to the treatment they received (arm0, arm1). This split is used for all subsequent analyses.
Here we show summary statistics of the mutations for the two groups of patients, and how the mutations for each pathway correlate with **dpfs**.

In [None]:
from analysis import process_patients

# PatientFirstName values of the rows whose 'arm' column equals 0.
arm0_patients = patients_log.loc[patients_log['arm'] == 0, 'PatientFirstName']
arm0_df = process_patients(arm0_patients)
# Kept as the last expression so the notebook displays the summary table.
arm0_df.describe()

In [None]:
pandas.set_option("display.precision", 11)
# Attach the patient log columns to the arm 0 frame, matching on PatientFirstName.
patients_by_name = patients_log.set_index('PatientFirstName')
arm0_db_df = arm0_df.join(patients_by_name, on='PatientFirstName')
# 'dpfs' is listed first, so row 0 of the correlation matrix holds the
# correlation of dpfs with itself and with every pathway column.
correlation_columns = ['dpfs', *(pw[0] for pw in pathways)]
print(arm0_db_df[correlation_columns].corr().iloc[0])

In [None]:
# PatientFirstName values of the rows whose 'arm' column equals 1.
arm1_patients = patients_log.loc[patients_log['arm'] == 1, 'PatientFirstName']
arm1_df = process_patients(arm1_patients)
# Kept as the last expression so the notebook displays the summary table.
arm1_df.describe()

In [None]:
pandas.set_option("display.precision", 11)
# Attach the patient log columns to the arm 1 frame, matching on PatientFirstName.
patients_by_name = patients_log.set_index('PatientFirstName')
arm1_db_df = arm1_df.join(patients_by_name, on='PatientFirstName')
# 'dpfs' is listed first, so row 0 of the correlation matrix holds the
# correlation of dpfs with itself and with every pathway column.
correlation_columns = ['dpfs', *(pw[0] for pw in pathways)]
print(arm1_db_df[correlation_columns].corr().iloc[0])

## Conversion of pathway data
Pathways are parsed from pathway files shipped with [PathwayMapper](http://www.pathwaymapper.org/). At this stage, the obtained data is transformed into a NetworkX graph giving each gene its own vertex: complexes and families are not represented explicitly. The resulting graph is directed.

In [None]:
import networkx as nx
import pathways_nx as pnx
import matplotlib.pyplot as plt
import pylab
import logging as log

plt.rcParams['figure.dpi'] = 90

# os.listdir returns entries in arbitrary order; sorting keeps the dropdown
# order (and its default selection) the same on every run.
@widgets.interact(pathway=sorted(os.listdir('./pathways')))
def show_pathway(pathway):
    """Draw the graph built from the selected pathway file.

    `pathway` is the name of a file inside ./pathways, chosen in the dropdown.
    Nodes and edges are annotated with their 'label' attributes.
    NOTE: an earlier cell defines another function with this same name;
    this definition rebinds `show_pathway`.
    """
    # pathway_to_nx's result is indexed with [1] to get the graph.
    graph = pnx.pathway_to_nx('pathways/' + pathway)[1]

    edge_labels = {(u, v): data['label'] for u, v, data in graph.edges(data=True)}
    labels = nx.get_node_attributes(graph, 'label')
    # k is the spring layout's optimal node distance (previously passed
    # positionally); the fixed seed makes a given pathway always render
    # with the same layout instead of a new random one on every redraw.
    pos = nx.spring_layout(graph, k=8, seed=0)

    plt.figure(1, figsize=(12, 12))
    nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels)
    nx.draw(
        graph,
        pos,
        node_size=1700,
        labels=labels,
        with_labels=True,
        node_shape="o",
        node_color="none",
        bbox=dict(facecolor="skyblue", edgecolor='black', boxstyle='round,pad=0.4'),
    )

## Computing weighted averages
In order to improve the correlation between **dpfs** and pathway mutations, we can apply a weight to each gene. These weights are derived from various centrality measures.

In [1]:
import pathways_nx as pnx
import networkx as nx
import os
import pandas

# os.listdir returns entries in arbitrary order; sorting makes positional
# lookups such as nx_pathways[0] refer to the same pathway on every run.
nx_pathways = [
    pnx.pathway_to_nx('pathways/' + filename)
    for filename in sorted(os.listdir('./pathways'))
]

patients_log = pandas.read_csv('TRIBE2_db.csv')
mutations_data = pandas.read_csv('TRIBE2_seq_res.csv')

In [2]:
from analysis_nx import process_patients_with_f

# PatientFirstName values of the rows whose 'arm' column equals 0.
arm0_patients = patients_log.loc[patients_log['arm'] == 0, 'PatientFirstName']
# In-degree centrality is passed as the function applied to each pathway graph.
# Kept as the last expression so the notebook displays the returned value.
process_patients_with_f(arm0_patients, nx.in_degree_centrality, nx_pathways, mutations_data)

APC            0.000000
WIF1           0.000000
RNF43          0.000000
CTNNB1         0.185185
WNT ligands    0.222222
GSK3B          0.111111
AMER1          0.000000
DKK1           0.000000
DKK2           0.000000
DKK4           0.000000
DKK3           0.000000
TLE1           0.000000
TLE4           0.000000
TLE2           0.000000
TLE3           0.000000
LRP5           0.222222
FZDs           0.222222
LRP6           0.222222
TCF7L1         0.148148
TCF7L2         0.148148
TCF7           0.148148
AXIN1          0.111111
AXIN2          0.111111
SFRP3          0.000000
SFRP1          0.000000
SFRP2          0.000000
SFRP5          0.000000
SFRP4          0.000000
dtype: float64 Biomarker
APC    74.0
Name: NGS_PercentMutated, dtype: float64 0.0 1.8518518518518516
PTPN11    0.000000
CBL       0.000000
NF1       0.000000
ERRFI1    0.000000
RAC1      0.000000
MAPK1     0.088235
RASA1     0.000000
SOS1      0.529412
NRAS      0.088235
HRAS      0.088235
KRAS      0.088235
RIT1      0.088235

Unnamed: 0,PatientFirstName,TGF-Beta,HIPPO,WNT,NRF2,MYC,RTK-RAS,TP53,NOTCH,PI3K,Cell Cycle
0,LS276,0.0000,0.0,0.00,0.0,0.0,1.068966,16.5,0.000000,0.000000,5.75
1,MB132,0.0000,0.0,0.00,0.0,0.0,0.000000,31.5,0.000000,6.833333,15.75
2,DC513,3.3125,0.0,0.00,0.0,0.0,1.482759,20.5,0.000000,7.291667,10.25
3,PZ544,0.0000,0.0,0.00,0.0,0.0,1.482759,0.0,0.000000,0.000000,0.00
4,SD664,0.0000,0.0,0.00,0.0,0.0,4.275862,0.0,2.622642,1.750000,0.00
...,...,...,...,...,...,...,...,...,...,...,...
157,SC071,0.0000,0.0,1.76,0.0,0.0,1.310345,16.0,0.000000,0.000000,8.00
158,SM650,5.0000,0.0,0.00,0.0,0.0,0.275862,0.0,0.000000,0.000000,0.00
159,SS036,7.2500,0.0,7.34,0.0,0.0,3.563218,14.0,4.113208,7.833333,14.25
160,ST285,15.0000,0.0,2.24,0.0,0.0,1.689655,0.0,2.716981,4.375000,0.00


In [None]:
# Inspect the in-degree centrality of the first loaded pathway graph.
first_graph = nx_pathways[0][1]
print(nx.in_degree_centrality(first_graph))

In [None]:
# Map each node of the first pathway graph to its 'label' attribute.
first_graph = nx_pathways[0][1]
pw_names = nx.get_node_attributes(first_graph, 'label')
print(pw_names.values())

In [None]:
# In-degree centrality of every node of the first pathway graph, keyed by node.
w = nx.in_degree_centrality(nx_pathways[0][1])
# The same weights re-keyed by node label; nodes sharing a label would
# collapse into a single entry (the last one wins).
d = {label: w[node] for node, label in pw_names.items()}
print(w, d, sum(d.values()))