In [1]:
import os
import ipywidgets as widgets
from IPython.display import display

# Fail fast: later cells list and parse files from ./pathways, so the path must
# exist *and* be a directory. A stray regular file named "pathways" would pass
# os.path.exists() and only blow up later inside os.listdir().
if not os.path.isdir("pathways"):
    raise RuntimeError("pathways folder missing, analysis not possible!")

# PanCan TRIBE2 analysis

## Preliminary work
A baseline for pathway mutation averages needed to be established. In order to do this, a parser for files shipped with [PathwayMapper](http://www.pathwaymapper.org/) was hand-rolled along with a custom data structure describing the pathway contents and hierarchy. Here we show an example of a parsed pathway.

In [2]:
import pathways as lpw


# sorted(): os.listdir() returns entries in arbitrary order, which would make the
# dropdown order (and therefore its initial selection) differ between machines.
@widgets.interact(pathway=sorted(os.listdir("./pathways")))
def show_pathway(pathway):
    """Parse the selected pathway file and print its name and contents.

    Parameters
    ----------
    pathway : str
        File name inside ``./pathways``, chosen through the dropdown widget.

    The parsed result is indexed positionally: element 0 is printed as the
    name and element 1 as the contents.
    """
    pw = lpw.parse_pathway(os.path.join("./pathways", pathway))
    print(f"Name: {pw[0]}, Contents:\n{pw[1]}")

interactive(children=(Dropdown(description='pathway', options=('TGF-Beta.txt', 'HIPPO.txt', 'WNT.txt', 'NRF2.t…

The average mutation is calculated for any given pathway and patient by only considering pathogenic mutations and the maximum mutation percentage detected. Complexes or families count as a single gene towards the overall average mutation (no weights applied).

The average mutation on all pathways for a random patient follows:

In [3]:
import pandas
from analysis import calculate_patient_mutations

# Parse every file in ./pathways and order the results by element 0 of each
# parsed pathway (the value shown as "Name" in the previous cell).
pathways = sorted(
    (lpw.parse_pathway("./pathways/" + entry) for entry in os.listdir("./pathways")),
    key=lambda parsed: parsed[0],
)

# Per-patient table (provides the arm / dpfs columns used below) and the
# mutation table handed to the scoring functions.
patients_log = pandas.read_csv("TRIBE2_db.csv")
mutations_data = pandas.read_csv("TRIBE2_seq_res.csv")

# Per-pathway score for one example patient.
result = calculate_patient_mutations("CB224", mutations_data, pathways)
print(result)

{'Cell Cycle': 2.5, 'HIPPO': 0.0, 'MYC': 0.0, 'NOTCH': 0.0, 'NRF2': 0.0, 'PI3K': 0.0, 'RTK-RAS': 3.4166666666666665, 'TGF-Beta': 0.0, 'TP53': 4.166666666666667, 'WNT': 2.3076923076923075}


Patients have been split into two groups according to the treatment they received (arm0, arm1). This split holds for all subsequent analyses.
Here we show mutation statistics for the two groups of patients and how the mutations in each pathway correlate with **dpfs**.

In [4]:
from analysis import process_patients

# Unweighted ("baseline") correlation results, keyed by treatment arm.
baseline = dict()

# Arm 0 only: pick the patient identifiers with label-based boolean indexing.
arm0_df = process_patients(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"]
)
arm0_df.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,4.721605,0.025926,0.746914,1.49177,0.415638,2.170595,4.190329,2.254321,7.045267,3.664292
std,3.045467,0.193547,3.021788,2.307105,2.100991,3.117726,3.30484,4.078182,4.383254,2.437574
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0,0.0,0.0,2.25,0.0,4.5,2.153846
50%,4.8,0.0,0.0,0.0,0.0,0.0,3.583333,0.0,7.583333,3.346154
75%,6.675,0.0,0.0,2.833333,0.0,4.136364,5.541667,3.95,10.333333,5.076923
max,18.1,1.7,17.5,11.6,14.666667,20.727273,15.833333,16.6,19.666667,11.923077


In [5]:
# Attach the patient-log columns, keep dpfs plus one column per pathway, and take
# row 0 of the correlation matrix -- the dpfs row, because dpfs is listed first.
baseline["arm0"] = (
    arm0_df.join(patients_log.set_index("PatientFirstName"), on="PatientFirstName")
    .loc[:, ["dpfs", *(pw[0] for pw in pathways)]]
    .corr()
    .iloc[0]
)
print(baseline["arm0"])

dpfs          1.000000
Cell Cycle   -0.044402
HIPPO         0.042459
MYC           0.268194
NOTCH         0.055629
NRF2          0.209496
PI3K          0.169456
RTK-RAS      -0.082699
TGF-Beta     -0.043767
TP53         -0.014431
WNT           0.126803
Name: dpfs, dtype: float64


In [6]:
# Arm 1 only: same baseline scoring as for arm 0.
arm1_df = process_patients(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"]
)
arm1_df.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,4.249254,0.031343,0.910448,1.277114,0.482587,1.997286,4.869403,1.565672,6.539801,3.518944
std,2.93672,0.269619,3.089882,2.046328,2.540094,2.944207,3.698843,3.657809,4.772396,2.192086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.3,0.0,0.0,0.0,0.0,0.0,2.520833,0.0,0.5,2.307692
50%,4.85,0.0,0.0,0.0,0.0,0.0,3.75,0.0,7.416667,3.269231
75%,6.2,0.0,0.0,2.533333,0.0,3.227273,6.666667,0.0,9.833333,4.75
max,13.2,2.8,18.75,9.666667,17.333333,15.727273,19.916667,14.6,22.0,9.923077


In [7]:
# Attach the patient-log columns, keep dpfs plus one column per pathway, and take
# row 0 of the correlation matrix -- the dpfs row, because dpfs is listed first.
baseline["arm1"] = (
    arm1_df.join(patients_log.set_index("PatientFirstName"), on="PatientFirstName")
    .loc[:, ["dpfs", *(pw[0] for pw in pathways)]]
    .corr()
    .iloc[0]
)
print(baseline["arm1"])

dpfs          1.000000
Cell Cycle   -0.126475
HIPPO         0.076182
MYC          -0.019669
NOTCH        -0.007660
NRF2          0.115409
PI3K          0.004712
RTK-RAS       0.135770
TGF-Beta     -0.082254
TP53         -0.041688
WNT          -0.000760
Name: dpfs, dtype: float64


## Conversion of pathway data
Pathways are parsed from pathway files shipped with [PathwayMapper](http://www.pathwaymapper.org/). At this stage, the obtained data is transformed into a NetworkX graph giving each gene its own vertex: complexes and families are not represented explicitly. The resulting graph is directed.

In [8]:
import networkx as nx
import pathways_nx as pnx
import matplotlib.pyplot as plt
import pylab
import logging as log

plt.rcParams["figure.dpi"] = 90


# NOTE(review): this reuses the name of the earlier text-only show_pathway and
# therefore shadows it for the rest of the notebook.
# sorted(): os.listdir() order is arbitrary, so sort for a stable dropdown.
@widgets.interact(pathway=sorted(os.listdir("./pathways")))
def show_pathway(pathway):
    """Draw the selected pathway as a labelled graph.

    Parameters
    ----------
    pathway : str
        File name inside ``pathways``, chosen through the dropdown widget.

    Every edge and node is expected to carry a ``"label"`` attribute; a missing
    edge label raises ``KeyError``.
    """
    pw = pnx.pathway_to_nx(os.path.join("pathways", pathway))
    graph = pw.graph

    edge_labels = {(u, v): d["label"] for u, v, d in graph.edges(data=True)}
    labels = nx.get_node_attributes(graph, "label")

    # k is the spring layout's optimal node distance; passing it by keyword makes
    # that explicit. The fixed seed gives the same picture for the same pathway
    # on every run instead of a new random layout each time.
    pos = nx.spring_layout(graph, k=1.5 * len(graph), seed=0)

    plt.figure(1, figsize=(10, 10))
    nx.draw_networkx_edge_labels(graph, pos, edge_labels=edge_labels)
    nx.draw(
        graph,
        pos,
        node_size=1700,
        labels=labels,
        with_labels=True,
        node_shape="o",
        # Nodes themselves are invisible; the label's bbox is the visible box.
        node_color="none",
        bbox=dict(facecolor="skyblue", edgecolor="black", boxstyle="round,pad=0.4"),
    )

interactive(children=(Dropdown(description='pathway', options=('TGF-Beta.txt', 'HIPPO.txt', 'WNT.txt', 'NRF2.t…

## Computing weighted averages with no complexes
In order to improve the correlation between **dpfs** and pathway mutations, we can employ weights on each gene.<br>
These are derived from various centrality measures and don't take into account the gene hierarchy.

In [9]:
import pathways_nx as pnx
import networkx as nx
import os
import pandas

# Show 11 decimal places in pandas output from here on.
pandas.set_option("display.precision", 11)

# One graph-backed pathway object per file, ordered by pathway name.
nx_pathways = sorted(
    (pnx.pathway_to_nx("pathways/" + entry) for entry in os.listdir("./pathways")),
    key=lambda item: item.name,
)

patients_log = pandas.read_csv("TRIBE2_db.csv")
mutations_data = pandas.read_csv("TRIBE2_seq_res.csv")

# Correlation results of the weighted runs: results[arm][centrality measure].
results = {"arm0": {}, "arm1": {}}

### In-degree

In [10]:
from analysis_nx import process_patients_with_f

# Arm 0, gene weights taken from in-degree centrality.
arm0_df_indeg = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.in_degree_centrality,
    nx_pathways,
    mutations_data,
)
arm0_df_indeg.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,9.7037037037,0.0,0.0,1.16352201258,0.91358024691,2.35570987654,1.49255002129,2.62692901235,18.76388888889,0.6550617284
std,6.56313238433,0.0,0.0,2.29956675709,5.598796523,3.61499308582,1.0426796787,4.9530571154,12.26853237539,1.42037710281
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.75,0.0,0.0,0.0,0.0,0.0,0.8275862069,0.0,10.125,0.0
50%,10.75,0.0,0.0,0.0,0.0,0.0,1.36206896552,0.0,21.25,0.0
75%,14.0,0.0,0.0,0.61320754717,0.0,4.53125,2.10344827586,2.265625,27.5,0.0
max,30.0,0.0,0.0,11.09433962264,44.0,12.70833333333,4.7816091954,20.75,43.0,7.34


In [11]:
# Correlate dpfs with each in-degree-weighted pathway score (arm 0). Row 0 of the
# matrix is the dpfs row because dpfs is the first selected column. A pathway
# whose score is constant across the arm has no variance and shows up as NaN.
results["arm0"]["indegree"] = (
    arm0_df_indeg.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm0"]["indegree"])

dpfs          1.00000000000
Cell Cycle   -0.09226936015
HIPPO                   NaN
MYC                     NaN
NOTCH         0.07403685237
NRF2          0.26055653829
PI3K          0.03503757492
RTK-RAS      -0.11310601928
TGF-Beta     -0.05180275226
TP53         -0.05435628743
WNT          -0.02174170470
Name: dpfs, dtype: float64


In [12]:
from analysis_nx import process_patients_with_f

# Arm 1, gene weights taken from in-degree centrality.
arm1_df_indeg = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.in_degree_centrality,
    nx_pathways,
    mutations_data,
)
arm1_df_indeg.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,8.92723880597,0.0,0.03616532721,1.1117994931,0.05970149254,2.22388059701,1.6881111683,1.6338619403,17.61473880597,0.66671641791
std,6.72526270743,0.0,0.25563721307,2.13433296243,0.69109474047,3.71690587683,1.17443857065,4.22081387824,13.64440010607,1.39379493649
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.93103448276,0.0,0.0,0.0
50%,10.625,0.0,0.0,0.0,0.0,0.0,1.51724137931,0.0,20.75,0.0
75%,13.75,0.0,0.0,0.81132075472,0.0,4.125,2.27586206897,0.0,27.5,0.0
max,21.0,0.0,2.19230769231,10.15094339623,8.0,17.91666666667,5.83908045977,18.25,44.125,7.54


In [13]:
# Correlate dpfs with each in-degree-weighted pathway score (arm 1). Row 0 of the
# matrix is the dpfs row because dpfs is the first selected column. A pathway
# whose score is constant across the arm has no variance and shows up as NaN.
results["arm1"]["indegree"] = (
    arm1_df_indeg.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm1"]["indegree"])

dpfs          1.00000000000
Cell Cycle   -0.12639137193
HIPPO                   NaN
MYC          -0.05265988914
NOTCH        -0.02592021718
NRF2          0.08352453560
PI3K          0.13209202648
RTK-RAS       0.14794631496
TGF-Beta     -0.06616595990
TP53         -0.11281684289
WNT          -0.05256678378
Name: dpfs, dtype: float64


### Out-degree

In [14]:
from analysis_nx import process_patients_with_f

# Arm 0, gene weights taken from out-degree centrality.
arm0_df_outdeg = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.out_degree_centrality,
    nx_pathways,
    mutations_data,
)
arm0_df_outdeg.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,10.43904320988,0.01037037037,0.11490978158,0.70288842301,0.16666666667,1.47325102881,1.51035901802,0.19097222222,0.82330246914,0.9837037037
std,6.4232242763,0.07741867213,0.46489043298,1.52381859048,1.49948231232,2.16942021878,1.86145914401,1.15780437321,2.40214727723,0.9006504819
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.75,0.0,0.0,0.0,0.0,0.0,0.8275862069,0.0,0.0,0.485
50%,11.5,0.0,0.0,0.0,0.0,0.0,1.25287356322,0.0,0.0,0.78
75%,14.84375,0.0,0.0,0.0,0.0,2.75,1.75,0.0,0.0,1.28
max,25.0,0.68,2.69230769231,6.81132075472,14.5,11.08333333333,16.58620689655,9.9375,16.5,5.28


In [15]:
# Correlate dpfs with each out-degree-weighted pathway score (arm 0). Row 0 of
# the matrix is the dpfs row because dpfs is the first selected column.
results["arm0"]["outdegree"] = (
    arm0_df_outdeg.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm0"]["outdegree"])

dpfs          1.00000000000
Cell Cycle   -0.04413363227
HIPPO         0.04245896105
MYC           0.26819358259
NOTCH         0.06066446533
NRF2         -0.04613419523
PI3K          0.09017649935
RTK-RAS      -0.09855698721
TGF-Beta      0.02890916087
TP53          0.00559572918
WNT           0.09316713412
Name: dpfs, dtype: float64


In [16]:
from analysis_nx import process_patients_with_f

# Arm 1, gene weights taken from out-degree centrality.
arm1_df_outdeg = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.out_degree_centrality,
    nx_pathways,
    mutations_data,
)
arm1_df_outdeg.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,9.61660447761,0.01253731343,0.20034443169,0.69050971557,0.69402985075,1.42133084577,1.83136043918,0.32322761194,0.62313432836,0.97701492537
std,6.81259610129,0.10784761295,0.68159037286,1.50015255307,3.79993591545,2.27909035171,2.38327426401,1.56543007449,2.32614396399,0.94011519839
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.9375,0.0,0.0,0.0,0.0,0.0,0.81034482759,0.0,0.0,0.52
50%,11.125,0.0,0.0,0.0,0.0,0.0,1.33908045977,0.0,0.0,0.77
75%,14.75,0.0,0.0,0.0,0.0,2.3125,2.0,0.0,0.0,1.19
max,24.375,1.12,4.38461538462,7.94339622642,26.0,10.75,14.68965517241,10.3125,20.5,5.32


In [17]:
# Correlate dpfs with each out-degree-weighted pathway score (arm 1). Row 0 of
# the matrix is the dpfs row because dpfs is the first selected column.
results["arm1"]["outdegree"] = (
    arm1_df_outdeg.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm1"]["outdegree"])

dpfs          1.00000000000
Cell Cycle   -0.13821505997
HIPPO         0.07618156944
MYC          -0.04663580626
NOTCH         0.08720395888
NRF2          0.10812386832
PI3K          0.07505846557
RTK-RAS      -0.02274674199
TGF-Beta     -0.06184457498
TP53          0.06770914810
WNT           0.01182285336
Name: dpfs, dtype: float64


### Betweenness

In [18]:
from analysis_nx import process_patients_with_f

# Arm 0, gene weights taken from betweenness centrality.
arm0_df_bet = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.betweenness_centrality,
    nx_pathways,
    mutations_data,
)
arm0_df_bet.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,18.60141093474,0.0,0.0,1.125,0.0,2.45004572474,2.22161352986,0.25462962963,0.33641975309,0.17548500882
std,12.28116269145,0.0,0.0,3.50700828953,0.0,4.10633802364,1.47314564193,1.54373916427,2.53168139873,0.58189353177
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.17857142857,0.0,0.0,0.0,0.0,0.0,1.37561924982,0.0,0.0,0.0
50%,20.75,0.0,0.0,0.0,0.0,0.0,2.34394904459,0.0,0.0,0.0
75%,27.5,0.0,0.0,0.0,0.0,5.0,3.14968152866,0.0,0.0,0.0
max,43.0,0.0,0.0,18.75,0.0,16.94444444444,6.56121726822,13.25,22.5,3.09523809524


In [19]:
# Correlate dpfs with each betweenness-weighted pathway score (arm 0). Row 0 of
# the matrix is the dpfs row because dpfs is the first selected column. A pathway
# whose score is constant across the arm has no variance and shows up as NaN.
results["arm0"]["betweenness"] = (
    arm0_df_bet.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm0"]["betweenness"])

dpfs          1.00000000000
Cell Cycle   -0.05919601305
HIPPO                   NaN
MYC                     NaN
NOTCH         0.06796449064
NRF2                    NaN
PI3K         -0.00252672381
RTK-RAS      -0.10656893393
TGF-Beta      0.02890916087
TP53         -0.04865032728
WNT           0.01694620962
Name: dpfs, dtype: float64


In [20]:
from analysis_nx import process_patients_with_f

# Arm 1, gene weights taken from betweenness centrality.
arm1_df_bet = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.betweenness_centrality,
    nx_pathways,
    mutations_data,
)
arm1_df_bet.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,17.4552238806,0.0,0.21699196326,1.75373134328,0.0,2.2878489497,2.3366007894,0.43097014925,0.63805970149,0.08742004264
std,13.60139625276,0.0,1.53382327841,4.72445496266,0.0,4.160560705,1.5468721006,2.08724009931,3.58641536277,0.41345394441
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.40207006369,0.0,0.0,0.0
50%,20.75,0.0,0.0,0.0,0.0,0.0,2.36883698986,0.0,0.0,0.0
75%,27.5,0.0,0.0,0.0,0.0,4.09722222222,3.25123849965,0.0,0.0,0.0
max,42.0,0.0,13.15384615385,22.75,0.0,23.88888888889,6.46956829441,13.75,27.5,2.52380952381


In [21]:
# Correlate dpfs with each betweenness-weighted pathway score (arm 1). Row 0 of
# the matrix is the dpfs row because dpfs is the first selected column. A pathway
# whose score is constant across the arm has no variance and shows up as NaN.
results["arm1"]["betweenness"] = (
    arm1_df_bet.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm1"]["betweenness"])

dpfs          1.00000000000
Cell Cycle   -0.12418602217
HIPPO                   NaN
MYC          -0.05265988914
NOTCH         0.05031069590
NRF2                    NaN
PI3K          0.17710123936
RTK-RAS       0.12833767829
TGF-Beta     -0.06184457498
TP53          0.16705834475
WNT           0.10809996755
Name: dpfs, dtype: float64


### Closeness

In [22]:
from analysis_nx import process_patients_with_f

# Arm 0, gene weights taken from closeness centrality.
arm0_df_clos = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.closeness_centrality,
    nx_pathways,
    mutations_data,
)
arm0_df_clos.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,8.70921985816,0.0,0.0,1.08187134503,0.91358024691,1.84509887854,2.45586968214,2.69470899471,18.01333333333,0.73695257663
std,5.9429913448,0.0,0.0,2.13819365133,5.598796523,2.8267867145,1.55701006177,5.09432734482,11.77779108037,1.65284382062
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.13829787234,0.0,0.0,0.0,0.0,0.0,1.50727423468,0.0,9.72,0.0
50%,9.60638297872,0.0,0.0,0.0,0.0,0.0,2.52261371738,0.0,20.4,0.0
75%,12.51063829787,0.0,0.0,0.5701754386,0.0,3.62285452697,3.44371986304,2.28571428571,26.4,0.0
max,27.86170212766,0.0,0.0,10.31578947368,44.0,13.49997324661,7.35848279182,21.34285714286,41.28,8.09784126829


In [23]:
# Correlate dpfs with each closeness-weighted pathway score (arm 0). Row 0 of the
# matrix is the dpfs row because dpfs is the first selected column. A pathway
# whose score is constant across the arm has no variance and shows up as NaN.
results["arm0"]["closeness"] = (
    arm0_df_clos.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm0"]["closeness"])

dpfs          1.00000000000
Cell Cycle   -0.09395603602
HIPPO                   NaN
MYC                     NaN
NOTCH         0.07403685237
NRF2          0.26055653829
PI3K          0.09336284236
RTK-RAS      -0.11431322190
TGF-Beta     -0.05205554694
TP53         -0.05435628743
WNT          -0.01677642636
Name: dpfs, dtype: float64


In [24]:
from analysis_nx import process_patients_with_f

# Arm 1, gene weights taken from closeness centrality.
arm1_df_clos = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.closeness_centrality,
    nx_pathways,
    mutations_data,
)
arm1_df_clos.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,8.00301683074,0.0,0.02801717102,1.03377847604,0.05970149254,1.76261093893,2.63326260969,1.66823027719,16.91014925373,0.63744556724
std,6.027113343,0.0,0.19804138575,1.98455521068,0.69109474047,2.85228658469,1.66904695495,4.33893048562,13.09862410183,1.38394166607
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.59733983042,0.0,0.0,0.0
50%,9.49468085106,0.0,0.0,0.0,0.0,0.0,2.72852839314,0.0,19.92,0.0
75%,12.28723404255,0.0,0.0,0.75438596491,0.0,3.35717852833,3.60695492537,0.0,26.4,0.0
max,18.76595744681,0.0,1.69837422406,9.43859649123,8.0,12.78351753536,7.73485102251,18.77142857143,42.36,6.87745950847


In [25]:
# Correlate dpfs with each closeness-weighted pathway score (arm 1). Row 0 of the
# matrix is the dpfs row because dpfs is the first selected column. A pathway
# whose score is constant across the arm has no variance and shows up as NaN.
results["arm1"]["closeness"] = (
    arm1_df_clos.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm1"]["closeness"])

dpfs          1.00000000000
Cell Cycle   -0.12614421901
HIPPO                   NaN
MYC          -0.05265988914
NOTCH        -0.02592021718
NRF2          0.08352453560
PI3K          0.07809505960
RTK-RAS       0.14198272352
TGF-Beta     -0.06535374045
TP53         -0.11281684289
WNT          -0.00141656123
Name: dpfs, dtype: float64


### Eigenvector

In [26]:
from analysis_nx import process_patients_with_f

# Arm 0, gene weights taken from eigenvector centrality (NumPy solver).
# NOTE(review): the summary for this run contains tiny non-zero and negative
# values (around 1e-15 to 1e-9); these look like solver round-off rather than
# real signal -- confirm before interpreting them.
arm0_df_eigen = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.eigenvector_centrality_numpy,
    nx_pathways,
    mutations_data,
)
arm0_df_eigen.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,9.7037037037,-1.03437420958e-15,-1.10301309979e-13,-1.82338e-06,0.91358023916,2.06525077848,1.42479398184e-06,3.41769283012,36.88045985515,1.8698800332
std,6.56313238433,7.721988215109999e-15,8.79735415358e-13,1.590413e-05,5.59879645813,7.15729112337,3.411033137e-06,6.61900644666,24.73244676163,8.72974912303
min,-2.27595720048e-15,-6.782539459959999e-14,-5.82768727545e-12,-5.62034e-05,0.0,-9e-11,-1.55165209499e-15,0.0,0.0,-3.24664834389e-09
25%,5.75,0.0,0.0,0.0,0.0,0.0,1.54744463776e-09,0.0,20.24876125694,0.0
50%,10.75,0.0,0.0,0.0,0.0,0.0,2.46184274433e-09,0.0,40.99749192764,3.67678133233e-13
75%,14.0,0.0,0.0,1e-11,0.0,1.2e-10,3.78068718195e-09,2.939735e-05,54.9966355127,6.57677787614e-13
max,30.0,0.0,2.96420472389e-12,4.926663e-05,43.99999949373,36.45309178102,1.70063786499e-05,27.66664000096,85.99473916529,57.9557214904


In [27]:
# Correlate dpfs with each eigenvector-weighted pathway score (arm 0). Row 0 of
# the matrix is the dpfs row because dpfs is the first selected column.
results["arm0"]["eigenvector"] = (
    arm0_df_eigen.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm0"]["eigenvector"])

dpfs          1.00000000000
Cell Cycle   -0.09226936015
HIPPO        -0.04245896105
MYC          -0.15091093472
NOTCH        -0.03733695900
NRF2          0.26055653811
PI3K          0.11203052827
RTK-RAS      -0.06105156671
TGF-Beta     -0.05393331062
TP53         -0.04579083936
WNT          -0.05336429563
Name: dpfs, dtype: float64


In [28]:
from analysis_nx import process_patients_with_f

# Arm 1, gene weights taken from eigenvector centrality (NumPy solver).
# NOTE(review): the summary for this run contains tiny non-zero and negative
# values (around 1e-15 to 1e-9); these look like solver round-off rather than
# real signal -- confirm before interpreting them.
arm1_df_eigen = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.eigenvector_centrality_numpy,
    nx_pathways,
    mutations_data,
)
arm1_df_eigen.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,8.92723880597,-1.25051210412e-15,-1.15271073826e-11,4.0161e-07,0.05970150335,1.7216721541,1.85962986825e-06,2.03482501113,34.9083122067,1.67828966126
std,6.72526270743,1.07570689782e-14,8.03953951294e-11,1.570553e-05,0.69109473151,6.06801902125,4.08543831252e-06,5.63706395623,27.2011284432,8.10163248659
min,-1.3877787807799999e-15,-1.11712414635e-13,-6.89596973595e-10,-4.79111e-05,0.0,-1.69728678357e-13,-1.3409965751599998e-15,0.0,0.0,-3.71021999533e-09
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.7057057827e-09,0.0,1.5429213279600002e-12,0.0
50%,10.625,0.0,0.0,0.0,0.0,0.0,2.67285844191e-09,0.0,41.4974613414,3.52142437463e-13
75%,13.75,0.0,0.0,0.0,0.0,1.28291817602e-10,4.50165503669e-09,0.0,54.9966355127,5.61874330363e-13
max,21.0,0.0,2.29486817334e-12,4.272806e-05,7.99999990795,37.951164046,2.43285694568e-05,24.33330988049,83.9948615103,57.9557214904


In [29]:
# Correlate dpfs with each eigenvector-weighted pathway score (arm 1). Row 0 of
# the matrix is the dpfs row because dpfs is the first selected column.
results["arm1"]["eigenvector"] = (
    arm1_df_eigen.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(results["arm1"]["eigenvector"])

dpfs          1.00000000000
Cell Cycle   -0.12639137193
HIPPO        -0.07618156944
MYC           0.05274478828
NOTCH         0.05327933693
NRF2          0.08352454557
PI3K         -0.00631141313
RTK-RAS       0.05417602932
TGF-Beta     -0.05842357982
TP53         -0.12418602200
WNT          -0.00275684552
Name: dpfs, dtype: float64


## Hierarchy-aware evaluation
This time the hierarchy of a gene inside a complex or family is taken into account when computing the average mutations. The hierarchy is represented by a weight, computed as the reciprocal of the product of the cardinalities of the gene's containers. For example, a gene contained in a family of 4, which is itself contained in a family of 6, has a weight of $\frac{1}{4} \cdot \frac{1}{6} = \frac{1}{24}$.

In [30]:
import pathways_nx as pnx
import networkx as nx
import os
import pandas

# Rebuild the graph-backed pathway objects, ordered by pathway name.
nx_pathways = sorted(
    (pnx.pathway_to_nx("pathways/" + entry) for entry in os.listdir("./pathways")),
    key=lambda item: item.name,
)

patients_log = pandas.read_csv("TRIBE2_db.csv")
mutations_data = pandas.read_csv("TRIBE2_seq_res.csv")

# Correlation results of the hierarchy-aware runs: hresults[arm][centrality measure].
hresults = {"arm0": {}, "arm1": {}}

### In-degree

In [31]:
from analysis_nx import process_patients_with_f

# Arm 0, in-degree centrality weights, hierarchy-aware run.
h_arm0_df_indeg = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.in_degree_centrality,
    nx_pathways,
    mutations_data,
    # Extra positional flag not passed in the earlier runs; presumably switches
    # on the hierarchy weighting -- confirm against analysis_nx.
    True,
)
h_arm0_df_indeg.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,9.7037037037,0.0,0.0,0.85265700483,0.91358024691,2.95500685871,0.86468595419,2.36334019204,21.3082010582,0.70694253925
std,6.56313238433,0.0,0.0,1.52048866488,5.598796523,4.79080843015,0.62285094125,4.4077537649,14.03997938684,1.83873948384
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.75,0.0,0.0,0.0,0.0,0.0,0.54545454545,0.0,11.57142857143,0.0
50%,10.75,0.0,0.0,0.0,0.0,0.0,0.8186026936,0.0,24.28571428571,0.0
75%,14.0,0.0,0.0,1.30434782609,0.0,5.925,1.17445286195,2.1875,31.42857142857,0.0
max,30.0,0.0,0.0,7.30434782609,44.0,20.33333333333,3.11784511785,18.44444444444,49.14285714286,10.74074074074


In [32]:
# Correlate dpfs with each hierarchy-aware, in-degree-weighted pathway score
# (arm 0). Row 0 of the matrix is the dpfs row because dpfs is the first selected
# column. A pathway whose score is constant across the arm shows up as NaN.
hresults["arm0"]["indegree"] = (
    h_arm0_df_indeg.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(hresults["arm0"]["indegree"])

dpfs          1.00000000000
Cell Cycle   -0.09226936015
HIPPO                   NaN
MYC                     NaN
NOTCH         0.05404772121
NRF2          0.26055653829
PI3K         -0.00620778660
RTK-RAS      -0.11536385382
TGF-Beta     -0.05061856973
TP53         -0.05144321928
WNT          -0.03280731563
Name: dpfs, dtype: float64


In [33]:
from analysis_nx import process_patients_with_f

# Arm 1, in-degree centrality weights, hierarchy-aware run.
h_arm1_df_indeg = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.in_degree_centrality,
    nx_pathways,
    mutations_data,
    # Extra positional flag not passed in the earlier runs; presumably switches
    # on the hierarchy weighting -- confirm against analysis_nx.
    True,
)
h_arm1_df_indeg.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,8.92723880597,0.0,0.05424799082,0.76216742375,0.05970149254,2.69469320066,0.9545328911,1.50020729685,20.13113006397,0.65284687673
std,6.72526270743,0.0,0.3834558196,1.31734227102,0.69109474047,4.90924707032,0.70336646216,3.77018454091,15.59360012122,1.69371953919
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.52840909091,0.0,0.0,0.0
50%,10.625,0.0,0.0,0.0,0.0,0.0,0.89772727273,0.0,23.71428571429,0.0
75%,13.75,0.0,0.0,1.21195652174,0.0,3.96666666667,1.26262626263,0.0,31.42857142857,0.0
max,21.0,0.0,3.28846153846,5.84782608696,8.0,28.66666666667,4.16161616162,16.22222222222,50.42857142857,10.74074074074


In [34]:
# Correlate dpfs with each hierarchy-aware, in-degree-weighted pathway score
# (arm 1). Row 0 of the matrix is the dpfs row because dpfs is the first selected
# column. A pathway whose score is constant across the arm shows up as NaN.
hresults["arm1"]["indegree"] = (
    h_arm1_df_indeg.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(hresults["arm1"]["indegree"])

dpfs          1.00000000000
Cell Cycle   -0.12639137193
HIPPO                   NaN
MYC          -0.05265988914
NOTCH        -0.05638212697
NRF2          0.08352453560
PI3K          0.18505394286
RTK-RAS       0.12814849061
TGF-Beta     -0.06964817020
TP53         -0.11281684289
WNT          -0.01808717964
Name: dpfs, dtype: float64


### Out-degree

In [35]:
from analysis_nx import process_patients_with_f

# Arm 0, out-degree centrality weights, hierarchy-aware run.
h_arm0_df_outdeg = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.out_degree_centrality,
    nx_pathways,
    mutations_data,
    # Extra positional flag not passed in the earlier runs; presumably switches
    # on the hierarchy weighting -- confirm against analysis_nx.
    True,
)
h_arm0_df_outdeg.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,10.43904320988,0.00997150997,0.12989801396,0.29103973765,0.16666666667,1.79744613895,0.93811238487,0.15277777778,0.80467372134,2.25587889477
std,6.4232242763,0.07444103089,0.52552831554,0.63095613512,1.49948231232,2.75267087997,2.64679131601,0.92624349856,2.48929503244,2.06658657491
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.75,0.0,0.0,0.0,0.0,0.0,0.29241071429,0.0,0.0,1.14285714286
50%,11.5,0.0,0.0,0.0,0.0,0.0,0.46701388889,0.0,0.0,1.85714285714
75%,14.84375,0.0,0.0,0.0,0.0,3.17401960784,0.65290178571,0.0,0.0,2.95238095238
max,25.0,0.65384615385,3.04347826087,2.8203125,14.5,12.16666666667,24.01884920635,7.95,18.85714285714,12.57142857143


In [36]:
# Correlate dpfs with each hierarchy-aware, out-degree-weighted pathway score
# (arm 0). Row 0 of the matrix is the dpfs row because dpfs is the first selected
# column.
hresults["arm0"]["outdegree"] = (
    h_arm0_df_outdeg.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(hresults["arm0"]["outdegree"])

dpfs          1.00000000000
Cell Cycle   -0.04413363227
HIPPO         0.04245896105
MYC           0.26819358259
NOTCH         0.06066446533
NRF2         -0.04613419523
PI3K          0.04951808483
RTK-RAS      -0.07037608918
TGF-Beta      0.02890916087
TP53          0.02219084567
WNT           0.09687513852
Name: dpfs, dtype: float64


In [37]:
from analysis_nx import process_patients_with_f

# Arm 1, out-degree centrality weights, hierarchy-aware run.
h_arm1_df_outdeg = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.out_degree_centrality,
    nx_pathways,
    mutations_data,
    # Extra positional flag not passed in the earlier runs; presumably switches
    # on the hierarchy weighting -- confirm against analysis_nx.
    True,
)
h_arm1_df_outdeg.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,9.61660447761,0.01205510907,0.26735885788,0.2859141791,0.69402985075,1.66937371964,1.31226308932,0.25858208955,0.71215351812,2.28251599147
std,6.81259610129,0.10369962784,0.99964471228,0.62115691651,3.79993591545,2.74853184274,3.44522650277,1.25234405959,2.65845024456,2.21676228121
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.9375,0.0,0.0,0.0,0.0,0.0,0.31919642857,0.0,0.0,1.20238095238
50%,11.125,0.0,0.0,0.0,0.0,0.0,0.50843253968,0.0,0.0,1.83333333333
75%,14.75,0.0,0.0,0.0,0.0,2.30882352941,0.69456845238,0.0,0.0,2.75
max,24.375,1.07692307692,7.4347826087,3.2890625,26.0,15.17647058824,22.82142857143,8.25,23.42857142857,12.66666666667


In [38]:
# Correlate dpfs with each hierarchy-aware, out-degree-weighted pathway score
# (arm 1). Row 0 of the matrix is the dpfs row because dpfs is the first selected
# column.
hresults["arm1"]["outdegree"] = (
    h_arm1_df_outdeg.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(hresults["arm1"]["outdegree"])

dpfs          1.00000000000
Cell Cycle   -0.13821505997
HIPPO         0.07618156944
MYC          -0.05116848131
NOTCH         0.08720395888
NRF2          0.10812386832
PI3K          0.10650559991
RTK-RAS      -0.06684990746
TGF-Beta     -0.06184457498
TP53          0.06770914810
WNT           0.00185709385
Name: dpfs, dtype: float64


### Betweenness

In [39]:
from analysis_nx import process_patients_with_f

# Arm-0 patients scored on every pathway with betweenness centrality.
# NOTE(review): the trailing True is presumably the gene-hierarchy switch
# (results land in `hresults`) — confirm against analysis_nx.
h_arm0_df_bet = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.betweenness_centrality,
    nx_pathways,
    mutations_data,
    True,
)
h_arm0_df_bet.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,18.60141093474,0.0,0.0,1.125,0.0,2.9724560419,0.97767708549,0.25462962963,0.33641975309,0.13775417574
std,12.28116269145,0.0,0.0,3.50700828953,0.0,5.39867503644,0.65394810607,1.54373916427,2.53168139873,0.45135975553
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.17857142857,0.0,0.0,0.0,0.0,0.0,0.59783206641,0.0,0.0,0.0
50%,20.75,0.0,0.0,0.0,0.0,0.0,1.01937331911,0.0,0.0,0.0
75%,27.5,0.0,0.0,0.0,0.0,4.09821428571,1.3617042853,0.0,0.0,0.0
max,43.0,0.0,0.0,18.75,0.0,23.76623376623,3.07808228312,13.25,22.5,2.38970588235


In [40]:
# Attach the clinical log to the per-patient scores, keep dpfs plus one column
# per pathway, and store the dpfs row of the correlation matrix (dpfs is the
# first selected column, hence .iloc[0]).
hresults["arm0"]["betweenness"] = (
    h_arm0_df_bet.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(f"{hresults['arm0']['betweenness']}")

dpfs          1.00000000000
Cell Cycle   -0.05919601305
HIPPO                   NaN
MYC                     NaN
NOTCH         0.06796449064
NRF2                    NaN
PI3K         -0.02763978883
RTK-RAS      -0.11106173460
TGF-Beta      0.02890916087
TP53         -0.04865032728
WNT           0.03514633955
Name: dpfs, dtype: float64


In [41]:
from analysis_nx import process_patients_with_f

# Arm-1 patients scored on every pathway with betweenness centrality.
# NOTE(review): the trailing True is presumably the gene-hierarchy switch
# (results land in `hresults`) — confirm against analysis_nx.
h_arm1_df_bet = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.betweenness_centrality,
    nx_pathways,
    mutations_data,
    True,
)
h_arm1_df_bet.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,17.4552238806,0.0,0.26612221909,1.75373134328,0.0,2.6816081928,1.03715925606,0.43097014925,0.63805970149,0.06749341528
std,13.60139625276,0.0,1.88110402069,4.72445496266,0.0,5.42087517077,0.69955103719,2.08724009931,3.58641536277,0.3192107659
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.65690638128,0.0,0.0,0.0
50%,20.75,0.0,0.0,0.0,0.0,0.0,1.07657364806,0.0,0.0,0.0
75%,27.5,0.0,0.0,0.0,0.0,3.33116883117,1.3975295059,0.0,0.0,0.0
max,42.0,0.0,16.1320754717,22.75,0.0,33.50649350649,3.33786757351,13.75,27.5,1.94852941176


In [42]:
# Attach the clinical log to the per-patient scores, keep dpfs plus one column
# per pathway, and store the dpfs row of the correlation matrix (dpfs is the
# first selected column, hence .iloc[0]).
hresults["arm1"]["betweenness"] = (
    h_arm1_df_bet.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(f"{hresults['arm1']['betweenness']}")

dpfs          1.00000000000
Cell Cycle   -0.12418602217
HIPPO                   NaN
MYC          -0.05265988914
NOTCH         0.05031069590
NRF2                    NaN
PI3K          0.20645834212
RTK-RAS       0.12765317155
TGF-Beta     -0.06184457498
TP53          0.16705834475
WNT           0.10809996755
Name: dpfs, dtype: float64


### Closeness

In [43]:
from analysis_nx import process_patients_with_f

# Arm-0 patients scored on every pathway with closeness centrality.
# NOTE(review): the trailing True is presumably the gene-hierarchy switch
# (results land in `hresults`) — confirm against analysis_nx.
h_arm0_df_clos = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.closeness_centrality,
    nx_pathways,
    mutations_data,
    True,
)
h_arm0_df_clos.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,8.70921985816,0.0,0.0,0.72633744856,0.91358024691,2.21513200322,1.62366815306,2.44444444444,20.33964646465,0.79127111386
std,5.9429913448,0.0,0.0,1.2952310849,5.598796523,3.27768985009,1.08088562088,4.57476953515,13.40179850562,2.03898084662
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.13829787234,0.0,0.0,0.0,0.0,0.0,0.95964622557,0.0,11.04545454545,0.0
50%,9.60638297872,0.0,0.0,0.0,0.0,0.0,1.69495956723,0.0,23.18181818182,0.0
75%,12.51063829787,0.0,0.0,1.11111111111,0.0,4.56848697754,2.22378629254,2.21153846154,30.0,0.0
max,27.86170212766,0.0,0.0,6.22222222222,44.0,12.18263194011,5.21808371383,19.15384615385,46.90909090909,11.17142356855


In [44]:
# Attach the clinical log to the per-patient scores, keep dpfs plus one column
# per pathway, and store the dpfs row of the correlation matrix (dpfs is the
# first selected column, hence .iloc[0]).
hresults["arm0"]["closeness"] = (
    h_arm0_df_clos.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(f"{hresults['arm0']['closeness']}")

dpfs          1.00000000000
Cell Cycle   -0.09395603602
HIPPO                   NaN
MYC                     NaN
NOTCH         0.05404772121
NRF2          0.26055653829
PI3K          0.04122717585
RTK-RAS      -0.11385511784
TGF-Beta     -0.05102158129
TP53         -0.05144321928
WNT          -0.02494105771
Name: dpfs, dtype: float64


In [45]:
from analysis_nx import process_patients_with_f

# Arm-1 patients scored on every pathway with closeness centrality.
# NOTE(review): the trailing True is presumably the gene-hierarchy switch
# (results land in `hresults`) — confirm against analysis_nx.
h_arm1_df_clos = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.closeness_centrality,
    nx_pathways,
    mutations_data,
    True,
)
h_arm1_df_clos.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,8.00301683074,0.0,0.04458794905,0.64925373134,0.05970149254,2.03864652403,1.73607745205,1.54133180253,19.21607869742,0.64845042429
std,6.027113343,0.0,0.3151731205,1.12218045309,0.69109474047,3.36999319886,1.1686489973,3.90720096733,14.88480011571,1.7633476018
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.1088009712,0.0,0.0,0.0
50%,9.49468085106,0.0,0.0,0.0,0.0,0.0,1.82420659528,0.0,22.63636363636,0.0
75%,12.28723404255,0.0,0.0,1.03240740741,0.0,4.1540777763,2.35348094974,0.0,30.0,0.0
max,18.76595744681,0.0,2.70287900646,4.98148148148,8.0,17.17551388277,5.89668950965,16.84615384615,48.13636363636,11.17142356855


In [46]:
# Attach the clinical log to the per-patient scores, keep dpfs plus one column
# per pathway, and store the dpfs row of the correlation matrix (dpfs is the
# first selected column, hence .iloc[0]).
hresults["arm1"]["closeness"] = (
    h_arm1_df_clos.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(f"{hresults['arm1']['closeness']}")

dpfs          1.00000000000
Cell Cycle   -0.12614421901
HIPPO                   NaN
MYC          -0.05265988914
NOTCH        -0.05638212697
NRF2          0.08352453560
PI3K          0.15017770745
RTK-RAS       0.13051729697
TGF-Beta     -0.06851993237
TP53         -0.11281684289
WNT           0.01312932282
Name: dpfs, dtype: float64


### Eigenvector

In [47]:
from analysis_nx import process_patients_with_f

# Arm-0 patients scored on every pathway with eigenvector centrality
# (NumPy-based NetworkX implementation).
# NOTE(review): the trailing True is presumably the gene-hierarchy switch
# (results land in `hresults`) — confirm against analysis_nx.
h_arm0_df_eigen = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 0, "PatientFirstName"],
    nx.eigenvector_centrality_numpy,
    nx_pathways,
    mutations_data,
    True,
)
h_arm0_df_eigen.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,9.7037037037,1.23800375634e-14,-6.91031646903e-14,5.1777e-07,1.3954139,2.06237322948,2.846953e-05,3.41768668382,36.88098661369,1.87014708533
std,6.56313238433,9.24215852269e-14,1.15901054465e-12,1.09141e-06,7.0180443117,7.14732501226,6.770134e-05,6.61899311284,24.73280002814,8.73431522174
min,-9.71445146547e-16,0.0,-5.61808857337e-12,-1e-11,0.0,-4.7e-10,0.0,0.0,-1e-11,-7.71654719759e-09
25%,5.75,0.0,0.0,0.0,0.0,0.0,1.7826e-07,0.0,20.24905047468,-2.00728187304e-14
50%,10.75,0.0,0.0,0.0,0.0,0.0,2.836e-07,0.0,40.99807750429,-1.1086242980799999e-14
75%,14.0,0.0,0.0,4.5642e-07,0.0,0.0,4.3553e-07,5.690828e-05,54.99742104234,0.0
max,30.0,8.11776748799e-13,7.0641029190500004e-12,6.34192e-06,44.0,36.40233535469,0.00033780608,27.66658531471,85.99596744802,57.9861298516


In [48]:
# Attach the clinical log to the per-patient scores, keep dpfs plus one column
# per pathway, and store the dpfs row of the correlation matrix (dpfs is the
# first selected column, hence .iloc[0]).
hresults["arm0"]["eigenvector"] = (
    h_arm0_df_eigen.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(f"{hresults['arm0']['eigenvector']}")

dpfs          1.00000000000
Cell Cycle   -0.09226936015
HIPPO         0.04245896105
MYC          -0.10676407664
NOTCH         0.02893786423
NRF2          0.17936777832
PI3K          0.11203049244
RTK-RAS      -0.06123955984
TGF-Beta     -0.05393329654
TP53         -0.04579083917
WNT          -0.05336746423
Name: dpfs, dtype: float64


In [49]:
from analysis_nx import process_patients_with_f

# Arm-1 patients scored on every pathway with eigenvector centrality
# (NumPy-based NetworkX implementation).
# NOTE(review): the trailing True is presumably the gene-hierarchy switch
# (results land in `hresults`) — confirm against analysis_nx.
h_arm1_df_eigen = process_patients_with_f(
    patients_log.loc[patients_log["arm"] == 1, "PatientFirstName"],
    nx.eigenvector_centrality_numpy,
    nx_pathways,
    mutations_data,
    True,
)
h_arm1_df_eigen.describe()

Unnamed: 0,Cell Cycle,HIPPO,MYC,NOTCH,NRF2,PI3K,RTK-RAS,TGF-Beta,TP53,WNT
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,8.92723880597,1.49669110841e-14,-1.05722340196e-11,5.73887492076e-07,2.06614312254,1.7192730242,3.71065883881e-05,2.03482202008,34.90881080544,1.67881268348
std,6.72526270743,1.28747330308e-13,7.40516152603e-11,1.21498460196e-06,10.99636879645,6.05956977046,8.10928828977e-05,5.63705243904,27.2015169628,8.10595759106
min,-7.21644966006e-16,0.0,-6.35098037589e-10,-7.32306054614e-12,0.0,-6.56499423479e-10,-4.6298666514199994e-14,0.0,-3e-11,-8.81891871375e-09
25%,1.11022302463e-16,0.0,0.0,0.0,0.0,-2.04074658859e-13,1.96495487174e-07,0.0,0.0,-1.99048453519e-14
50%,10.625,0.0,0.0,0.0,0.0,0.0,3.07910984129e-07,0.0,41.49805405922,-1.15901631163e-14
75%,13.75,0.0,0.0,3.88824332907e-09,0.0,0.0,5.18586318869e-07,0.0,54.99742104234,0.0
max,21.0,1.33704405685e-12,5.468982905070001e-12,5.18009361034e-06,75.16604988083,37.8983217391,0.000483250364765,24.33326178368,83.9960612283,57.9861298516


In [50]:
# Attach the clinical log to the per-patient scores, keep dpfs plus one column
# per pathway, and store the dpfs row of the correlation matrix (dpfs is the
# first selected column, hence .iloc[0]).
hresults["arm1"]["eigenvector"] = (
    h_arm1_df_eigen.join(
        patients_log.set_index("PatientFirstName"), on="PatientFirstName"
    )
    .loc[:, ["dpfs", *(pw.name for pw in nx_pathways)]]
    .corr()
    .iloc[0]
)
print(f"{hresults['arm1']['eigenvector']}")

dpfs          1.00000000000
Cell Cycle   -0.12639137193
HIPPO         0.07618156944
MYC           0.05197160468
NOTCH        -0.00548360585
NRF2          0.11326751032
PI3K         -0.00631136523
RTK-RAS       0.05437000338
TGF-Beta     -0.05842363849
TP53         -0.12418602217
WNT          -0.00277937146
Name: dpfs, dtype: float64


## Result comparison

In [51]:
import matplotlib.pyplot as plt
import numpy as np
import ipywidgets as widgets
import math

# Default size (in inches) for every figure created below.
plt.rcParams["figure.figsize"] = [14, 8]

# One Output widget per pathway; each becomes a tab holding that pathway's chart.
outputs = [widgets.Output() for _ in nx_pathways]


def replace_nans(vals):
    """Return a new list with the items of ``vals`` where every NaN becomes 0."""
    cleaned = []
    for value in vals:
        cleaned.append(0 if math.isnan(value) else value)
    return cleaned


def make_corr_graph(name):
    """Build the grouped bar chart of dpfs correlations for one pathway.

    For the pathway called ``name`` there is one group of bars per measure in
    ``results["arm0"]``; each group holds four bars: arm0 and arm1 read from
    ``results``, followed by their hierarchy counterparts read from
    ``hresults``. NaN correlations are drawn as 0. The per-arm ``baseline``
    correlation is drawn as a dashed horizontal line with its value written
    next to it.

    Returns the Matplotlib figure, which is closed with ``plt.close`` before
    being returned.
    """

    def correlations(table, arm):
        # One value per measure, in the iteration order of that arm's mapping.
        return replace_nans([table[arm][measure][name] for measure in table[arm]])

    # (legend label, bar heights) in drawing order.
    series = [
        ("arm0", correlations(results, "arm0")),
        ("arm1", correlations(results, "arm1")),
        ("arm0 (hierarchy)", correlations(hresults, "arm0")),
        ("arm1 (hierarchy)", correlations(hresults, "arm1")),
    ]

    fig, ax = plt.subplots()

    width = 0.15
    labels = results["arm0"].keys()

    # Each series is drawn one bar-width to the right of the previous one.
    positions = np.arange(len(labels))
    containers = []
    for legend_label, values in series:
        containers.append(ax.bar(positions, values, width, label=legend_label))
        positions = [x + width for x in positions]

    # Dashed baseline per arm, annotated with its value just above the line.
    baseline_styles = (
        ("arm0", "arm0 baseline", "arm0: {:.6f}", "#ff0000"),
        ("arm1", "arm1 baseline", "arm1: {:.6f}", "#00ff00"),
    )
    for arm, line_label, text_template, color in baseline_styles:
        level = baseline[arm][name]
        ax.axhline(
            level,
            label=line_label,
            color=color,
            alpha=0.5,
            dashes=(3, 2),
            ls="--",
        )
        ax.text(-0.28, level + 0.001, text_template.format(level), ha="left")

    ax.set_ylabel("dpfs correlation")
    ax.set_title(f"Correlation with dpfs for {name}")
    ax.set_xticks(np.arange(len(labels)) + 0.30)
    ax.set_xticklabels(results["arm0"].keys())

    # Write each bar's value vertically, starting at the bar's base and running
    # in the direction the bar extends.
    for container in containers:
        for rect, value in zip(container, container.datavalues):
            height = rect.get_height()
            if height != 0:
                offset = math.copysign(0.002, height)
            else:
                offset = 0
            if value == 0:
                txt = "0"
            else:
                txt = "{:.6f}".format(value)
            ax.text(
                rect.get_x() + rect.get_width() / 2 + 0.01,
                rect.get_y() + offset,
                txt,
                fontsize="11",
                fontweight="regular",
                ha="center",
                va=("top" if height < 0 else "bottom"),
                rotation=90,
            )

    ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.05),
        fancybox=True,
        shadow=True,
        ncol=5,
    )

    plt.close(fig)
    return fig


# One tab per pathway: title it with the pathway name and render that
# pathway's correlation chart inside the matching Output widget.
tab = widgets.Tab(children=outputs)
for num, (out, nx_pw) in enumerate(zip(outputs, nx_pathways)):
    name = nx_pw.name
    tab.set_title(num, name)
    with out:
        display(make_corr_graph(name))

display(tab)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…

As shown by the plots above, gene-hierarchy does not seem to make a difference in improving the dpfs-mutation correlation for most pathways.
It is also clear that there is no common pattern among different pathways.

Based on these considerations, we now produce two datasets.
In the first one, we choose the best measure for each pathway and compute mutations for all patients. In the second one, all measures are kept in.

### Dataset production

In [52]:
def nan_swap(x):
    """Return ``x`` unchanged, or ``(x[0], 0)`` when its second item is NaN.

    ``x`` is a ``(label, value)`` pair.
    """
    if not math.isnan(x[1]):
        return x
    return (x[0], 0)


# For every pathway, print the top-ranked measure of each arm. Candidates are
# the baseline plus, for every measure, its plain and hierarchy ("(H)")
# correlation; NaN correlations of the measures are counted as 0.
# NOTE(review): candidates are ranked by signed value, so a weak negative
# correlation outranks a strong negative one and a NaN (turned into 0) outranks
# every negative value — confirm that ranking by |correlation| is not intended.
print("Pathway, arm0 measure, arm1 measure")
for pw in nx_pathways:
    arm0_res = [("baseline", baseline["arm0"][pw.name])]
    arm1_res = [("baseline", baseline["arm1"][pw.name])]
    for arm, candidates in (("arm0", arm0_res), ("arm1", arm1_res)):
        for measure in results[arm]:
            candidates.append(nan_swap((measure, results[arm][measure][pw.name])))
            candidates.append(
                nan_swap((measure + " (H)", hresults[arm][measure][pw.name]))
            )
    # Ascending stable sort: the last element is the highest-valued candidate.
    arm0_res = sorted(arm0_res, key=lambda x: x[1])
    arm1_res = sorted(arm1_res, key=lambda x: x[1])
    best0, best1 = arm0_res[-1], arm1_res[-1]
    print(f"{pw.name}: {best0[0]} ({best0[1]}), {best1[0]} ({best1[1]})")

Pathway, arm0 measure, arm1 measure
Cell Cycle: outdegree (H) (-0.044133632271152884), betweenness (H) (-0.1241860221712861)
HIPPO: baseline (0.042458961050619734), baseline (0.07618156944415712)
MYC: outdegree (H) (0.26819358258640064), eigenvector (0.05274478828133718)
NOTCH: indegree (0.07403685237045422), outdegree (H) (0.087203958884403)
NRF2: closeness (H) (0.260556538294026), baseline (0.11540925246081832)
PI3K: baseline (0.16945628396551027), betweenness (H) (0.20645834212068123)
RTK-RAS: eigenvector (-0.06105156670534166), indegree (0.14794631495585203)
TGF-Beta: outdegree (H) (0.028909160870003083), eigenvector (-0.05842357982158331)
TP53: outdegree (H) (0.022190845674342475), betweenness (H) (0.1670583447477449)
WNT: baseline (0.1268032385837714), betweenness (0.1080999675521082)


In [83]:
from collections import namedtuple
import networkx as nx
from analysis_nx import process_patients_with_config, PathwayConfig
import pathways as lpw
import pathways_nx as pnx
import os

# Per-pathway scoring choice for each arm, transcribed from the ranking printed
# above. Each entry is (pathway name, measure, flag): the measure is either a
# NetworkX centrality function or the string "baseline", and the flag is True
# exactly for the entries reported with the "(H)" suffix.
arm0_config = {
    pathway_name: PathwayConfig(measure, hierarchical)
    for pathway_name, measure, hierarchical in (
        ("Cell Cycle", nx.out_degree_centrality, True),
        ("HIPPO", "baseline", False),
        ("MYC", nx.out_degree_centrality, True),
        ("NOTCH", nx.in_degree_centrality, False),
        ("NRF2", nx.closeness_centrality, True),
        ("PI3K", "baseline", False),
        ("RTK-RAS", nx.eigenvector_centrality_numpy, False),
        ("TGF-Beta", nx.out_degree_centrality, True),
        ("TP53", nx.out_degree_centrality, True),
        ("WNT", "baseline", False),
    )
}
arm1_config = {
    pathway_name: PathwayConfig(measure, hierarchical)
    for pathway_name, measure, hierarchical in (
        ("Cell Cycle", nx.betweenness_centrality, True),
        ("HIPPO", "baseline", False),
        ("MYC", nx.eigenvector_centrality_numpy, False),
        ("NOTCH", nx.out_degree_centrality, True),
        ("NRF2", "baseline", False),
        ("PI3K", nx.betweenness_centrality, True),
        ("RTK-RAS", nx.in_degree_centrality, False),
        ("TGF-Beta", nx.eigenvector_centrality_numpy, False),
        ("TP53", nx.betweenness_centrality, True),
        ("WNT", nx.betweenness_centrality, False),
    )
}

# Parsed pathway tuples, ordered by their first element (the pathway name).
pathways = sorted(
    (lpw.parse_pathway("./pathways/" + pw) for pw in os.listdir("./pathways")),
    key=lambda x: x[0],
)

# NetworkX versions of the same files, ordered by pathway name.
nx_pathways = sorted(
    (
        pnx.pathway_to_nx("pathways/" + filename)
        for filename in os.listdir("./pathways")
    ),
    key=lambda pw: pw.name,
)

# Clinical log (one row per patient) and sequencing results.
patients_log = pandas.read_csv("TRIBE2_db.csv")
mutations_data = pandas.read_csv("TRIBE2_seq_res.csv")

# Patient id, treatment arm and dpfs: the only clinical columns needed here.
# PatientFirstName is kept as a regular column; it becomes the index only at
# join time below. (A bare `dpfs.set_index(...)` whose result was discarded
# used to follow this line; set_index returns a new frame, so it did nothing.)
dpfs = patients_log[["PatientFirstName", "arm", "dpfs"]]

# Arm 0: score each pathway with the measure chosen in arm0_config, index the
# result by patient and append that patient's dpfs.
arm0_mixed_df = (
    process_patients_with_config(
        patients_log[patients_log["arm"] == 0]["PatientFirstName"],
        nx_pathways,
        pathways,
        mutations_data,
        arm0_config,
    )
    .set_index("PatientFirstName")
    .join(
        dpfs[dpfs["arm"] == 0][["PatientFirstName", "dpfs"]].set_index(
            "PatientFirstName"
        ),
        on="PatientFirstName",
    )
)

# Arm 1: same procedure with arm1_config.
arm1_mixed_df = (
    process_patients_with_config(
        patients_log[patients_log["arm"] == 1]["PatientFirstName"],
        nx_pathways,
        pathways,
        mutations_data,
        arm1_config,
    )
    .set_index("PatientFirstName")
    .join(
        dpfs[dpfs["arm"] == 1][["PatientFirstName", "dpfs"]].set_index(
            "PatientFirstName"
        ),
        on="PatientFirstName",
    )
)

In [84]:
import arff
# Write one ARFF file per arm. The frames are indexed by patient, so `.values`
# holds only the per-pathway columns followed by dpfs.
arff.dump(
    "arm0_best_mutations.arff",
    arm0_mixed_df.values,
    relation="TRIBE2, ARM0, pathways with best correlations",
    names=arm0_mixed_df.columns,
)
arff.dump(
    "arm1_best_mutations.arff",
    arm1_mixed_df.values,
    relation="TRIBE2, ARM1, pathways with best correlations",
    names=arm1_mixed_df.columns,
)