## Constructing a coexpression network on GPL570-55999

In [1]:
from network_builder import make_pathway_from_thres
import networkx as nx
import os
import pandas as pd
import numpy as np
import numpy.ma as ma

pd.set_option("display.precision", 11)

gpl570 = pd.read_csv("GPL570-stripped.txt", sep="\t", low_memory=False)[
    ["ID", "Gene Symbol"]
]
gse = pd.read_csv("GSE40367-stripped.txt", sep="\t", low_memory=False)


tribe2_seq = pd.read_csv("TRIBE2_seq_res.csv")
dataset_genes = tribe2_seq["Biomarker"].unique()
print(f"Filtered {len(dataset_genes)} unique genes in the dataset")

gpl570 = gpl570[gpl570["Gene Symbol"].isin(dataset_genes)].set_index("ID")
gene_data = gpl570.join(gse.set_index("ID_REF"))
gene_expression = gene_data.groupby("Gene Symbol").mean()
print(f"Will compute coexpression on {len(gpl570)} genes")

coexpression = gene_expression.T.corr()
coexpression.to_csv("gse40367-coexpression.csv")

npc = coexpression.to_numpy()
coexpression_values = ma.array(
    npc, mask=np.triu(np.ones_like(npc, dtype=bool))
).flatten()

nx_pathways = [
    make_pathway_from_thres(thres, coexpression)
    for thres in np.linspace(40, 70, num=15) / 100
]

patients_log = pd.read_csv("TRIBE2_db.csv")
mutations_data = pd.read_csv("TRIBE2_seq_res.csv")

results = {}
results["arm0"] = {}
results["arm1"] = {}

Filtered 596 unique genes in the dataset
Will compute coexpression on 1744 genes


### Baseline

In [2]:
from analysis_nx import process_patients_with_f

def unweight(g):
    nodes = g.nodes()
    return {node: 1 for node in nodes}

arm0_df_baseline = process_patients_with_f(
    patients_log[patients_log["arm"] == 0]["PatientFirstName"],
    unweight,
    nx_pathways,
    mutations_data,
)
arm0_df_baseline.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,1.23654055013,1.2219640853,1.22203932327,1.16399098718,1.17389911535,1.19626764599,1.16927451796,1.15719349713,1.13544973545,1.16423056278,1.06430660915,0.91119176128,0.93319700068,0.88803746275,0.87341960434
std,0.95558683712,0.95633093701,0.95007561792,0.96154194295,0.98057174163,0.98792626808,0.98821115974,0.95279684208,0.98456263141,1.02590773416,0.97391984378,0.94753859975,0.98439874158,1.02621724147,1.07693620827
min,0.26391382406,0.26727272727,0.23148148148,0.23764258555,0.249500998,0.26371308017,0.20224719101,0.17821782178,0.08571428571,0.04347826087,0.05220883534,0.0,0.0,0.0,0.0
25%,0.87208258528,0.84727272727,0.82731481481,0.77376425856,0.77594810379,0.79746835443,0.75786516854,0.72772277228,0.71571428571,0.71571906355,0.63253012048,0.48366834171,0.46779141104,0.36422413793,0.15662650602
50%,1.0592459605,1.06636363636,1.04907407407,0.99809885932,0.96606786427,1.00210970464,0.99101123596,0.96658415842,0.94571428571,0.95484949833,0.87550200803,0.6959798995,0.68711656442,0.64224137931,0.60240963855
75%,1.30026929982,1.26727272727,1.25833333333,1.23003802281,1.23902195609,1.2753164557,1.22808988764,1.26670792079,1.20571428571,1.2525083612,1.12048192771,0.98869346734,1.10736196319,1.07327586207,1.16265060241
max,7.10592459605,7.15272727273,7.12962962963,7.10266159696,7.11576846307,7.20675105485,7.3393258427,6.95297029703,7.03428571429,7.60869565217,6.859437751,6.96984924623,6.70552147239,6.68965517241,7.50602409639


In [3]:
results["arm0"]["baseline"] = arm0_df_baseline.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm0"]["baseline"] = (
    results["arm0"]["baseline"][["dpfs"] + [pw.name for pw in nx_pathways]].corr().iloc[0]
)
print(f"{results['arm0']['baseline']}")

dpfs                          1.00000000000
GPL570-0.4                    0.14372996926
GPL570-0.4214285714285715     0.14501653701
GPL570-0.44285714285714284    0.13658084676
GPL570-0.4642857142857143     0.13478455722
GPL570-0.4857142857142857     0.13408585740
GPL570-0.5071428571428571     0.12864718616
GPL570-0.5285714285714286     0.11939435425
GPL570-0.55                   0.12043392858
GPL570-0.5714285714285714     0.11368638984
GPL570-0.5928571428571429     0.09512848175
GPL570-0.6142857142857143     0.10993481780
GPL570-0.6357142857142857     0.12752062171
GPL570-0.6571428571428573     0.13360433047
GPL570-0.6785714285714286     0.11128867243
GPL570-0.7                    0.05615121800
Name: dpfs, dtype: float64


In [4]:
from analysis_nx import process_patients_with_f

arm1_df_baseline = process_patients_with_f(
    patients_log[patients_log["arm"] == 1]["PatientFirstName"],
    unweight,
    nx_pathways,
    mutations_data,
)
arm1_df_baseline.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,1.25454862135,1.24402985075,1.24513543394,1.19376028602,1.19717281854,1.21803954909,1.17851752474,1.1672085119,1.17091684435,1.20076873159,1.1392135707,0.9642241056,0.95774196502,0.87352032939,0.93229634958
std,1.02242166382,1.0220806702,1.02422158847,1.04211472413,1.04940447188,1.07657544833,1.10387315302,1.08465304519,1.1316370741,1.15768458858,1.20033901852,1.14773055536,1.17878148477,1.15534546092,1.18252661189
min,0.49012567325,0.48363636364,0.49259259259,0.41254752852,0.43313373253,0.42827004219,0.37078651685,0.35148514851,0.28285714286,0.22408026756,0.04016064257,0.0,0.0,0.0,0.0
25%,0.8473967684,0.84454545455,0.85925925926,0.78897338403,0.81087824351,0.80327004219,0.77359550562,0.74443069307,0.70571428571,0.74498327759,0.67670682731,0.45854271357,0.43558282209,0.33836206897,0.32831325301
50%,1.0197486535,1.02363636364,1.01018518519,0.98098859316,0.96806387226,0.97784810127,0.95842696629,0.94801980198,0.94285714286,0.95150501672,0.91365461847,0.74371859296,0.71779141104,0.62068965517,0.6265060241
75%,1.2697486535,1.26409090909,1.24537037037,1.17633079848,1.18512974052,1.19725738397,1.1606741573,1.21349009901,1.19714285714,1.23160535117,1.15060240964,1.07035175879,1.08895705521,1.06034482759,1.20180722892
max,9.80430879713,9.86727272727,9.90185185185,10.0,10.11377245509,10.41772151899,10.62696629213,10.18316831683,10.44571428571,10.86622073579,11.33333333333,10.42713567839,11.13496932515,10.27586206897,10.25301204819


In [5]:
results["arm1"]["baseline"] = arm1_df_baseline.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm1"]["baseline"] = (
    results["arm1"]["baseline"][["dpfs"] + [pw.name for pw in nx_pathways]].corr().iloc[0]
)
print(f"{results['arm1']['baseline']}")

dpfs                          1.00000000000
GPL570-0.4                    0.09016054575
GPL570-0.4214285714285715     0.08560869230
GPL570-0.44285714285714284    0.08039369881
GPL570-0.4642857142857143     0.08684749484
GPL570-0.4857142857142857     0.08970408891
GPL570-0.5071428571428571     0.08792015551
GPL570-0.5285714285714286     0.07645129994
GPL570-0.55                   0.06592234750
GPL570-0.5714285714285714     0.06073064187
GPL570-0.5928571428571429     0.06357373859
GPL570-0.6142857142857143     0.05710582605
GPL570-0.6357142857142857     0.06871864975
GPL570-0.6571428571428573     0.06371521019
GPL570-0.6785714285714286     0.04799822907
GPL570-0.7                    0.08277337822
Name: dpfs, dtype: float64


### Degree

In [6]:
from analysis_nx import process_patients_with_f

arm0_df_outdeg = process_patients_with_f(
    patients_log[patients_log["arm"] == 0]["PatientFirstName"],
    nx.degree_centrality,
    nx_pathways,
    mutations_data,
)
arm0_df_outdeg.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,1.10699716279,1.09706533696,1.07190785139,1.05534992733,1.03438453571,1.01498085417,1.00051067959,0.99722156693,0.97023491249,0.95462083522,0.93921120864,0.88835174954,0.87852645148,0.89803921569,0.88331167425
std,0.98547243455,0.99018949688,0.98721425562,0.99267411472,0.99308938933,1.01015039072,1.01248485801,1.01702072102,1.02922951432,1.0472362674,1.09283178514,1.11216477021,1.13788308568,1.25504492491,1.31791393982
min,0.15429606625,0.14320293079,0.12951607557,0.10651142734,0.0637755102,0.05629013978,0.07032057911,0.06652512385,0.04138594803,0.01763907734,0.02504816956,0.0,0.0,0.0,0.0
25%,0.66811594203,0.63911422216,0.60436692078,0.59414079345,0.56951530612,0.52521722705,0.51072905895,0.51503892427,0.47256977863,0.40162822252,0.35886319846,0.26553867403,0.25652610442,0.17058823529,0.11842105263
50%,0.9192805383,0.91560905404,0.90706827975,0.89111686072,0.847930839,0.78570079335,0.76098759049,0.78096249115,0.71366698749,0.72693351425,0.65606936416,0.61049723757,0.56024096386,0.51764705882,0.43859649123
75%,1.16092132505,1.1514621222,1.15793834935,1.18113949978,1.17010345805,1.19399319985,1.17069544984,1.17639773531,1.18731953802,1.20742876526,1.10187861272,1.07527624309,1.13002008032,1.12720588235,1.11842105263
max,7.30170807453,7.33638623577,7.31479946967,7.27899956878,7.26828231293,7.32262939176,7.35599793175,7.4550601557,7.35178055823,7.21031207598,7.44701348748,7.70994475138,8.07630522088,8.82941176471,9.28947368421


In [7]:
results["arm0"]["degree"] = arm0_df_outdeg.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm0"]["degree"] = (
    results["arm0"]["degree"][["dpfs"] + [pw.name for pw in nx_pathways]].corr().iloc[0]
)
print(f"{results['arm0']['degree']}")

dpfs                          1.00000000000
GPL570-0.4                    0.11678438934
GPL570-0.4214285714285715     0.11256713461
GPL570-0.44285714285714284    0.11179861934
GPL570-0.4642857142857143     0.10658910127
GPL570-0.4857142857142857     0.10214060672
GPL570-0.5071428571428571     0.09547367286
GPL570-0.5285714285714286     0.08552707482
GPL570-0.55                   0.08797168311
GPL570-0.5714285714285714     0.08847755470
GPL570-0.5928571428571429     0.08924820687
GPL570-0.6142857142857143     0.08255354991
GPL570-0.6357142857142857     0.07710187429
GPL570-0.6571428571428573     0.08400400368
GPL570-0.6785714285714286     0.08803912089
GPL570-0.7                    0.07844890275
Name: dpfs, dtype: float64


In [8]:
from analysis_nx import process_patients_with_f

arm1_df_outdeg = process_patients_with_f(
    patients_log[patients_log["arm"] == 1]["PatientFirstName"],
    nx.degree_centrality,
    nx_pathways,
    mutations_data,
)
arm1_df_outdeg.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,1.15165361083,1.13454365618,1.11027869436,1.09741155156,1.07228102684,1.04925034818,1.04024795876,1.04943435688,1.02210075704,0.9984305069,0.99283208236,0.9297229323,0.90497812144,0.91108428446,0.93303220738
std,1.09221020195,1.10265590233,1.11647642592,1.12396036836,1.13254834198,1.14386429141,1.16754541325,1.18238927589,1.1891145434,1.19431075361,1.23992517467,1.27607506763,1.27685826221,1.35785576021,1.46506364842
min,0.30797101449,0.30930262986,0.28148823334,0.26509271238,0.19628684807,0.1820929354,0.15796277146,0.12314225053,0.07747834456,0.04545454545,0.03564547206,0.0,0.0,0.0,0.0
25%,0.73694358178,0.72023420123,0.69698790189,0.65734152652,0.63534580499,0.59392708727,0.56237073423,0.54131280962,0.49627045236,0.42469470828,0.38631984586,0.28383977901,0.30070281124,0.17647058824,0.1798245614
50%,0.94358178054,0.91616511841,0.88444647,0.87149633463,0.84063208617,0.79231205138,0.76732161324,0.76804670913,0.77598652551,0.71879240163,0.6767822736,0.60842541436,0.53514056225,0.48970588235,0.48684210526
75%,1.12210144928,1.11620109904,1.11940669539,1.10368154377,1.09842687075,1.09076312807,1.12797311272,1.16020877565,1.18407122233,1.15349389417,1.2119460501,1.05559392265,1.08885542169,1.16029411765,1.17214912281
max,10.4683747412,10.53087792752,10.66349022207,10.7035360069,10.72052154195,10.76860596902,10.97931747673,11.05767869781,10.89605389798,10.98507462687,11.03082851638,11.10773480663,10.94578313253,10.74705882353,11.8201754386


In [9]:
results["arm1"]["degree"] = arm1_df_outdeg.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm1"]["degree"] = (
    results["arm1"]["degree"][["dpfs"] + [pw.name for pw in nx_pathways]].corr().iloc[0]
)
print(f"{results['arm1']['degree']}")

dpfs                          1.00000000000
GPL570-0.4                    0.09188495871
GPL570-0.4214285714285715     0.09218505212
GPL570-0.44285714285714284    0.08986588750
GPL570-0.4642857142857143     0.09116139757
GPL570-0.4857142857142857     0.08962773709
GPL570-0.5071428571428571     0.08871169073
GPL570-0.5285714285714286     0.09178350750
GPL570-0.55                   0.09219674248
GPL570-0.5714285714285714     0.09064349328
GPL570-0.5928571428571429     0.09695961012
GPL570-0.6142857142857143     0.08049848110
GPL570-0.6357142857142857     0.09812845839
GPL570-0.6571428571428573     0.09683854816
GPL570-0.6785714285714286     0.08054760158
GPL570-0.7                    0.10413972100
Name: dpfs, dtype: float64


### Betweenness

In [10]:
from analysis_nx import process_patients_with_f

arm0_df_bet = process_patients_with_f(
    patients_log[patients_log["arm"] == 0]["PatientFirstName"],
    nx.betweenness_centrality,
    nx_pathways,
    mutations_data,
)
arm0_df_bet.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,1.09135717644,1.10051100834,1.04830605176,1.0134962156,1.03002886657,1.04051296171,1.00286239404,1.08549494606,1.03409168076,0.99898502003,0.97995711229,0.93357110372,0.88477371806,0.97719267788,0.94495557949
std,1.00926227014,1.04037904649,1.01301073915,0.99220304437,1.0043460274,1.03468492194,1.01579144254,1.13081386142,1.0878693103,1.1254679975,1.23714771434,1.38032349471,1.87191595091,2.00378083822,2.11136413923
min,0.12645392434,0.08905940085,0.03701282173,0.03736537048,0.04319762582,0.01573937444,0.05775690127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.61427333414,0.5922031854,0.56203876914,0.52435753333,0.55247852137,0.53538273216,0.49455481211,0.45082645947,0.378167693,0.28226117896,0.16685871677,0.01460482632,0.00333095408,0.0,0.0
50%,0.87975778956,0.88844394344,0.88098298722,0.82190072323,0.82510267453,0.83509743404,0.79406124275,0.82625774901,0.77884559886,0.75381415199,0.5496349301,0.34175882973,0.20197763415,0.09371810468,0.0
75%,1.28156312789,1.24890094186,1.20264625,1.16671531104,1.17262383626,1.15758570841,1.18741717118,1.28553383671,1.34587079408,1.31762347868,1.31378366017,1.47392375779,0.6820116313,1.0410033321,0.8341536167
max,7.19391650539,7.53263020736,7.41218704036,6.88875345684,7.38604319779,7.15916539228,6.8837209211,8.00502976152,7.12783500753,7.30789937598,7.14584577003,7.50936705843,9.94309359334,10.95630084829,11.64429530201


In [11]:
results["arm0"]["betweenness"] = arm0_df_bet.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm0"]["betweenness"] = (
    results["arm0"]["betweenness"][["dpfs"] + [pw.name for pw in nx_pathways]]
    .corr()
    .iloc[0]
)
print(f"{results['arm0']['betweenness']}")

dpfs                          1.00000000000
GPL570-0.4                    0.11854103059
GPL570-0.4214285714285715     0.10519153358
GPL570-0.44285714285714284    0.10301075498
GPL570-0.4642857142857143     0.10817949221
GPL570-0.4857142857142857     0.11656647025
GPL570-0.5071428571428571     0.11603392406
GPL570-0.5285714285714286     0.11247796776
GPL570-0.55                   0.10595166365
GPL570-0.5714285714285714     0.09646768187
GPL570-0.5928571428571429     0.10809712394
GPL570-0.6142857142857143     0.09508943638
GPL570-0.6357142857142857     0.12311933974
GPL570-0.6571428571428573     0.13297045424
GPL570-0.6785714285714286     0.16796101375
GPL570-0.7                    0.18546705067
Name: dpfs, dtype: float64


In [12]:
from analysis_nx import process_patients_with_f

arm1_df_bet = process_patients_with_f(
    patients_log[patients_log["arm"] == 1]["PatientFirstName"],
    nx.betweenness_centrality,
    nx_pathways,
    mutations_data,
)
arm1_df_bet.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,1.16273803893,1.15398378409,1.11204206258,1.07553591543,1.06722096186,1.05793084197,1.02684633857,1.12850650726,1.09985420207,1.09537285762,1.11670426625,1.09065773668,1.11452197148,1.17252093939,1.19737765312
std,1.13443066098,1.15892605336,1.18726898873,1.16989121778,1.18887450839,1.2031014769,1.24966407567,1.36410113767,1.36792238867,1.39788562645,1.54437104883,1.85969735747,2.1293325042,2.25572216074,2.6396495952
min,0.15791334733,0.12634216041,0.09247039332,0.08536074797,0.0951262947,0.0330437452,0.05949642612,0.02518466685,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.6802349203,0.68696428555,0.57132372422,0.5734049676,0.53799756805,0.50871452148,0.44328884108,0.52302924143,0.40987095318,0.34456508471,0.28065580413,0.02839125078,0.01356174161,0.0,0.0
50%,0.95064347893,0.9110912983,0.85885848614,0.85569147218,0.79225711649,0.76920681619,0.67349603426,0.75534665666,0.74460223876,0.75015252525,0.64257586056,0.41665536834,0.30267990396,0.18999915918,0.00984340045
75%,1.27232217409,1.25033161556,1.22358100356,1.15084936693,1.21656043509,1.16477051557,1.20061169162,1.41504037889,1.3197246919,1.31807275971,1.34480600113,1.44947071719,1.14156556844,1.30688181214,0.90302013423
max,11.02733247864,11.18308916593,11.39762701903,11.26503767734,11.39772554545,11.41317097626,11.96214218336,12.75760936274,11.75052636204,11.80670897215,12.55669302801,15.84405181151,15.10746102706,14.61153823523,16.65212527964


In [13]:
results["arm1"]["betweenness"] = arm1_df_bet.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm1"]["betweenness"] = (
    results["arm1"]["betweenness"][["dpfs"] + [pw.name for pw in nx_pathways]]
    .corr()
    .iloc[0]
)
print(f"{results['arm1']['betweenness']}")

dpfs                          1.00000000000
GPL570-0.4                    0.09442391610
GPL570-0.4214285714285715     0.09575111109
GPL570-0.44285714285714284    0.09622849179
GPL570-0.4642857142857143     0.10264417509
GPL570-0.4857142857142857     0.10009675461
GPL570-0.5071428571428571     0.09005214806
GPL570-0.5285714285714286     0.09987663232
GPL570-0.55                   0.11346883967
GPL570-0.5714285714285714     0.11994151434
GPL570-0.5928571428571429     0.11645801248
GPL570-0.6142857142857143     0.08776146693
GPL570-0.6357142857142857     0.14106763665
GPL570-0.6571428571428573     0.10256774851
GPL570-0.6785714285714286     0.09540877221
GPL570-0.7                    0.13280171954
Name: dpfs, dtype: float64


### Closeness

In [14]:
from analysis_nx import process_patients_with_f

arm0_df_clos = process_patients_with_f(
    patients_log[patients_log["arm"] == 0]["PatientFirstName"],
    nx.closeness_centrality,
    nx_pathways,
    mutations_data,
)
arm0_df_clos.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,1.21891080703,1.2037479889,1.19820193109,1.15686702633,1.16649895337,1.17518665772,1.13906023778,1.11889276494,1.11424117288,1.09684685839,1.05079933717,0.89705335265,0.85426751049,0.88942797257,0.88459976028
std,0.95814234448,0.95615787376,0.95254622415,0.96300448656,0.97623549101,0.98603079806,0.9896396699,0.96697219658,0.99561848496,0.99695962872,0.99116770685,1.07659559894,1.15513161683,1.23705781179,1.28853957886
min,0.26618898325,0.27099455294,0.24094370985,0.2241659382,0.22498494446,0.16741551316,0.18207699149,0.18161394238,0.04864550012,0.05302577109,0.06187973186,0.0,0.0,0.0,0.0
25%,0.86332675702,0.81895964468,0.78977778491,0.7617109435,0.76527197519,0.76459906323,0.76255337019,0.69599485201,0.67380973636,0.67229194146,0.59700094951,0.35131228252,0.18340401704,0.10565202761,0.06608977992
50%,1.0525314141,1.02515407867,1.02620322021,0.97398136643,0.98297770769,0.97420271805,0.95770381634,0.92292000043,0.90922232146,0.89769560653,0.87122425195,0.68552457161,0.57079038712,0.59538329525,0.54390071155
75%,1.27523251816,1.25047312698,1.24861671559,1.19039840879,1.22766516067,1.25224609328,1.18539706486,1.19347902392,1.18469252759,1.20466634621,1.11729662827,1.09740119531,1.08952069686,1.14063240395,1.18250449918
max,7.12172960163,7.14667242821,7.10357668559,7.07677816096,7.11351686032,7.21451195018,7.3831510208,7.20931347698,7.39650889617,7.61839972944,7.0871045238,8.10392996828,8.38987022779,8.65158992064,8.68848920612


In [15]:
results["arm0"]["closeness"] = arm0_df_clos.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm0"]["closeness"] = (
    results["arm0"]["closeness"][["dpfs"] + [pw.name for pw in nx_pathways]]
    .corr()
    .iloc[0]
)
print(f"{results['arm0']['closeness']}")

dpfs                          1.00000000000
GPL570-0.4                    0.13942178192
GPL570-0.4214285714285715     0.13875182911
GPL570-0.44285714285714284    0.13278195802
GPL570-0.4642857142857143     0.13184874409
GPL570-0.4857142857142857     0.12739056404
GPL570-0.5071428571428571     0.12008446896
GPL570-0.5285714285714286     0.10856819814
GPL570-0.55                   0.10434250077
GPL570-0.5714285714285714     0.09806099684
GPL570-0.5928571428571429     0.09868932302
GPL570-0.6142857142857143     0.10653595509
GPL570-0.6357142857142857     0.07004092959
GPL570-0.6571428571428573     0.07633230112
GPL570-0.6785714285714286     0.06783857633
GPL570-0.7                    0.06806131030
Name: dpfs, dtype: float64


In [16]:
from analysis_nx import process_patients_with_f

arm1_df_clos = process_patients_with_f(
    patients_log[patients_log["arm"] == 1]["PatientFirstName"],
    nx.closeness_centrality,
    nx_pathways,
    mutations_data,
)
arm1_df_clos.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,1.24071776499,1.22790702464,1.2235661511,1.1855317312,1.18900667011,1.20061026619,1.15659405714,1.14557830514,1.1583299028,1.13777809401,1.12184532101,0.96343312503,0.88072718815,0.86871649089,0.90906708192
std,1.02987350448,1.03096903202,1.03519831168,1.05632920234,1.06276007405,1.08679978575,1.10318933661,1.08914946393,1.1273073969,1.13743016065,1.20481018109,1.23956463101,1.30774738992,1.31404538443,1.31652817622
min,0.47993473704,0.46319145429,0.46448895675,0.42139071149,0.4362132803,0.42072664286,0.36708440919,0.31652789725,0.23992431286,0.09930368455,0.00483492508,0.0,0.0,0.0,0.0
25%,0.84414278173,0.84218938954,0.83918719773,0.79685020277,0.78940851043,0.79522205257,0.74946260439,0.71437674161,0.6821196035,0.69285418969,0.62992325405,0.39121923073,0.19367314101,0.12205872364,0.08612990674
50%,1.00437923422,1.00550533831,1.00084726754,0.97314683496,0.97051609219,0.96547286829,0.93404795644,0.92542045879,0.94627538954,0.91995884964,0.90378050813,0.68813400365,0.62714582366,0.52772802179,0.53797456601
75%,1.23755107,1.24078377249,1.22553659677,1.15647689608,1.15508661613,1.14212006212,1.1397098943,1.14653405995,1.16111529658,1.20111777353,1.2095382552,1.15102736661,1.02117638425,1.13282435059,1.34253558189
max,9.89121388689,9.94786601793,9.97274013428,10.13271697114,10.17239234782,10.43838589598,10.51958348198,10.20013349458,10.50351179308,10.67731900617,11.18403072303,11.2727547583,12.19983498807,11.22264944118,10.96943676579


In [17]:
results["arm1"]["closeness"] = arm1_df_clos.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm1"]["closeness"] = (
    results["arm1"]["closeness"][["dpfs"] + [pw.name for pw in nx_pathways]]
    .corr()
    .iloc[0]
)
print(f"{results['arm1']['closeness']}")

dpfs                          1.00000000000
GPL570-0.4                    0.09033770435
GPL570-0.4214285714285715     0.08643332256
GPL570-0.44285714285714284    0.08389899065
GPL570-0.4642857142857143     0.08842990520
GPL570-0.4857142857142857     0.09076266220
GPL570-0.5071428571428571     0.08602605963
GPL570-0.5285714285714286     0.07295237056
GPL570-0.55                   0.06304213315
GPL570-0.5714285714285714     0.05886824831
GPL570-0.5928571428571429     0.06002413531
GPL570-0.6142857142857143     0.06029867910
GPL570-0.6357142857142857     0.08285079755
GPL570-0.6571428571428573     0.08021812622
GPL570-0.6785714285714286     0.06390983073
GPL570-0.7                    0.08137356331
Name: dpfs, dtype: float64


### Eigenvector

In [18]:
from analysis_nx import process_patients_with_f

arm0_df_eigen = process_patients_with_f(
    patients_log[patients_log["arm"] == 0]["PatientFirstName"],
    nx.eigenvector_centrality_numpy,
    nx_pathways,
    mutations_data,
)
arm0_df_eigen.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,1.0328393562,1.01424326219,0.97861892026,0.95778374909,0.93226065513,0.89985675852,0.88051029107,0.88195739467,0.87716077969,0.87023345024,0.88240428957,0.841751580766,0.825447336727,0.831333716998,0.774194320432
std,0.99382246826,0.99736666477,0.998809897,1.02138712983,1.04037886188,1.08436563869,1.13068004496,1.21312546177,1.28282404119,1.32151190613,1.39431324086,1.45651743234,1.49336499502,1.56534121939,1.60477242166
min,0.11155798583,0.10648974861,0.10810793397,0.06620798931,0.02861253005,0.02954014524,0.01094936485,0.00539559598,0.00029333271,0.00026115818,4.737649e-05,-6.63633443626e-17,-4.56817126709e-16,-7.78112409694e-16,-4.2752378226e-16
25%,0.56118326977,0.53436175446,0.48751115244,0.45261941537,0.40434701648,0.31856234892,0.25591180013,0.1933906674,0.12503753989,0.09640579152,0.04295154742,0.0112754942912,7.75202377829e-17,0.0,0.0
50%,0.83562504827,0.81267273836,0.79488335276,0.72714716515,0.67471643667,0.5753007942,0.53449347166,0.44493025438,0.31927247173,0.25621166012,0.24381664791,0.166830802469,0.0314456868554,0.0297676923679,6.85677665062e-16
75%,1.15446659107,1.13372151734,1.12673214979,1.12493637068,1.12722788758,1.14344491983,1.1379831991,1.18242551649,1.29724370259,1.34817567983,1.38345667996,1.27011104955,1.06996484506,0.975684077679,0.767476154789
max,7.55505221594,7.61668680257,7.63602806636,7.74315895573,7.844655041,7.8995959117,8.01551172353,8.45522819327,8.67564331517,8.5949644152,8.87465769591,9.87606549256,10.3446178251,10.8397552473,11.3220175168


In [19]:
results["arm0"]["eigenvector"] = arm0_df_eigen.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm0"]["eigenvector"] = (
    results["arm0"]["eigenvector"][["dpfs"] + [pw.name for pw in nx_pathways]]
    .corr()
    .iloc[0]
)
print(f"{results['arm0']['eigenvector']}")

dpfs                          1.00000000000
GPL570-0.4                    0.11432971199
GPL570-0.4214285714285715     0.10963618317
GPL570-0.44285714285714284    0.10914652218
GPL570-0.4642857142857143     0.10108976923
GPL570-0.4857142857142857     0.09638422531
GPL570-0.5071428571428571     0.09396389375
GPL570-0.5285714285714286     0.08810309036
GPL570-0.55                   0.08623719358
GPL570-0.5714285714285714     0.08182107023
GPL570-0.5928571428571429     0.07605271352
GPL570-0.6142857142857143     0.06690074446
GPL570-0.6357142857142857     0.05199487423
GPL570-0.6571428571428573     0.05391136746
GPL570-0.6785714285714286     0.04965188273
GPL570-0.7                    0.07168890804
Name: dpfs, dtype: float64


In [20]:
from analysis_nx import process_patients_with_f

arm1_df_eigen = process_patients_with_f(
    patients_log[patients_log["arm"] == 1]["PatientFirstName"],
    nx.eigenvector_centrality_numpy,
    nx_pathways,
    mutations_data,
)
arm1_df_eigen.describe()

Unnamed: 0,GPL570-0.4,GPL570-0.4214285714285715,GPL570-0.44285714285714284,GPL570-0.4642857142857143,GPL570-0.4857142857142857,GPL570-0.5071428571428571,GPL570-0.5285714285714286,GPL570-0.55,GPL570-0.5714285714285714,GPL570-0.5928571428571429,GPL570-0.6142857142857143,GPL570-0.6357142857142857,GPL570-0.6571428571428573,GPL570-0.6785714285714286,GPL570-0.7
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,1.09302515169,1.06914090259,1.03504204003,1.01532821567,0.98657289952,0.93766056636,0.92624342589,0.92361136337,0.89927360227,0.84577490141,0.831727256296,0.766668638334,0.715475742977,0.702105329234,0.663100459715
std,1.11397785206,1.12651225877,1.13741819943,1.15911518438,1.17482734306,1.1933010149,1.24008446304,1.30004945495,1.31840134348,1.33319414133,1.38901790619,1.40219976722,1.43902486231,1.51939627035,1.57777290093
min,0.24013812848,0.21187591734,0.16659901938,0.13692996908,0.08653779057,0.06230626283,0.03925513089,0.00622591721,0.00137268913,2.036278e-05,-2.86514569295e-17,-5.47497058961e-17,-4.671955313549999e-16,-1.18832493817e-15,-3.09687714624e-16
25%,0.66937069787,0.62561933518,0.56731313365,0.52114455284,0.46295796388,0.37686577857,0.32890338018,0.25865342637,0.16136100168,0.1225051162,0.0762841746362,0.0243503084979,1.82749995357e-16,0.0,0.0
50%,0.86263937176,0.82618734327,0.79246106715,0.74059589004,0.70265944316,0.6614622015,0.61327652722,0.58107307176,0.54720065244,0.43106544965,0.366318275982,0.249053591295,0.0874916232727,0.0450297557848,0.00542140979771
75%,1.09164600779,1.07316178398,1.11745136917,1.13572422374,1.13611230835,1.08572705444,1.08412030105,1.09729534992,1.04515179236,1.02747960373,0.955380776516,0.868825147044,0.680871202166,0.599252466318,0.558890845004
max,10.58532450814,10.64880301041,10.69663075292,10.77326761021,10.73359936153,10.67692730366,10.75134334069,10.89273858298,10.49456119396,10.1499170554,9.90052964024,9.37360100882,9.1676094762,9.80590202547,10.6632715493


In [21]:
results["arm1"]["eigenvector"] = arm1_df_eigen.join(
    patients_log.set_index("PatientFirstName"), on="PatientFirstName"
)
results["arm1"]["eigenvector"] = (
    results["arm1"]["eigenvector"][["dpfs"] + [pw.name for pw in nx_pathways]]
    .corr()
    .iloc[0]
)
print(f"{results['arm1']['eigenvector']}")

dpfs                          1.00000000000
GPL570-0.4                    0.08978066186
GPL570-0.4214285714285715     0.08700120081
GPL570-0.44285714285714284    0.08141141802
GPL570-0.4642857142857143     0.07612063386
GPL570-0.4857142857142857     0.07480248066
GPL570-0.5071428571428571     0.06978959129
GPL570-0.5285714285714286     0.06586671687
GPL570-0.55                   0.05701591338
GPL570-0.5714285714285714     0.05417172632
GPL570-0.5928571428571429     0.05685813655
GPL570-0.6142857142857143     0.04712799304
GPL570-0.6357142857142857     0.05165155739
GPL570-0.6571428571428573     0.06093395158
GPL570-0.6785714285714286     0.04164017947
GPL570-0.7                    0.05115249846
Name: dpfs, dtype: float64


In [22]:
import matplotlib.pyplot as plt
import numpy as np
import ipywidgets as widgets
import math

plt.rcParams["figure.figsize"] = [14, 8]

outputs = [widgets.Output() for _ in range(len(nx_pathways))]


def replace_nans(vals):
    return [0 if math.isnan(x) else x for x in vals]


def make_corr_graph(name):
    arm0_data = []
    for measure in results["arm0"]:
        num = results["arm0"][measure][name]
        arm0_data.append(num)
    arm1_data = []
    for measure in results["arm1"]:
        num = results["arm1"][measure][name]
        arm1_data.append(num)

    arm0_data = replace_nans(arm0_data)
    arm1_data = replace_nans(arm1_data)

    fig, ax = plt.subplots()

    width = 0.15
    labels = results["arm0"].keys()
    x1 = np.arange(len(labels))
    x2 = [x + width for x in x1]

    r1 = ax.bar(x1, arm0_data, width, label="arm0")
    r2 = ax.bar(x2, arm1_data, width, label="arm1")

    ax.set_ylabel("dpfs correlation")
    ax.set_title(f"Correlation with dpfs for {name}")
    ax.set_xticks(np.arange(len(labels)))
    ax.set_xticklabels(results["arm0"].keys())

    for container in [r1, r2]:
        for idx, rect in enumerate(container):
            height = rect.get_height()
            txt = (
                "0"
                if container.datavalues[idx] == 0
                else "{:.6f}".format(container.datavalues[idx])
            )
            ax.text(
                rect.get_x() + rect.get_width() / 2 + 0.01,
                rect.get_y() + (math.copysign(0.002, height) if height != 0 else 0),
                txt,
                fontsize="11",
                fontweight="regular",
                ha="center",
                va=("top" if height < 0 else "bottom"),
                rotation=90,
            )

    ax.legend(
        loc="upper center",
        bbox_to_anchor=(0.5, -0.05),
        fancybox=True,
        shadow=True,
        ncol=5,
    )

    plt.close(fig)
    return fig


tab = widgets.Tab(children=outputs)
for num, out in enumerate(outputs):
    name = nx_pathways[num].name
    tab.set_title(num, name)
    with out:
        display(make_corr_graph(name))

display(tab)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output(), Output…

In [23]:
def nan_swap(x):
    return (x[0], 0) if math.isnan(x[1]) else x


measure_res = {}
for pw in nx_pathways:
    arm0_res = []
    arm1_res = []

    for measure in results["arm0"]:
        arm0_res.append(nan_swap((measure, results["arm0"][measure][pw.name])))

    for measure in results["arm1"]:
        arm1_res.append(nan_swap((measure, results["arm1"][measure][pw.name])))

    arm0_res.sort(key=lambda x: x[1])
    arm1_res.sort(key=lambda x: x[1])
    measure_res[pw.name] = {
        "arm0_type": arm0_res[-1][0],
        "arm0_value": arm0_res[-1][1],
        "arm1_type": arm1_res[-1][0],
        "arm1_value": arm1_res[-1][1],
    }

coexpression_measures = pd.DataFrame.from_dict(measure_res, orient='index')
coexpression_measures.sort_values('arm0_value', ascending=False)

Unnamed: 0,arm0_type,arm0_value,arm1_type,arm1_value
GPL570-0.7,betweenness,0.18546705067,betweenness,0.13280171954
GPL570-0.6785714285714286,betweenness,0.16796101375,betweenness,0.09540877221
GPL570-0.4214285714285715,baseline,0.14501653701,betweenness,0.09575111109
GPL570-0.4,baseline,0.14372996926,betweenness,0.0944239161
GPL570-0.44285714285714284,baseline,0.13658084676,betweenness,0.09622849179
GPL570-0.4642857142857143,baseline,0.13478455722,betweenness,0.10264417509
GPL570-0.4857142857142857,baseline,0.1340858574,betweenness,0.10009675461
GPL570-0.6571428571428573,baseline,0.13360433047,betweenness,0.10256774851
GPL570-0.5071428571428571,baseline,0.12864718616,betweenness,0.09005214806
GPL570-0.6357142857142857,baseline,0.12752062171,betweenness,0.14106763665


In [24]:
import networkx as nx
from analysis_nx import process_patients_with_config, PathwayConfig
import pathways_nx as pnx

patients_log = pd.read_csv("TRIBE2_db.csv")

dpfs = patients_log[["PatientFirstName", "arm", "dpfs"]]
dpfs.set_index("PatientFirstName")

arm0_mixed_df = process_patients_with_f(
    patients_log[patients_log["arm"] == 0]["PatientFirstName"],
    nx.betweenness_centrality,
    [pw for pw in nx_pathways if pw.name == 'GPL570-0.7'],
    mutations_data,
)
arm0_mixed_df = arm0_mixed_df.join(
    dpfs[dpfs["arm"] == 0][["PatientFirstName", "dpfs"]].set_index("PatientFirstName"),
    on="PatientFirstName",
)

arm1_mixed_df = process_patients_with_f(
    patients_log[patients_log["arm"] == 1]["PatientFirstName"],
    nx.betweenness_centrality,
    [pw for pw in nx_pathways if pw.name == 'GPL570-0.7'],
    mutations_data,
)
arm1_mixed_df = arm1_mixed_df.join(
    dpfs[dpfs["arm"] == 1][["PatientFirstName", "dpfs"]].set_index("PatientFirstName"),
    on="PatientFirstName",
)

arm0_mixed_df.to_csv("arm0_gpl0.7_mutations.csv", index=False)
arm1_mixed_df.to_csv("arm1_gpl0.7_mutations.csv", index=False)