In [1]:
import pandas as pd
import re
import numpy as np
import glob
import os
from collections import defaultdict
import networkx
from networkx.algorithms.components.connected import connected_components
from packages import npomix
import time
from datetime import datetime
from sklearn.metrics import jaccard_score

In [2]:
start = time.time()

In [3]:
edges_path = "./inputs/gnps_function/ProteoSAFe-METABOLOMICS-SNETS-V2-67f55c7d-download_cytoscape_data/bfcbd4d8e55c43e99283bb029c4bad89..selfloop"
nodes_path = "./inputs/gnps_function/ProteoSAFe-METABOLOMICS-SNETS-V2-67f55c7d-download_cytoscape_data/c985e9c2abe94283828bd01c94645a29.clustersummary"

In [4]:
nodes_df = pd.read_csv(nodes_path,sep='\t')

nodes_df[:5]

Unnamed: 0,AllGroups,DefaultGroups,EvenOdd,G1,G2,G3,G4,G5,G6,GNPSLinkout_Cluster,...,SpectrumID,UniqueFileSources,UniqueFileSourcesCount,cluster index,componentindex,number of spectra,parent mass,precursor charge,precursor mass,sum(precursor intensity)
0,,G1,1,8,0,0,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,...,,Bact_vulg_CL09T03C04_V1.3.mzXML|Bact_sp_9_1_42...,7,2,-1,8,84.043,0,84.043,12004.0
1,,G1,1,14,0,0,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,...,,Bacteroides_sp_1_1_30_V1.3.mzXML|Clos_bact_OBR...,12,4,-1,14,84.043,0,84.043,23389.0
2,,G1,1,20,0,0,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,...,,Bacteroides_sp_1_1_30_V1.3.mzXML|Bact_sp_1_1_6...,12,6,1846,20,84.044,0,84.044,46212.0
3,,G1,1,16,0,0,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,...,,Bacteroides_sp_1_1_30_V1.3.mzXML|Bact_sp_1_1_6...,13,11,1846,16,84.043,0,84.043,37704.0
4,,G1,1,6,0,0,0,0,0,https://gnps.ucsd.edu//ProteoSAFe/result.jsp?t...,...,,Bact_thet_CL09T03C10_V1.3.mzXML|Bacteroides_sp...,4,35,-1,6,84.043,0,84.043,12097.0


In [5]:
clusterindex_list = []

for i,r in nodes_df.iterrows():
    if type(r['LibraryID']) == float:
        clusterindex_list.append(r['cluster index'])
        
clusterindex_list

[2,
 4,
 6,
 11,
 35,
 734,
 741,
 742,
 745,
 1149,
 3315,
 3343,
 3355,
 3397,
 3695,
 3704,
 3812,
 3867,
 3868,
 3890,
 3988,
 4068,
 4609,
 5012,
 5161,
 5181,
 5183,
 5197,
 5224,
 5297,
 5303,
 5351,
 5352,
 5453,
 5618,
 5621,
 5625,
 5639,
 5640,
 5643,
 5693,
 5697,
 6027,
 6029,
 6040,
 6044,
 6046,
 6108,
 6191,
 6261,
 6434,
 6845,
 6941,
 6999,
 7207,
 7250,
 7251,
 7252,
 7265,
 7407,
 7411,
 7412,
 7435,
 7497,
 7714,
 7795,
 9852,
 10153,
 10154,
 10155,
 10156,
 10157,
 10159,
 10163,
 11250,
 11301,
 11315,
 11322,
 11325,
 11340,
 11341,
 11344,
 11376,
 12048,
 12075,
 12222,
 12386,
 12391,
 12393,
 12404,
 12410,
 12413,
 12418,
 12420,
 12424,
 12425,
 12426,
 12427,
 12428,
 12456,
 12469,
 12498,
 12513,
 12531,
 12535,
 12605,
 12614,
 12674,
 12675,
 12690,
 12733,
 12734,
 12736,
 12858,
 12861,
 12983,
 13220,
 13222,
 13224,
 13231,
 13236,
 13248,
 13282,
 13283,
 13294,
 13303,
 13333,
 13336,
 13342,
 13343,
 13355,
 13359,
 13393,
 13394,
 13395,
 133

In [6]:
edges_df = pd.read_csv(edges_path,sep='\t')

def get_neighbors(target,dataframe,column1,column2):
    subset1 = dataframe[(dataframe[column1]==target)]
    subcat = subset1.append(dataframe[(dataframe[column2]==target)])
    temp_list = []
    for index,row in subcat.iterrows():
        temp_list.append(subcat[column1][index])
        temp_list.append(subcat[column2][index])
    temp_list = list(np.unique(temp_list))
    return temp_list

def to_edges(l):
    it = iter(l)
    last = next(it)
    for current in it:
        yield last, current
        last = current

def to_graph(l):
    G = networkx.Graph()
    for part in l:
        G.add_nodes_from(part)
        G.add_edges_from(to_edges(part))
    return G

def get_family_dict(components_list,dataframe,dictionary,column1,column2,column3):
    count = 0
    for family in list(components_list):
        count += 1
        for fam_member in family:
            dictionary['MF%s'%count].append(fam_member)
    return dictionary

def main_get_families(gnps_df):
    targets_list = np.unique([gnps_df.CLUSTERID1,gnps_df.CLUSTERID2])
    neighbors_list = []
    for target in targets_list:
        neighbors_list.append(get_neighbors(target,gnps_df,'CLUSTERID1','CLUSTERID2'))
    G = to_graph(neighbors_list)
    C = connected_components(G)
    mf_dict = defaultdict(list)
    mf_dict = get_family_dict(C,gnps_df,mf_dict,'CLUSTERID1','CLUSTERID2','Cosine')
    return mf_dict

mf_dict = main_get_families(edges_df)

mf_dict

defaultdict(list,
            {'MF1': [2],
             'MF2': [4],
             'MF3': [6434,
              4068,
              13445,
              6,
              61689,
              5640,
              2060188,
              11,
              61648,
              3890,
              82035,
              17780,
              17788,
              5625,
              3868],
             'MF4': [35],
             'MF5': [734],
             'MF6': [741],
             'MF7': [742],
             'MF8': [3867, 745, 3355],
             'MF9': [1149],
             'MF10': [14875, 3315],
             'MF11': [3343],
             'MF12': [5181, 3397],
             'MF13': [3695],
             'MF14': [3704],
             'MF15': [7411, 3812, 7412, 7407],
             'MF16': [3988],
             'MF17': [4609],
             'MF18': [15552, 15585, 5012, 25748, 15546],
             'MF19': [5161],
             'MF20': [5297, 5197, 5183],
             'MF21': [5224],
             'MF22': [5303]

In [7]:
for ion in clusterindex_list:
    print(ion)

2
4
6
11
35
734
741
742
745
1149
3315
3343
3355
3397
3695
3704
3812
3867
3868
3890
3988
4068
4609
5012
5161
5181
5183
5197
5224
5297
5303
5351
5352
5453
5618
5621
5625
5639
5640
5643
5693
5697
6027
6029
6040
6044
6046
6108
6191
6261
6434
6845
6941
6999
7207
7250
7251
7252
7265
7407
7411
7412
7435
7497
7714
7795
9852
10153
10154
10155
10156
10157
10159
10163
11250
11301
11315
11322
11325
11340
11341
11344
11376
12048
12075
12222
12386
12391
12393
12404
12410
12413
12418
12420
12424
12425
12426
12427
12428
12456
12469
12498
12513
12531
12535
12605
12614
12674
12675
12690
12733
12734
12736
12858
12861
12983
13220
13222
13224
13231
13236
13248
13282
13283
13294
13303
13333
13336
13342
13343
13355
13359
13393
13394
13395
13398
13401
13403
13410
13412
13413
13414
13415
13418
13419
13425
13427
13429
13441
13445
13447
13451
13452
13481
13521
13533
13543
13564
13566
13569
13592
13598
13605
13607
13612
13614
13626
13627
13628
13634
13644
13652
13663
13665
13674
13678
13686
13696
13701
13718
1372

166932
167084
167125
167266
167298
167335
167395
167466
167481
167493
167657
167663
167721
167733
167759
167781
167783
167811
167906
167917
167938
167985
168034
168048
168096
168130
168193
168222
168251
168288
168303
168331
168414
168464
168531
168598
168609
168619
168759
168873
168876
168900
168902
169035
169055
169198
169207
169209
169351
169383
169391
169705
169805
170213
170283
170546
170599
170779
170892
171046
171152
171186
171289
171355
171445
171575
171755
171868
172077
172102
172221
172493
172514
172785
172875
173027
173064
173379
173420
173505
173624
173661
173755
173899
174076
174108
174216
174229
174239
174380
174475
174595
174645
174741
174960
174979
175077
175111
175112
175507
175592
175611
175682
175747
175829
175850
175934
176031
176144
176439
176479
176798
176962
176984
177070
177964
178456
178511
178514
178557
178660
178795
178832
178897
179240
179609
180805
181244
181709
181849
182008
182401
182408
182757
182880
183101
183155
183174
183382
183425
183426
183943
184567

570726
570781
570789
570802
570807
570818
571188
571433
571456
571645
571922
573565
573574
573656
573663
573676
573678
573687
573714
573733
573758
573764
573826
573845
573950
573961
573973
574068
574092
574100
574180
574200
574203
574218
574219
574245
574345
574412
574464
574466
574552
574556
574626
574637
574698
574986
574987
574993
575028
575036
575038
575047
575048
575452
575570
575628
575636
575655
575712
575791
575795
575798
575827
575853
575886
575934
576086
576111
576113
576115
576121
576145
576148
576149
576170
576175
576242
576286
576288
576355
576398
576649
576668
576739
576741
576742
576800
576802
576925
576965
577016
577679
578778
579467
579477
579961
580471
584022
584028
584030
584032
584077
584079
584080
584081
584082
584083
584091
584092
584104
584105
584158
584164
584236
584237
584242
584244
585731
585739
585789
585821
585825
585915
585963
585969
585978
585985
585986
586005
586014
586096
586267
586301
586995
587000
587001
587022
587023
587028
587043
587098
587138
587158

890036
890063
890097
890112
890169
890170
890171
890173
890177
890185
890194
890211
890212
890214
890215
890217
890218
890256
890259
890264
890268
890287
890324
890325
890380
890403
890509
890530
890758
890764
890856
890866
890908
890920
890921
890926
890949
891035
891076
891112
891124
891127
891132
891135
891139
891148
891150
891162
891168
891169
891172
891178
891188
891190
891192
891199
891203
891214
891218
891239
891255
891271
891278
891282
891284
891297
891303
891318
891319
891349
891351
891352
891367
891381
891414
891421
891462
891506
891539
891553
891626
891743
892107
892108
892110
892131
892135
892143
892147
892154
892155
892160
892166
892363
892376
892378
892380
892381
892385
892386
892424
892658
892680
892694
892762
892799
892805
892851
892987
893066
893294
893367
893969
893977
894109
894124
894157
894258
894287
894305
894306
894447
894451
894452
894454
894477
894481
894514
894535
894586
894599
894635
894667
894735
894756
894761
894778
894785
894801
894820
894827
894828
894832

1200023
1200024
1200041
1200061
1200096
1200097
1200116
1200143
1200242
1200453
1200581
1200681
1200684
1200700
1200701
1200728
1200775
1200934
1201254
1201272
1201278
1201279
1201694
1201732
1202584
1204902
1205139
1205193
1205289
1205609
1205614
1205619
1205640
1205672
1205739
1205752
1205787
1205823
1205846
1205867
1205911
1205982
1206003
1206080
1206126
1206133
1206155
1206205
1206782
1206783
1206791
1206792
1206820
1206889
1207159
1207173
1207174
1207182
1207183
1207207
1207328
1207788
1208157
1208160
1208164
1208286
1208297
1208339
1208343
1208344
1208348
1208378
1208387
1208420
1208424
1208450
1208463
1208656
1208765
1208767
1208773
1208780
1208818
1208824
1208828
1209042
1209078
1209097
1209104
1209117
1209129
1209131
1209144
1209151
1209182
1209198
1209450
1209545
1209548
1209594
1209628
1209631
1209632
1209677
1209721
1209734
1209738
1209739
1209752
1209756
1209757
1209758
1209761
1209814
1209822
1209863
1210122
1210123
1210124
1210136
1210155
1210167
1210185
1210192
1210194


1611431
1611436
1611437
1611516
1611541
1611557
1611659
1611684
1611768
1611814
1611823
1611826
1611832
1611839
1611879
1611889
1611905
1611908
1612077
1612078
1612079
1612080
1612081
1612082
1612089
1612092
1612100
1612194
1612256
1612411
1612875
1612971
1612998
1613064
1613165
1613214
1613780
1613979
1614134
1614809
1615599
1615663
1615713
1615794
1615871
1616017
1616019
1616040
1616048
1616057
1616196
1616249
1616259
1616341
1616495
1616512
1616517
1616520
1616543
1616562
1616584
1616594
1616615
1616624
1616652
1616666
1616718
1616953
1617069
1617133
1617135
1617393
1617396
1617444
1617511
1617519
1617525
1617527
1617534
1617548
1617573
1617577
1617592
1617593
1617603
1617643
1617654
1617666
1617686
1617794
1617817
1617820
1617830
1617834
1617873
1617875
1617883
1617885
1617897
1617900
1617907
1617941
1617942
1618039
1618040
1618082
1618121
1618127
1618258
1618343
1618348
1618356
1618373
1618391
1618487
1618526
1618559
1618583
1618693
1618757
1618800
1618968
1619020
1619129
1619305


1803161
1803175
1803179
1803208
1803447
1803535
1803550
1803584
1803585
1804070
1804073
1804078
1804082
1804091
1804094
1804102
1804107
1804199
1804200
1804202
1804204
1804254
1804267
1804269
1804284
1804302
1804303
1804310
1804324
1804329
1804337
1804370
1804421
1804427
1804445
1804531
1804558
1804579
1804674
1804708
1804710
1804714
1804729
1804741
1804751
1804754
1804756
1804762
1804766
1804768
1804774
1804785
1804802
1804805
1804825
1804856
1804858
1804861
1804897
1805188
1805229
1805242
1805314
1805652
1805660
1805661
1805687
1805763
1805817
1805890
1806145
1806154
1806181
1806325
1806329
1806331
1806332
1806391
1806393
1806425
1806433
1806463
1806549
1806720
1806766
1806904
1808547
1808550
1808551
1808555
1808562
1808569
1808614
1808618
1808619
1808620
1808622
1808624
1808627
1808629
1808631
1808632
1808634
1808636
1808644
1808648
1808655
1808656
1808661
1808684
1808685
1808697
1808699
1808734
1808739
1808755
1808756
1808767
1808769
1808774
1808791
1808810
1808830
1808852
1808862


1994315
1994323
1994335
1994340
1994417
1994419
1994444
1994461
1994463
1994465
1994485
1994487
1994489
1994494
1994662
1994669
1994691
1994708
1994712
1994740
1994980
1995044
1995058
1995130
1995134
1995138
1995145
1995157
1995159
1995160
1995170
1995172
1995175
1995185
1995193
1995242
1995263
1995272
1995275
1995281
1995283
1995290
1995292
1995294
1995299
1995301
1995320
1995325
1995339
1995357
1995364
1995372
1995450
1995661
1995672
1995679
1995682
1995689
1995692
1995706
1995708
1995741
1995764
1995792
1995800
1995803
1995888
1995902
1995904
1995906
1995911
1995943
1996149
1996158
1996159
1996721
1997133
1997174
1997175
1997178
1997181
1997184
1997186
1997188
1997227
1997228
1997257
1997281
1997293
1997318
1997325
1997342
1997351
1997392
1997396
1997408
1997420
1997426
1997430
1997435
1997452
1997543
1997766
1997768
1997780
1997783
1997792
1997846
1997852
1997897
1997903
1997914
1997924
1997936
1997942
1998034
1998043
1998054
1998058
1998060
1998065
1998068
1998074
1998075
1998079


2160445
2160448
2160452
2160481
2160500
2160569
2160653
2160654
2160675
2160683
2160689
2160691
2160714
2160716
2160956
2160974
2161205
2161241
2161252
2161262
2161321
2161351
2161357
2161361
2161391
2161449
2161467
2161482
2161536
2161545
2161549
2161593
2161615
2161616
2161644
2161650
2161670
2161677
2161711
2161911
2161912
2161914
2161915
2161916
2161929
2162080
2162087
2162099
2162108
2162109
2162116
2162131
2162157
2162185
2162191
2162202
2162207
2162241
2162251
2162263
2162322
2162329
2162380
2162383
2162385
2162400
2162405
2162444
2162459
2162504
2162508
2162512
2162613
2162764
2162808
2162813
2162814
2162821
2162822
2162824
2162832
2162886
2162967
2163019
2163731
2163882
2163934
2163937
2163944
2163959
2163969
2164027
2164038
2164042
2164058
2164078
2164084
2164099
2164122
2164133
2164156
2164202
2164203
2164204
2164208
2164216
2164670
2164671
2164679
2164691
2164700
2164703
2164704
2164707
2164711
2164714
2164720
2164721
2164723
2164740
2164742
2164745
2164816
2165157
2165173


2328066
2328067
2328069
2328078
2328104
2328145
2328171
2328231
2328232
2328237
2328240
2328251
2328259
2328511
2328612
2328637
2328709
2328768
2328801
2328807
2329126
2329172
2329431
2330011
2330164
2330285
2330289
2330315
2330326
2330554
2330674
2330812
2330869
2330918
2330920
2330953
2330954
2330957
2330997
2331031
2331143
2331217
2331224
2331226
2331230
2331241
2331248
2331262
2331267
2331290
2331292
2331313
2331386
2331510
2331587
2331591
2331636
2331637
2331638
2331642
2331769
2331774
2331846
2332091
2332121
2332137
2332207
2332216
2332246
2332281
2332283
2332308
2332381
2332397
2332532
2332549
2332559
2332761
2332981
2333014
2333017
2333018
2333023
2333112
2333127
2333132
2333135
2333141
2333144
2333158
2333171
2333185
2333252
2333268
2333270
2333274
2333279
2333341
2333385
2333388
2333393
2333402
2333412
2333421
2333422
2333425
2333433
2333447
2333452
2333577
2333580
2333583
2333587
2333610
2333636
2333641
2333642
2333643
2333928
2333931
2333936
2333949
2333950
2333954
2333956


2459658
2459694
2459695
2459698
2459700
2459701
2459741
2459744
2459766
2459767
2459768
2459800
2459851
2459869
2460029
2460030
2460048
2460064
2460078
2460098
2460102
2460110
2460123
2460136
2460167
2460173
2460317
2460321
2460335
2460340
2460342
2460343
2460359
2460364
2460393
2460407
2460409
2460427
2460430
2460464
2460524
2460536
2460537
2460538
2460685
2460701
2460798
2460800
2460859
2460891
2460903
2460933
2460956
2461004
2461009
2461023
2461035
2461048
2461049
2461050
2461078
2461116
2461156
2461177
2461200
2461201
2461203
2461216
2461220
2461255
2461423
2461425
2461457
2461459
2461491
2461492
2461528
2461565
2461661
2461664
2461700
2461724
2461802
2461915
2462042
2462107
2462110
2462111
2462156
2462170
2462172
2462173
2462174
2462271
2462386
2462390
2462395
2462432
2462455
2462470
2462516
2462523
2462574
2462576
2462584
2462600
2462679
2462880
2462901
2462917
2463108
2463111
2463117
2463135
2463146
2463162
2463164
2463178
2463185
2463186
2463187
2463190
2463193
2463197
2463205


In [8]:
lcms_file_list = []

for item in nodes_df['UniqueFileSources']:
    for lcms_file in item.split('|'):
        if lcms_file not in lcms_file_list:
            print(lcms_file)
            lcms_file_list.append(lcms_file)

Bact_vulg_CL09T03C04_V1.3.mzXML
Bact_sp_9_1_42FAA_V2.3.mzXML
Blongum44Bv1.3.mzXML
SspCM7v1.2.mzXML
Clos_clos_2_1_49FAA_V1.3.mzXML
SspSR1v1.2.mzXML
Clos_orbi_1_3_50AFAA_V1.3.mzXML
Bacteroides_sp_1_1_30_V1.3.mzXML
Clos_bact_OBRC5-5_V1.2.mzXML
SspOBRC6v1.2.mzXML
Bact_frag_CL07T12C05_V1.3.mzXML
Bact_cacc_CL03T12C61_V1.3.mzXML
SspCM6v1.2.mzXML
Para_merd_CL03T12C32_V1.3.mzXML
Bact_dore_CL02T00C15_V1.3.mzXML
Bact_sp_1_1_6_V2.3.mzXML
Bact_cell_CL02T12C19_V1.3.mzXML
Bact_frag_CL07T00C01_V1.3.mzXML
Bact_ster_CC31F_V1.3.mzXML
Acti_vis_C505_V3.3.mzXML
Bact_thet_CL09T03C10_V1.3.mzXML
GCA_000012265.2.mzXML
GCA_000012265.1.mzXML
GCA_000506385.2.mzXML
GCA_003324555.1.mzXML
GCA_001562525.1.mzXML
GCA_001562525.2.mzXML
GCA_000506385.4.mzXML
GCA_003324555.2.mzXML
Blongum35Bv1.3.mzXML
Bact_dore_CL02T12C06_V1.3.mzXML
Bifi_brev_HPH0326_V1.3.mzXML
SspBS29av1.2.mzXML
Bact_frag_CL03T12C07_V1.3.mzXML
ERX2291258.1.mzXML
ERX2291921.1.mzXML
ERX2291789.1.mzXML
BspMSTE12v1.3.mzXML
B111.4.mzML
7961.3.mzML
M92.1.mzML
M

651324085.2.mzXML
GCA_000144465.1.mzXML
GCA_000158335.4.mzXML
647533241.3.mzXML
2513237359.1.mzXML
GCA_000340725.4.mzXML
GCA_000144545.4.mzXML
GCA_000273015.2.mzXML
649989921.1.mzXML
643886104.4.mzXML
GCA_000162455.2.mzXML
2537561578.3.mzXML
2534682202.1.mzXML
GCA_000263115.4.mzXML
GCA_000144205.4.mzXML
GCA_000189615.1.mzXML
GCA_000411515.4.mzXML
2531839423.2.mzXML
649989912.4.mzXML
GCA_000145355.1.mzXML
GCA_000466585.3.mzXML
GCA_000183585.2.mzXML
GCA_000157075.4.mzXML
GCA_000145575.1.mzXML
2519103067.1.mzXML
GCA_000145455.2.mzXML
GCA_000273115.1.mzXML
GCA_000145075.1.mzXML
GCA_000144875.1.mzXML
GCA_000273465.3.mzXML
2537561582.1.mzXML
GCA_000507825.3.mzXML
2537561583.4.mzXML
649989921.4.mzXML
2537561578.4.mzXML
2531839424.1.mzXML
GCA_000145195.3.mzXML
2541047568.1.mzXML
2531839320.1.mzXML
GCA_000763055.1.mzXML
GCA_000090945.1.mzXML
647000294.4.mzXML
2531839304.2.mzXML
GCA_000145195.1.mzXML
GCA_000145195.2.mzXML
2531839114.3.mzXML
GCA_000414425.4.mzXML
2600255234.4.mzXML
647000295.2.mz

ERX2291863.1.mzXML
Sele_noxi_F0398_V1.2.mzXML
ERX2291766.1.mzXML
ERX2291904.1.mzXML
ERX2291787.1.mzXML
ERS4346493.1.mzXML
ERX2291809.1.mzXML
ERS4356307.1.mzXML
Kleb_oxyt_10-5250_V1.1.mzXML
ERS4341463.1.mzXML
ERS4346539.1.mzXML
ERS4341666.1.mzXML
ERS4341455.1.mzXML
ERS4346538.1.mzXML
ERS4341437.1.mzXML
ERS4346468.1.mzXML
ERS4341555.1.mzXML
ERS4341691.1.mzXML
ERS4341584.1.mzXML
ERS4341510.1.mzXML
ERS4341447.1.mzXML
ERS4341615.1.mzXML
ERS4341715.1.mzXML
ERS4341459.1.mzXML
ERS4341517.1.mzXML
ERS4341589.1.mzXML
ERS4346519.1.mzXML
ERS4341445.1.mzXML
ERS4341399.1.mzXML
ERS4346473.1.mzXML
ERS4346470.1.mzXML
ERS4346410.1.mzXML
ERS4346445.1.mzXML
ERS4346552.1.mzXML
ERS4341609.1.mzXML
ERS4341461.1.mzXML
ERS4341534.1.mzXML
ERS4341710.1.mzXML
ERS4341672.1.mzXML
ERS4356189.1.mzXML
ERS4341462.1.mzXML
ERS4346545.1.mzXML
ERS4341436.1.mzXML
GCA_010692685.7.mzXML
GCA_010672385.3.mzXML
GCA_010672455.3.mzXML
ERS4341374.1.mzXML
ERS4341364.1.mzXML
ERS4341656.1.mzXML
ERS4346563.1.mzXML
ERS4346536.1.mzXML
ERS4

ERS4341474.1.mzXML
ERS4346560.1.mzXML
ERS4341457.1.mzXML
ERS4346497.1.mzXML
ERS4346591.1.mzXML
ERX2291622.1.mzXML
ERX2291852.1.mzXML
ERX2291374.1.mzXML
ERS4346443.1.mzXML
ERS4356177.1.mzXML
ERS4356301.1.mzXML
ERS4356324.1.mzXML
ERS4346557.1.mzXML
ERS4346422.1.mzXML
2791354866.1.mzXML
2791354855.1.mzXML
ERS4341630.1.mzXML
ERS4341625.1.mzXML
ERS4341543.1.mzXML
ERX2291642.1.mzXML
GCF_000714595.4.mzML
GCF_000714595.3.mzML
GCF_000714595.5.mzML
GCF_000714595.6.mzML
GCF_000714595.2.mzML
ERS4356305.1.mzXML
ERS4356166.1.mzXML
ERX2291869.1.mzXML
ERX2291903.1.mzXML
ERX2291466.1.mzXML
ERS4341469.1.mzXML
ERX2291786.1.mzXML
ERX2291901.1.mzXML
GCA_000314005.1.mzML
GCA_000021805.1.mzML
GCA_000010065.1.mzML
GCA_000315585.1.mzML
GCA_000316575.1.mzML
GCA_000317635.1.mzML
GCA_000316665.1.mzML
GCA_003326215.1.mzML
GCA_000022045.1.mzML
GCA_000180455.1.mzML
GCA_000176895.1.mzML
GCA_000332235.1.mzML
GCA_000309385.1.mzML
GCA_000332315.1.mzML
GCA_000317085.1.mzML
GCA_000332195.1.mzML
GCA_000332195.2.mzML
GCA_00

GCA_003610995.5.mzXML
GCA_000156695.5.mzXML
2524614807.17.mzXML
GCA_000158915.7.mzXML
GCA_000377965.13.mzXML
2515154177.17.mzXML
2517572194.4.mzXML
2517572194.5.mzXML
GCA_000426325.1.mzXML
GCA_000701285.1.mzXML
GCA_000426165.4.mzXML
2561511112.1.mzXML
GCA_000240165.43.mzXML
2517572165.4.mzXML
GCA_000377105.4.mzXML
GCA_000424825.1.mzXML
GCA_000377165.4.mzXML
GCA_000240165.13.mzXML
GCA_000377125.6.mzXML
GCA_000424965.5.mzXML
GCA_000514715.1.mzXML
2517572165.5.mzXML
GCA_000377105.5.mzXML
GCA_000424825.4.mzXML
GCA_000514995.1.mzXML
GCA_000377125.4.mzXML
GCA_000424845.4.mzXML
GCA_000426325.6.mzXML
2561511112.4.mzXML
GCA_000377145.6.mzXML
GCA_000213055.6.mzXML
GCA_000515055.9.mzXML
GCA_000424765.4.mzXML
GCA_000482585.6.mzXML
GCA_000377545.6.mzXML
GCA_000377125.3.mzXML
GCA_000424845.6.mzXML
GCA_000426325.4.mzXML
GCA_000158975.6.mzXML
GCA_000701285.4.mzXML
2548876909.2.mzXML
GCA_000377105.6.mzXML
GCA_000527195.4.mzXML
GCA_000527195.6.mzXML
2517572165.6.mzXML
GCA_000424765.6.mzXML
GCA_000156435

In [9]:
len(lcms_file_list)

3240

In [10]:
def get_presence_files(nodes_df):
    for unique_list in list(nodes_df[nodes_df['cluster index'] == ion]['UniqueFileSources']):
        return list(unique_list.split('|'))

all_rows_list,testing_indexes_list = [],[]
for ion in clusterindex_list:
    subset_edges = edges_df[(edges_df.CLUSTERID1 == ion) | (edges_df.CLUSTERID2 == ion)]
    cosine = round(max(subset_edges['Cosine']),2)
    presence_list = get_presence_files(nodes_df)
    single_row_list = []
    for lcms_file in lcms_file_list:
        if lcms_file in presence_list:
            single_row_list.append(cosine)
        else:
            single_row_list.append(0)
    all_rows_list.append(single_row_list)
    testing_indexes_list.append(ion)
    
all_rows_list

pre_testing_df = pd.DataFrame(all_rows_list,index=testing_indexes_list,columns=lcms_file_list)

pre_testing_df

Unnamed: 0,Bact_vulg_CL09T03C04_V1.3.mzXML,Bact_sp_9_1_42FAA_V2.3.mzXML,Blongum44Bv1.3.mzXML,SspCM7v1.2.mzXML,Clos_clos_2_1_49FAA_V1.3.mzXML,SspSR1v1.2.mzXML,Clos_orbi_1_3_50AFAA_V1.3.mzXML,Bacteroides_sp_1_1_30_V1.3.mzXML,Clos_bact_OBRC5-5_V1.2.mzXML,SspOBRC6v1.2.mzXML,...,GCA_000087965.7.mzXML,GCA_000087965.5.mzXML,GCA_000087965.3.mzXML,GCA_000087965.2.mzXML,GCA_000087965.8.mzXML,GCA_000087965.9.mzXML,GCA_000087965.6.mzXML,GCA_000087965.1.mzXML,GCA_000377145.7.mzXML,2517434008.3.mzXML
2,1.00,1.00,1.0,1.00,1.0,1.00,1.0,0.00,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00,1.00,1.0,0.00,0.0,1.00,1.0,1.00,1.0,1.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.80,0.80,0.0,0.80,0.0,0.80,0.0,0.80,0.8,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.78,0.78,0.0,0.78,0.0,0.78,0.0,0.78,0.0,0.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.00,0.00,0.0,0.00,0.0,0.00,0.0,1.00,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2592184,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592200,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592203,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592207,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.00,0.0,0.00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
def get_final_df(training_df,testing_df,neighbors_array,results_folder):
#     final_df = pd.DataFrame(columns=('metabolite_ID','predicted_GCFs','max_jaccard','parent_mass','peak_count'))
    final_df = pd.DataFrame(columns=('metabolite_ID','predicted_GCFs','max_jaccard'))
    for i,ccms_id in enumerate(testing_df.index):
        jaccard_scores = []
        for j in range(0,len(neighbors_array[i])):
            query_bgc = neighbors_array[i][j]
            bgc_fp = training_df[training_df['label'] == query_bgc].iloc[0]
            bgc_fp = bgc_fp.drop("label")
            ms_fp = testing_df.loc[ccms_id]
            bgc_binary = npomix.get_binary(bgc_fp)
            ms_binary = npomix.get_binary(ms_fp)
            jaccard_scores.append(jaccard_score(bgc_binary,ms_binary))
        max_jaccard = round(max(jaccard_scores),2)
#         pepmass,peak_count = get_ms2_metadata(ccms_id)
        final_df.loc[i] = ccms_id,neighbors_array[i],max_jaccard
#     final_df.to_csv("%sfinal_df-NPOmix1.0-%s.txt"%(results_folder,current_date),sep="\t",index_label=False)
    return final_df

In [12]:
ena_df_file = "./ena_dict-210315.csv"
input_bigscape_net = "./bigscape_all_c030.txt"
antismash_folder = "./inputs/gnps_function/antismash_only_gbk/"
results_folder = "./lastest_results/npomix2results_gnps_validation/"

current_date = datetime.today().strftime('%Y%m%d')

if not os.path.isdir(results_folder):
    os.mkdir(results_folder)

k_value = 3

merged_ispec_mat = npomix.get_merged_ispec_mat(pre_testing_df)
merged_ispec_mat = npomix.renaming_merged_ispec_mat(ena_df_file,merged_ispec_mat)
print('Obtaining BiG-SCAPE dataframe and BiG-SCAPE dictionary')
bigscape_df,bigscape_dict = npomix.get_bigscape_df(ena_df_file,input_bigscape_net)
bigscape_df,bigscape_dict2 = npomix.rename_bigscape_df(antismash_folder,bigscape_df,bigscape_dict)
print('BiG-SCAPE create with %s GCFs'%len(bigscape_dict))
strain_list,bgcs_list = npomix.get_strain_list(bigscape_df)
print('Creating training dataframe')
affinity_df,affinity_bgcs = npomix.get_pre_training_df(bigscape_df,bigscape_dict2,strain_list,bgcs_list)
affinity_df = npomix.renaming_affinity_df(affinity_df)
networked_cols = npomix.get_networked_cols(merged_ispec_mat,affinity_df)
training_df,training_bgcs = npomix.get_training_df(affinity_df,networked_cols,results_folder,affinity_bgcs)
bgcs_df = pd.DataFrame(training_bgcs, columns=['bgcs'])
testing_df = npomix.get_testing_df(merged_ispec_mat,networked_cols,results_folder)
print('Running KNN using k equals to %s'%k_value)
neighbors_array = npomix.running_knn(training_df,testing_df,k_value)
print('Creating final dataframe')
final_df = get_final_df(training_df,testing_df,neighbors_array,results_folder)

Obtaining BiG-SCAPE dataframe and BiG-SCAPE dictionary
BiG-SCAPE create with 997 GCFs
Creating training dataframe
Running KNN using k equals to 3
Creating final dataframe


In [13]:
final_df

Unnamed: 0,metabolite_ID,predicted_GCFs,max_jaccard
0,2,"[GCF78, GCF209, GCF168]",0.25
1,4,"[GCF376, GCF376, GCF377]",0.22
2,6,"[GCF376, GCF376, GCF377]",0.22
3,11,"[GCF209, GCF168, GCF134]",0.00
4,35,"[GCF111, GCF111, GCF209]",0.25
...,...,...,...
30180,2592054,"[GCF446, GCF446, GCF446]",1.00
30181,2592183,"[GCF484, GCF484, GCF209]",1.00
30182,2592184,"[GCF484, GCF484, GCF209]",1.00
30183,2592207,"[GCF365, GCF365, GCF365]",0.67


In [14]:
testing_df.shape

(30185, 1024)

In [15]:
mibig_df = pd.read_csv("./matched_mibig_gnps_update.tsv",sep='\t')

mibig_name_dict = dict(zip(mibig_df['mibig_id'],mibig_df['mibig_name']))

ccmsid_mibig_dict = dict(zip(mibig_df['# mgf_spectrum_id'],mibig_df['mibig_id']))

ccmsid_mibig_dict

{'CCMSLIB00000001552': 'BGC0001000',
 'CCMSLIB00000006865': 'BGC0000001',
 'CCMSLIB00000075009': 'BGC0000950',
 'CCMSLIB00000075016': 'BGC0000950',
 'CCMSLIB00000075305': 'BGC0000055',
 'CCMSLIB00000075306': 'BGC0000055',
 'CCMSLIB00000075307': 'BGC0000706',
 'CCMSLIB00000075308': 'BGC0000706',
 'CCMSLIB00000075309': 'BGC0000724',
 'CCMSLIB00000075310': 'BGC0000724',
 'CCMSLIB00000075311': 'BGC0000455',
 'CCMSLIB00000075312': 'BGC0000455',
 'CCMSLIB00000075313': 'BGC0001453',
 'CCMSLIB00000075320': 'BGC0000985',
 'CCMSLIB00000075321': 'BGC0000985',
 'CCMSLIB00000075322': 'BGC0000985',
 'CCMSLIB00000075323': 'BGC0000985',
 'CCMSLIB00000075324': 'BGC0000985',
 'CCMSLIB00000075325': 'BGC0000985',
 'CCMSLIB00000075331': 'BGC0000016',
 'CCMSLIB00000075332': 'BGC0000016',
 'CCMSLIB00000077217': 'BGC0001310',
 'CCMSLIB00000077218': 'BGC0001310',
 'CCMSLIB00000078898': 'BGC0000901',
 'CCMSLIB00000081213': 'BGC0000389',
 'CCMSLIB00000081265': 'BGC0000820',
 'CCMSLIB00000081266': 'BGC0000820',
 

In [23]:
for i,r in final_df[final_df['max_jaccard'] > 0.7].iterrows():
    libhit = str(nodes_df[nodes_df['cluster index'] == r['metabolite_ID']]['LibraryID'].item())
    specID = str(nodes_df[nodes_df['cluster index'] == r['metabolite_ID']]['SpectrumID'].item())
    bgchits_list = []
    match = False
    for gcf in r['predicted_GCFs']:
        for bgc in bigscape_dict2[gcf]:
            bgc = bgc.split('.')[0]
            if 'BGC' in bgc:
                if bgc in mibig_name_dict:
                    bgchits_list.append(mibig_name_dict[bgc])
                    if specID in ccmsid_mibig_dict:
                        match = True
                        if ccmsid_mibig_dict[specID] == bgc:
                            print(r['metabolite_ID'],'true',gcf,specID,ccmsid_mibig_dict[specID],bgc,libhit)
                        else:
                            print(r['metabolite_ID'],'false',gcf,specID,ccmsid_mibig_dict[specID],bgc,'mibig=%s'%mibig_name_dict[bgc],'gnps=%s'%libhit)
#                     else:
#                         print('missing %s'%specID)
#                 else:
#                     print('missing %s'%bgc)
    if match == True:
        print(r['predicted_GCFs'],'\n')

In [22]:
final_df[final_df['max_jaccard'] >= 0.7]

Unnamed: 0,metabolite_ID,predicted_GCFs,max_jaccard
14,3704,"[GCF361, GCF361, GCF361]",0.75
69,11301,"[GCF361, GCF362, GCF362]",0.75
86,12420,"[GCF93, GCF93, GCF93]",1.00
109,13220,"[GCF497, GCF466, GCF158]",1.00
116,13283,"[GCF92, GCF91, GCF91]",1.00
...,...,...,...
30176,2580307,"[GCF81, GCF81, GCF81]",0.84
30180,2592054,"[GCF446, GCF446, GCF446]",1.00
30181,2592183,"[GCF484, GCF484, GCF209]",1.00
30182,2592184,"[GCF484, GCF484, GCF209]",1.00


In [18]:
end = time.time()
hours, rem = divmod(end-start, 3600)
minutes, seconds = divmod(rem, 60)
run_time = "{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds)
print(run_time)

00:18:04.03


In [19]:
get_data_df = pd.read_csv("./test/get_data.sh")[:-2]

get_data_df

Unnamed: 0,mkdir -p LCMS_Files
0,wget -O LCMS_Files/GCA_000240165.1.mzXML https...
1,wget -O LCMS_Files/GCA_000240165.2.mzXML https...
2,wget -O LCMS_Files/GCA_000240165.3.mzXML https...
3,wget -O LCMS_Files/GCA_000240165.4.mzXML https...
4,wget -O LCMS_Files/GCA_000240165.5.mzXML https...
...,...
2984,wget -O LCMS_Files/GCA_000286575.26.mzML https...
2985,wget -O LCMS_Files/GCA_000959505.25.mzML https...
2986,wget -O LCMS_Files/GCA_000959505.26.mzML https...
2987,wget -O LCMS_Files/GCA_003568605.25.mzML https...


In [20]:
not_missing = []
missing_count = 0

for item in merged_ispec_mat.columns:
    for wget_file in get_data_df['mkdir -p LCMS_Files']:
        if item in str(wget_file):
            not_missing.append(item)

for item in merged_ispec_mat.columns:
    if item not in not_missing:
        if 'ERR' not in item:
            print(item)
            missing_count += 1

missing_count

B111
M92
B146
B81
GCA_000158375
GCA_000295235
GCA_000294005
GCA_000414585
GCA_000162135
GCA_000145355
GCA_000296385
GCA_000340725
GCA_000414445
GCA_000414525
GCA_000159875
GCA_000273155
GCA_000414505
GCA_000414425
GCA_000185665
GCA_000261205
GCA_000145535
GCA_000144735
GCA_000162215
GCA_000145515
GCA_000159075
GCA_000144445
GCA_000466545
GCA_000145055
GCA_000218325
GCA_000209465
GCA_000144345
GCA_000287675
GCA_000144125
GCA_000307495
GCA_000144385
GCA_000414705
GCA_000145115
GCA_000273465
GCA_000414645
GCA_000414605
GCA_000295575
GCA_000307375
GCA_000162935
GCA_000144465
GCA_000162275
GCA_000273295
GCA_000144265
GCA_000144025
GCA_000185585
GCA_000295415
GCA_000273035
GCA_000507825
GCA_000147235
GCA_000261265
GCA_000144505
GCA_000163715
GCA_000025765
GCA_000175315
GCA_000273115
GCA_000144545
GCA_000177055
GCA_000144795
GCA_000163495
GCA_000413395
GCA_000144895
GCA_000154085
GCA_000145455
GCA_000144305
GCA_000414685
GCA_000234055
GCA_000144875
GCA_000478805
GCA_000287695
GCA_000411435
GC

173