# Enrichment Project: hg-002 controldb SV check

Follow-up from enrichment (Layman & NTD) discussion with Nikhil (GGC) regarding large average # of SVs per patient. HG-002 (GM24385) is filtered then clustered.

This notebook also tests consolidating Enrichment Workflow Steps 1 & 2 into a utility module (imported below as `enrichment_util`).

##  Enrichment Single Clustering (Filter then cluster)
Troubleshooting clustering code in cohort enrichment project.

load packages

In [1]:
# load packages
from itertools import groupby
from matplotlib_venn import venn2, venn2_circles
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy
from enrichment_util import *

import copy
import re 
import os
import pathlib
working_dir = pathlib.Path().absolute()
os.chdir(working_dir)


%matplotlib inline
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
# Column names applied to BED files that are read without a header row (see gaps_df below).
# source BED fields, in order: chrom	chromStart	chromEnd	Gene	Index	strand	chromStart2	chromEnd2	RGB
bed_headers = ["chr","RefStartPos","RefEndPos","Gene","Index","Strand","RefStartPos2","RefEndPos2","RGB"]

# Coordinates are cast to float and chr to int for the downstream filtering functions.
bed_dtypes = {'RefStartPos':'float64', 'RefEndPos':'float64', 'chr':'int'}

# Load hg38.knownCanonical.bed (read with the file's own header row; it must
# already provide the chr / RefStartPos / RefEndPos columns cast below)
bed_df = pd.read_csv(r"Input\ref_genes\hg38.knownCanonical.mapped_KH.bed", dtype=object, index_col=False, sep="\t")
bed_df = bed_df.astype(bed_dtypes)

# Load hg38_gaps.bed (no header row in the file, so bed_headers supplies the names)
gaps_df = pd.read_csv(r"Input\ref_genes\hg38_gaps.bed", dtype=object,  index_col=False, sep="\t", names=bed_headers)
gaps_df = gaps_df.astype(bed_dtypes)

# Load hg38 pseudogene list.bed (read with the file's own header row)
pseudo_df = pd.read_csv(r"Input\ref_genes\hg38_EncodeGencodePseudoGeneV41.bed",dtype=object,  index_col=False, sep="\t")
pseudo_df = pseudo_df.astype(bed_dtypes)

# NOTE: a trailing `for df in [bed_df, gaps_df, pseudo_df]: df = df.astype(...)` loop was removed.
# Rebinding the loop variable never modified the three frames (they are already cast above)
# and it silently overwrote the name `df`.

In [3]:
working_dir

WindowsPath('c:/Users/sshukor/OneDrive - Bionano Genomics/Documents/Enrichment Project/clustering_optimization_enrichment')

get smap column header and table rows, save as *_minimal.smap

In [52]:
# Raw string literals keep the Windows backslashes literal; the previous plain
# strings relied on invalid escape sequences (\c, \G, \e, \o) being passed through.
in_path = r"Input\controlsdb\GM24385\exp_refineFinal1_merged_filter_inversions.smap"
out_path = r"Output\output_hg002_clustering_04Jul2023\GM24385_minimal.smap"
# get_smap_table(in_path, out_path)

load sv table

In [53]:
# Read the minimal hg002 .smap table (tab-separated) into a DataFrame.
# Raw string: keeps the Windows backslashes literal instead of relying on invalid escapes.
hg002_path = r"Output\output_hg002_clustering_04Jul2023\GM24385_minimal.smap"
df = (
    pd.read_csv(hg002_path, sep='\t', dtype=object, index_col=False)
    # tag every SV with the sample it came from
    .assign(Sample_ID="GM24385")
    # harmonise column names with the ones used by the enrichment helpers
    .rename(columns={"RefcontigID1":"chr","SmapEntryID":"SmapID"})
    # everything is read as object; cast the columns used numerically downstream
    .astype({'RefStartPos':'float64', 'RefEndPos':'float64', 'chr':'int', 'SVsize':'float64'})
)

In [54]:
display(df.head())
df.shape, df['Type'].value_counts()
# df.dtypes

Unnamed: 0,SmapID,QryContigID,chr,RefcontigID2,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Confidence,Type,...,Genotype,GenotypeGroup,RawConfidence,RawConfidenceLeft,RawConfidenceRight,RawConfidenceCenter,SVsize,SVfreq,orientation,Sample_ID
0,4,3121,1,1,36313.3,60678.2,365535.0,387230.0,0.99,insertion,...,1,1,4.22,4.22,25.61,34.43,2669.9,0.07,,GM24385
1,5,681,1,1,2175119.8,2230505.2,0.0,617119.0,0.99,deletion,...,2,1,3.0,3.0,198.52,3669.43,561733.6,0.62,,GM24385
2,6,681,1,1,2119878.4,2128379.7,663861.0,672078.0,-1.0,insertion,...,1,2,1.19,9.24,148.7,1.19,284.4,0.443,,GM24385
3,7,681,1,1,2079978.7,2081775.4,710362.1,711828.9,-1.0,insertion,...,1,3,5.28,17.61,141.73,5.28,329.9,0.453,,GM24385
4,8,681,1,1,1985981.4,2003404.8,788500.0,803475.0,0.99,insertion,...,1,4,27.74,27.74,131.69,85.8,2448.5,0.491,,GM24385


((17515, 28),
 insertion                 9959
 deletion                  5388
 deletion_tiny              961
 insertion_tiny             499
 inversion_partial          173
 inversion                  172
 translocation_intrachr     123
 translocation_interchr      66
 duplication                 59
 duplication_inverted        53
 inversion_paired            42
 trans_intrachr_repeat       13
 duplication_split            6
 inversion_repeat             1
 Name: Type, dtype: int64)

flatten inversions

In [55]:
pd.unique(df['Type'])

array(['insertion', 'deletion', 'insertion_tiny', 'duplication_inverted',
       'inversion', 'inversion_partial', 'duplication', 'deletion_tiny',
       'inversion_paired', 'translocation_intrachr',
       'translocation_interchr', 'inversion_repeat',
       'trans_intrachr_repeat', 'duplication_split'], dtype=object)

In [56]:
inv_flat_df = flatten_df_inv(df)
inv_flat_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\GM24385_inv_flat_out.csv", index=False)
inv_flat_df.shape, inv_flat_df['Type'].value_counts()

Total: 17515
inversions, inversion nbase, and inversion & inversion_partials: 388


((17321, 28),
 insertion                 9959
 deletion                  5388
 deletion_tiny              961
 insertion_tiny             499
 inversion                  172
 translocation_intrachr     123
 translocation_interchr      66
 duplication                 59
 duplication_inverted        53
 inversion_paired            21
 trans_intrachr_repeat       13
 duplication_split            6
 inversion_repeat             1
 Name: Type, dtype: int64)

keep non-translocations and exclude gains/loss, *masked, *common, *tiny, *nbase and complex

In [57]:
# # Filter rare SVs and output selectd columns
# df_rareSV = filter_rare_svs(inv_flat_df, 1.0)
# # # df_rareSV = filter_rare_svs(case_df, 1.0)
# n_rare = len(df_rareSV) 

# exclude unwanted sv types
df_excluded = exclude_types_svs(inv_flat_df)
# df_excluded = exclude_types_svs(inv_flat_df)
# df_excluded = exclude_types_svs(case_df)

# filter for confidence intervals
df_sig = filter_sig_svs(df_excluded)
n_sig = len(df_sig)

df_sig.to_csv(rf"Output\output_hg002_clustering_04Jul2023\GM24385_case_filtered_out.csv", index=False)

In [58]:
df_sig.shape, df_sig['Type'].value_counts()

((6467, 28),
 insertion               4388
 deletion                1934
 duplication               59
 duplication_inverted      53
 inversion_paired          21
 inversion                  6
 duplication_split          6
 Name: Type, dtype: int64)

In [59]:
# Load the filtered SV list written by the previous step.
# Raw string: keeps the Windows backslashes literal instead of relying on invalid escapes.
sig_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\GM24385_case_filtered_out.csv", dtype=object,  index_col=False)
display(sig_df.head())
# everything is read as object; cast the coordinates used by the gap/gene filters
sig_df = sig_df.astype({'RefStartPos':'float64', 'RefEndPos':'float64', 'chr':'int'})

Unnamed: 0,SmapID,QryContigID,chr,RefcontigID2,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Confidence,Type,...,Genotype,GenotypeGroup,RawConfidence,RawConfidenceLeft,RawConfidenceRight,RawConfidenceCenter,SVsize,SVfreq,orientation,Sample_ID
0,707,42,1,1,16633490.1,16633490.1,103711691.0,103734414.0,0.99,inversion_paired,...,1,-1,-1.0,16069.91,5.25,-1.0,22723.0,0.455,,GM24385
1,959,211,1,1,531135.5,531135.5,149840746.0,149855154.0,0.98,inversion_paired,...,1,-1,-1.0,48.29,2.72,-1.0,14408.0,0.499,,GM24385
2,2795,12,2,2,30390774.8,30389965.3,211826848.0,211840496.0,0.9,inversion_paired,...,1,-1,-1.0,8376.11,4.8,-1.0,13648.0,0.556,,GM24385
3,7183,1860,7,7,249179.9,249179.9,5986390.0,6738295.0,0.86,inversion,...,1,2710,-1.0,61.41,33.66,-1.0,751905.0,0.189,,GM24385
4,7185,2402,7,7,218808.3,218808.3,5986390.0,6738295.0,0.86,inversion,...,2,2710,-1.0,31.7,49.95,-1.0,751905.0,0.192,,GM24385


In [60]:
# Remove SVs flagged as overlapping reference gaps (indel_ngaps).
# filter_nogaps is defined outside this notebook; judging by the summary printed below,
# has_gaps == True appears to mean "overlaps a gap" — confirm in enrichment_util.
# Rows with has_gaps == True are dropped; rows with has_gaps == False are kept.
sig_df["has_gaps"] = sig_df.apply(lambda x: filter_nogaps(x["Type"], x["chr"], x["RefStartPos"], x["RefEndPos"]), axis=1)
nogaps_df = sig_df.loc[sig_df["has_gaps"] == False]

# Tally both outcomes so the split can be sanity-checked against the input size.
num_true = len(sig_df[sig_df["has_gaps"] == True])
num_false = len(sig_df[sig_df["has_gaps"] == False])

print(f"{num_true} has gaps, and {num_false} has no gaps, for a total of {num_true + num_false} SVs")

# Persist the gap-free SVs; the next cell reloads this file.
nogaps_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\GM24385_case_nogaps_out.csv", index=False)

234 has gaps, and 6233 has no gaps, for a total of 6467 SVs


In [61]:
# Annotate each gap-free SV with the hg38 knownCanonical genes it overlaps.
# Raw strings keep the Windows backslashes literal instead of relying on invalid escapes.
nogaps_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\GM24385_case_nogaps_out.csv", dtype=object,  index_col=False)
# everything is read as object; cast the coordinates passed to filter_genes
nogaps_df = nogaps_df.astype({'RefStartPos':'float64','RefEndPos':'float64','chr':'int'})

# filter_genes (from enrichment_util) is evaluated row by row against bed_df
nogaps_df["OverlapGenes"] = nogaps_df.apply(lambda x: filter_genes(bed_df, x["chr"], x["RefStartPos"], x["RefEndPos"]), axis=1)

# "-" is the value counted below as "no overlapping gene"
no_overlaps = len(nogaps_df.loc[nogaps_df["OverlapGenes"] == "-"])
overlaps = len(nogaps_df.loc[nogaps_df["OverlapGenes"] != "-"])
print(f"From {no_overlaps + overlaps} cleaned SVs, {no_overlaps} SVs do not overlap any gene, while {overlaps} do.")

nogaps_df.to_csv(r"Output\output_hg002_clustering_04Jul2023\GM24385_case_cleaned_out.csv", index=False)

From 6233 cleaned SVs, 2202 SVs do not overlap any gene, while 4031 do.


In [62]:
# Reload the cleaned (gap-free, gene-annotated) SV table.
# Raw string: keeps the Windows backslashes literal instead of relying on invalid escapes.
sv_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\GM24385_case_cleaned_out.csv", dtype=object,  index_col=False)
# everything is read as object; cast the columns used numerically by the clustering step
sv_df = sv_df.astype({'RefStartPos':'float64', 'RefEndPos':'float64', 'SVsize':'float64', 'chr':'int'})

In [63]:
sv_df['OverlapGenes'] = sv_df.apply(lambda x: parse_sorted_unique_genes(x['OverlapGenes']), axis=1)
sv_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\GM24385_case_cleaned_out.csv", index=False)

In [64]:
sv_df.shape, sv_df['Type'].value_counts()

((6233, 30),
 insertion               4337
 deletion                1751
 duplication               59
 duplication_inverted      53
 inversion_paired          21
 inversion                  6
 duplication_split          6
 Name: Type, dtype: int64)

In [65]:
pd.options.mode.chained_assignment = None
# group_sv_by_gene(merged_df.head(5000))
in_df, out_df = group_sv_by_gene_single_cohort(sv_df)

28
39
60
101
126
142
180
213
226
237
265
285
311
325
362
384
407
417
439
463
479
486
509
544
558
575
576
594
624
625
635
647
661
682
683
701
710
733
750
759
768
30.0%/ 100% done
791
825
832
841
853
878
885
891
908
930
931
933
948
952
968
991
1003
1004
1014
1021
1028
1029
1038
1046
1052
1054
1065
1077
1078
1082
1091
1106
1114
1120
1128
1146
1151
1153
1156
1157
1169
1180
1186
1188
1194
1210
1215
1218
1228
1239
1247
1249
1250
1259
1272
1277
1296
1299
1366
1368
1371
1373
1374
1375
1376
1377
1378
1379
1380
1381
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
148

In [66]:
in_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\GM24385_case_grouped_sv_out.csv", index=False)
out_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\GM24385_case_grouped_cluster_out.csv", index=False)

breakdown of SV by type shared to AP for double checking

In [67]:
in_df['Type'].value_counts(),out_df['Type'].value_counts()

(insertion               4337
 deletion                1751
 duplication               59
 duplication_inverted      53
 inversion_paired          21
 inversion                  6
 duplication_split          6
 Name: Type, dtype: int64,
 insertion               2648
 deletion                1234
 duplication_inverted      30
 duplication               30
 inversion_paired          13
 duplication_split          4
 inversion                  3
 Name: Type, dtype: int64)

## Solve Clustering (Solve Cluster then filter)
compare with Solve clustering results

extract header and rows, ignoring other # lines

In [70]:
# Raw string literals keep the Windows backslashes literal; the previous plain
# strings relied on invalid escape sequences (\o, \s, \G) being passed through.
in_path = r"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_cluster_molecule_variant.txt"
out_path = r"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_minimal.smap"
# NOTE(review): the recorded run of this cell printed an output path ending in ".smap.smap",
# which suggests get_smap_table appends its own ".smap" suffix — confirm that the file
# loaded afterwards ("..._solve_cluster_minimal.smap") is the one written here.
get_smap_table(in_path, out_path)

output file path: Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_minimal.smap.smap


In [71]:
# Load the Solve-clustered SV table (tab-separated) into a DataFrame.
# Raw string: keeps the Windows backslashes literal instead of relying on invalid escapes.
solve_df = (
    pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_minimal.smap", sep='\t', dtype=object, index_col=False)
    # tag every SV with the sample it came from
    .assign(Sample_ID="GM24385")
    # harmonise column names with the ones used by the enrichment helpers
    .rename(columns={"RefcontigID1":"chr","SmapEntryID":"SmapID"})
    # everything is read as object; cast the columns used numerically downstream
    .astype({'RefStartPos':'float64', 'RefEndPos':'float64', 'chr':'int', 'SVsize':'float64'})
)

The variant clustering summary file was manually edited to remove the '#' comment lines, keeping only the header row.

In [72]:
solve_cluster_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_cluster_molecule_variant_summary.txt", sep='\t',dtype=object,  index_col=False)
solve_cluster_df['type'].value_counts()

insertion                 5827
deletion                  3385
inversion                   67
translocation_intrachr      44
translocation_interchr      32
duplication_direct          30
duplication_inverted        24
Name: type, dtype: int64

flatten inversions

In [73]:
pd.unique(solve_df['Type'])

array(['insertion', 'deletion', 'duplication_inverted', 'duplication',
       'duplication_split', 'inversion', 'inversion_partial',
       'inversion_paired', 'inversion_repeat', 'translocation_intrachr',
       'translocation_interchr', 'trans_intrachr_repeat'], dtype=object)

In [74]:
solve_inv_flat_df = flatten_df_inv(solve_df)
solve_inv_flat_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_inv_flat_out.csv", index=False)

Total: 16055
inversions, inversion nbase, and inversion & inversion_partials: 388


In [75]:
display(solve_inv_flat_df.head())
solve_inv_flat_df.shape, solve_inv_flat_df['Type'].value_counts()

Unnamed: 0,clusterId,SmapID,QryContigID,chr,RefcontigID2,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Confidence,...,Genotype,GenotypeGroup,RawConfidence,RawConfidenceLeft,RawConfidenceRight,RawConfidenceCenter,SVsize,SVfreq,orientation,Sample_ID
0,9267,148,502,1,1,10389807.2,10543577.5,13217647.0,13242763.5,0.0,...,1,-1,-1.0,9.02,617.71,-1.0,25116.5,0.438,,GM24385
1,9268,146,501,1,1,13976504.6,13990446.0,16616818.0,16942107.0,0.22,...,1,66,-1.0,2098.24,22.53,-1.0,325289.0,0.37,,GM24385
2,9268,165,41,1,1,104099695.4,104113661.8,16616818.0,16942107.0,0.0,...,2,66,-1.0,151.51,12.68,-1.0,325289.0,0.203,,GM24385
3,9268,173,42,1,1,104346895.3,104360861.8,16616818.0,16942107.0,0.0,...,2,66,-1.0,153.26,14.27,-1.0,325289.0,0.412,,GM24385
4,9268,177,41,1,1,103480089.7,103499002.6,16566162.0,16859181.0,0.0,...,2,66,-1.0,18.97,17794.52,-1.0,293019.0,0.195,,GM24385


((15861, 29),
 insertion                 9959
 deletion                  5388
 inversion                  172
 translocation_intrachr     123
 translocation_interchr      66
 duplication                 59
 duplication_inverted        53
 inversion_paired            21
 trans_intrachr_repeat       13
 duplication_split            6
 inversion_repeat             1
 Name: Type, dtype: int64)

keep non-translocations and exclude gains/loss, *masked, *common, *tiny, *nbase and complex

In [76]:
# # Filter rare SVs and output selectd columns
# df_rareSV = filter_rare_svs(inv_flat_df, 1.0)
# # # df_rareSV = filter_rare_svs(case_df, 1.0)
# n_rare = len(df_rareSV) 

# exclude unwanted sv types
solve_df_excluded = exclude_types_svs(solve_inv_flat_df, True)
# df_excluded = exclude_types_svs(inv_flat_df)
# df_excluded = exclude_types_svs(case_df)
n_exc = len(solve_df_excluded)

# filter for confidence intervals
solve_df_sig = filter_sig_svs(solve_df_excluded)
n_sig = len(solve_df_sig)

solve_df_sig.to_csv(rf"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_filtered_out.csv", index=False)

In [77]:
display(solve_df_sig.head())
solve_df_sig.shape,

Unnamed: 0,clusterId,SmapID,QryContigID,chr,RefcontigID2,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Confidence,...,Genotype,GenotypeGroup,RawConfidence,RawConfidenceLeft,RawConfidenceRight,RawConfidenceCenter,SVsize,SVfreq,orientation,Sample_ID
18,9270,707,42,1,1,16633490.1,16633490.1,103711691.0,103734414.0,0.99,...,1,-1,-1.0,16069.91,5.25,-1.0,22723.0,0.455,,GM24385
25,9273,959,211,1,1,531135.5,531135.5,149840746.0,149855154.0,0.98,...,1,-1,-1.0,48.29,2.72,-1.0,14408.0,0.499,,GM24385
35,9278,2795,12,2,2,30390774.8,30389965.3,211826848.0,211840496.0,0.9,...,1,-1,-1.0,8376.11,4.8,-1.0,13648.0,0.556,,GM24385
40,9281,7183,1860,7,7,249179.9,249179.9,5986390.0,6738295.0,0.86,...,1,2710,-1.0,61.41,33.66,-1.0,751905.0,0.189,,GM24385
41,9281,7185,2402,7,7,218808.3,218808.3,5986390.0,6738295.0,0.86,...,2,2710,-1.0,31.7,49.95,-1.0,751905.0,0.192,,GM24385


((6467, 29),)

In [78]:
# Load compiled sv list
solve_sig_df = pd.read_csv("Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_filtered_out.csv", dtype=object,  index_col=False)

solve_sig_df = solve_sig_df.astype({'RefStartPos':'float64', 'RefEndPos':'float64', 'chr':'int'})
display(solve_sig_df.head())

Unnamed: 0,clusterId,SmapID,QryContigID,chr,RefcontigID2,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Confidence,...,Genotype,GenotypeGroup,RawConfidence,RawConfidenceLeft,RawConfidenceRight,RawConfidenceCenter,SVsize,SVfreq,orientation,Sample_ID
0,9270,707,42,1,1,16633490.1,16633490.1,103711691.0,103734414.0,0.99,...,1,-1,-1.0,16069.91,5.25,-1.0,22723.0,0.455,,GM24385
1,9273,959,211,1,1,531135.5,531135.5,149840746.0,149855154.0,0.98,...,1,-1,-1.0,48.29,2.72,-1.0,14408.0,0.499,,GM24385
2,9278,2795,12,2,2,30390774.8,30389965.3,211826848.0,211840496.0,0.9,...,1,-1,-1.0,8376.11,4.8,-1.0,13648.0,0.556,,GM24385
3,9281,7183,1860,7,7,249179.9,249179.9,5986390.0,6738295.0,0.86,...,1,2710,-1.0,61.41,33.66,-1.0,751905.0,0.189,,GM24385
4,9281,7185,2402,7,7,218808.3,218808.3,5986390.0,6738295.0,0.86,...,2,2710,-1.0,31.7,49.95,-1.0,751905.0,0.192,,GM24385


ngaps and overlapgenes

In [79]:
# Remove SVs flagged as overlapping reference gaps (indel_ngaps).
# filter_nogaps is defined outside this notebook; judging by the summary printed below,
# has_gaps == True appears to mean "overlaps a gap" — confirm in enrichment_util.
# Rows with has_gaps == True are dropped; rows with has_gaps == False are kept.
solve_sig_df["has_gaps"] = solve_sig_df.apply(lambda x: filter_nogaps(x["Type"], x["chr"], x["RefStartPos"], x["RefEndPos"]), axis=1)
solve_nogaps_df = solve_sig_df.loc[solve_sig_df["has_gaps"] == False]

# Tally both outcomes so the split can be sanity-checked against the input size.
num_true = len(solve_sig_df[solve_sig_df["has_gaps"] == True])
num_false = len(solve_sig_df[solve_sig_df["has_gaps"] == False])

print(f"{num_true} has gaps, and {num_false} has no gaps, for a total of {num_true + num_false} SVs")

# Persist the gap-free SVs; the next cell reloads this file.
solve_nogaps_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_nogaps_out.csv", index=False)

234 has gaps, and 6233 has no gaps, for a total of 6467 SVs


In [81]:
# Annotate each gap-free Solve-clustered SV with the hg38 knownCanonical genes it overlaps.
# Raw strings keep the Windows backslashes literal instead of relying on invalid escapes.
solve_nogaps_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_nogaps_out.csv", dtype=object,  index_col=False)
# everything is read as object; cast the coordinates passed to filter_genes
solve_nogaps_df = solve_nogaps_df.astype({'RefStartPos':'float64','RefEndPos':'float64','chr':'int'})

# filter_genes (from enrichment_util) is evaluated row by row against bed_df
solve_nogaps_df["OverlapGenes"] = solve_nogaps_df.apply(lambda x: filter_genes(bed_df, x["chr"], x["RefStartPos"], x["RefEndPos"]), axis=1)

# "-" is the value counted below as "no overlapping gene"
no_overlaps = len(solve_nogaps_df.loc[solve_nogaps_df["OverlapGenes"] == "-"])
overlaps = len(solve_nogaps_df.loc[solve_nogaps_df["OverlapGenes"] != "-"])
print(f"From {no_overlaps + overlaps} cleaned SVs, {no_overlaps} SVs do not overlap any gene, while {overlaps} do.")

solve_nogaps_df.to_csv(r"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_cleaned_out.csv", index=False)

From 6233 cleaned SVs, 2202 SVs do not overlap any gene, while 4031 do.


In [82]:
# Reload the cleaned (gap-free, gene-annotated) Solve-clustered SV table.
# Raw string: keeps the Windows backslashes literal instead of relying on invalid escapes.
solve_sv_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_cleaned_out.csv", dtype=object,  index_col=False)
# everything is read as object; cast the columns used numerically by the clustering step
solve_sv_df = solve_sv_df.astype({'RefStartPos':'float64', 'RefEndPos':'float64', 'SVsize':'float64', 'chr':'int'})

In [83]:
solve_sv_df['OverlapGenes'] = solve_sv_df.apply(lambda x: parse_sorted_unique_genes(x['OverlapGenes']), axis=1)

How many filtered clusters?

In [84]:
solve_sv_df['Type'].value_counts(), solve_cluster_df.loc[solve_cluster_df['clusterId'].isin(solve_sv_df['clusterId'])]['type'].value_counts()

(insertion               4337
 deletion                1751
 duplication               59
 duplication_inverted      53
 inversion_paired          21
 inversion                  6
 duplication_split          6
 Name: Type, dtype: int64,
 insertion               2597
 deletion                1217
 duplication_direct        30
 duplication_inverted      24
 inversion                 16
 Name: type, dtype: int64)

In [85]:
len(solve_sv_df['clusterId'].unique())

3884

In [86]:
solve_sv_df.columns

Index(['clusterId', 'SmapID', 'QryContigID', 'chr', 'RefcontigID2',
       'QryStartPos', 'QryEndPos', 'RefStartPos', 'RefEndPos', 'Confidence',
       'Type', 'XmapID1', 'XmapID2', 'LinkID', 'QryStartIdx', 'QryEndIdx',
       'RefStartIdx', 'RefEndIdx', 'Zygosity', 'Genotype', 'GenotypeGroup',
       'RawConfidence', 'RawConfidenceLeft', 'RawConfidenceRight',
       'RawConfidenceCenter', 'SVsize', 'SVfreq', 'orientation', 'Sample_ID',
       'has_gaps', 'OverlapGenes'],
      dtype='object')

In [None]:
pd.options.mode.chained_assignment = None
# group_sv_by_gene(merged_df.head(5000))
solve_in_df, solve_out_df = group_sv_by_gene_single_cohort(solve_sv_df)

28
39
60
101


KeyboardInterrupt: 

In [None]:
solve_in_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_grouped_sv_out.csv", index=False)
solve_out_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_cluster_grouped_cluster_out.csv", index=False)

breakdown of SV by type shared to AP for double checking

In [None]:
solve_in_df['Type'].value_counts(),solve_out_df['Type'].value_counts()

(insertion               4337
 deletion                1751
 duplication               59
 duplication_inverted      53
 inversion_paired          21
 inversion                  6
 duplication_split          6
 Name: Type, dtype: int64,
 insertion               2648
 deletion                1234
 duplication_inverted      30
 duplication               30
 inversion_paired          13
 duplication_split          4
 inversion                  3
 Name: Type, dtype: int64)

also inspect variant summary file

### inspect clusters between solve and cohort

In [None]:
# Compare the Solve cluster assignment ('clusterId') with the cluster assignment
# stored in 'cluster_ID': a Solve cluster is "discordant" when its SVs were
# spread over more than one 'cluster_ID'.
solve_cID = solve_in_df['clusterId'].unique()

# One groupby pass instead of filtering the whole frame once per Solve cluster.
# dropna=False keeps a missing cluster_ID counted as its own value.
ids_per_solve_cluster = solve_in_df.groupby('clusterId')['cluster_ID'].nunique(dropna=False)
discordant = ids_per_solve_cluster.index[ids_per_solve_cluster > 1].tolist()

# mark rows with discordant cluster IDs (vectorised membership test)
solve_in_df['discordant'] = solve_in_df['clusterId'].isin(discordant)
solve_in_df.loc[solve_in_df['discordant'] == True].to_excel(rf"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_discordant_cluster.xlsx")

## Solve Clustering 2.0: (Filter then Solve Cluster)
`Output\output_hg002_clustering_04Jul2023\GM24385_case_cleaned_out.csv` converted to .smap and used as input for clustering. Output file treated as `solve_in_df`, using `cluster_ID` column from Solve variant_clustering binary.

Path to .smap: `Enrichment Project\clustering_optimization_enrichment\Output\output_hg002_clustering_04Jul2023\filter_then_solve_clustering\GM24385_case_cleaned_out.smap`

load filtered .csv and transform to .smap ("#" headers manually added)

In [None]:
sv_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\filter_then_solve_clustering\GM24385_case_cleaned_out.csv", dtype=object,  index_col=False)
sv_df.to_csv(r"Output\output_hg002_clustering_04Jul2023\filter_then_solve_clustering\GM24385_case_cleaned_out.smap", sep ='\t', index=False)

clustering done using solve arguments

convert to header + table

In [None]:
# in_path = r"Output\output_hg002_clustering_04Jul2023\filter_then_solve_clustering\GM24385_solve_cluster_molecule_variant_summary.txt"
# out_path = r"Output\output_hg002_clustering_04Jul2023\filter_then_solve_clustering\GM24385_solve_clusters.txt"
# get_smap_table(in_path, out_path)

In [None]:
final_sv_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\filter_then_solve_clustering\GM24385_solve_cluster_molecule_variant.txt", sep='\t',dtype=object,  index_col=False)
final_cluster_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\filter_then_solve_clustering\GM24385_solve_cluster_molecule_variant_summary.txt", sep='\t',dtype=object,  index_col=False)

In [None]:
final_sv_df['Type'].value_counts(),final_cluster_df['type'].value_counts()

(insertion               4337
 deletion                1751
 duplication               59
 duplication_inverted      53
 duplication_split          6
 Name: Type, dtype: int64,
 insertion               2597
 deletion                1217
 duplication_direct        30
 duplication_inverted      24
 Name: type, dtype: int64)

## Enrichment Single Clustering (filter then cluster sans OverlapGenes)

Discrepancies might be due to clustering parameters

enrichment clustering accounts for OverlapGenes, while solve clustering doesn't

In [None]:
# Reload the cleaned (gap-free, gene-annotated) SV table for the gene-agnostic clustering.
# Raw string: keeps the Windows backslashes literal instead of relying on invalid escapes.
cleaned_df = pd.read_csv(r"Output\output_hg002_clustering_04Jul2023\GM24385_case_cleaned_out.csv", dtype=object,  index_col=False)
# everything is read as object; cast the columns used numerically by the clustering step
cleaned_df = cleaned_df.astype({'RefStartPos':'float64', 'RefEndPos':'float64', 'SVsize':'float64', 'chr':'int'})

In [None]:
cleaned_df['OverlapGenes'] = cleaned_df.apply(lambda x: parse_sorted_unique_genes(x['OverlapGenes']), axis=1)
# cleaned_df.to_csv(rf"Output\output_hg002_clustering_04Jul2023\GM24385_case_cleaned_out.csv", index=False)

In [None]:
cleaned_df.shape, cleaned_df['Type'].value_counts()

((6233, 30),
 insertion               4337
 deletion                1751
 duplication               59
 duplication_inverted      53
 inversion_paired          21
 inversion                  6
 duplication_split          6
 Name: Type, dtype: int64)

In [88]:
# Single cohort clustering
# 2 Helper functions to cluster SVs. Clustering removes redundant SVs, keeping only `unique` SVs
pd.options.mode.chained_assignment = None
# Helper function to cluster gene coordinates, given a group of SV(s) grouped by gene, chr, zygo, and type
def cluster_refpos_single_nogenes(prev_cluster_ID, df_in, posWindow, reciprocalSize):
    """Greedily cluster the SVs of one group by reference position (and size).

    The first not-yet-clustered row is used as the seed of a new cluster. Every
    other unclustered row whose start lies in the seed's position window (or
    which spans the seed start) joins the cluster. For insertion, deletion and
    duplication types the row must also be within `reciprocalSize` percent of
    the seed's SVsize in both directions. This repeats until no unclustered
    row is left.

    Parameters
    ----------
    prev_cluster_ID : int
        Last cluster ID already in use; each new cluster gets the next integer.
    df_in : pandas.DataFrame
        SVs of one group. Must provide the columns OverlapGenes, Zygosity, chr,
        Type, RefStartPos, RefEndPos, SVsize, Sample_ID and a boolean
        `clustered` column. The Type of the first row decides whether the size
        criterion is applied to the whole frame. The caller's frame is not
        modified.
    posWindow : float
        Position tolerance. The effective window is
        [seed start, seed end + 2 * posWindow] (see the arithmetic below).
    reciprocalSize : float
        Minimum reciprocal size similarity, in percent.

    Returns
    -------
    tuple
        (last cluster ID used, per-SV frame with `cluster_ID` filled in and the
        most recent cluster first, per-cluster summary frame with one row per
        cluster in creation order). Both frames are empty when there was
        nothing to cluster.
    """
    # Types that must also satisfy the reciprocal-size criterion; any other
    # type (e.g. inversions) is clustered on position only.
    size_checked_types = ['ins','insertion', 'del' ,'deletion', 'dup','duplication_paired' ,'duplication_inverted', 'duplication', 'duplication_split']

    if len(df_in) == 0:
        return prev_cluster_ID, pd.DataFrame(), pd.DataFrame()

    # Key columns copied into every cluster summary row (taken from the first SV).
    sv_info = df_in[["OverlapGenes", "Zygosity", "chr", "Type"]].head(1)
    use_size = df_in["Type"].iat[0] in size_checked_types

    cluster_frames = []   # per-SV frames, one per cluster, in creation order
    summary_rows = []     # one-row summary frames, in creation order
    while False in pd.unique(df_in['clustered']):

        # Work on an explicit copy of the still-unclustered rows so the flag
        # updates below never touch (or warn about) the caller's frame.
        df_in = df_in.loc[df_in['clustered'] == False].copy()

        # Seed coordinates. The arithmetic is kept exactly as before: the lower
        # bound (qry_start - posWindow) is the seed start again, the upper
        # bound (qry_end + posWindow) is seed end + 2 * posWindow.
        qry_start = float(df_in['RefStartPos'].iat[0]) + posWindow
        qry_end = float(df_in['RefEndPos'].iat[0]) + posWindow
        qry_size = float(df_in['SVsize'].iat[0])
        window_lo = qry_start - posWindow
        window_hi = qry_end + posWindow

        starts = df_in['RefStartPos']
        ends = df_in['RefEndPos']
        # A row matches if it starts inside the window, or starts before the
        # window and still reaches the seed start.
        in_cluster = ((window_lo <= starts) & (starts <= window_hi)) \
                     | ((starts <= window_lo) & (window_lo <= ends))
        if use_size:
            sizes = df_in['SVsize']
            in_cluster = in_cluster \
                         & ((qry_size / sizes) * 100 >= reciprocalSize) \
                         & ((sizes / qry_size) * 100 >= reciprocalSize)

        # The seed always belongs to its own cluster. Without this, a seed that
        # fails its own test (NaN or zero SVsize, RefEndPos far below
        # RefStartPos) was never marked as clustered and the loop never ended.
        in_cluster.iloc[0] = True

        cluster_df = df_in.loc[in_cluster].copy()
        df_in.loc[in_cluster, 'clustered'] = True

        # set cluster and row ID number
        prev_cluster_ID += 1
        cluster_df['cluster_ID'] = prev_cluster_ID
        cluster_df['clustered'] = True

        row = sv_info.copy()
        row['cluster_ID'] = prev_cluster_ID

        # extent of the cluster plus the individual member coordinates
        row['RefStartPos'] = cluster_df['RefStartPos'].min()
        row['RefEndPos'] = cluster_df['RefEndPos'].max()
        row['listRefStartPos'] = ", ".join(cluster_df['RefStartPos'].astype(str))
        row['listRefEndPos'] = ", ".join(cluster_df['RefEndPos'].astype(str))

        # number of SVs in the cluster, and number of distinct samples
        row['num_SVs'] = len(cluster_df)
        row['num_unique_samples'] = len(cluster_df['Sample_ID'].dropna().unique())
        row['Sample_ID'] = ", ".join(cluster_df['Sample_ID'])

        # size metrics for table QC
        row['SVsize'] = ", ".join(cluster_df['SVsize'].astype(str))
        row['sv_means'] = cluster_df['SVsize'].mean()
        row['sv_std'] = cluster_df['SVsize'].std()

        cluster_frames.append(cluster_df)
        summary_rows.append(row)

    if not cluster_frames:
        # every row was already flagged as clustered on entry
        return prev_cluster_ID, pd.DataFrame(), pd.DataFrame()

    # Concatenate once instead of growing the frames inside the loop. The
    # per-SV frame keeps the previous ordering (most recent cluster first).
    sv_df = pd.concat(cluster_frames[::-1], axis=0, ignore_index=True)
    row_df = pd.concat(summary_rows, axis=0, ignore_index=True)

    return prev_cluster_ID, sv_df, row_df


def cluster_sv_single_cohort_nogenes(cleaned_df):
    """Cluster one cohort's SVs by reference position, without using gene overlap.

    The SVs are sorted, split into one group per (chr, Type) pair, and every group
    is passed to ``cluster_refpos_single_nogenes`` together with a running cluster-ID
    counter, so IDs keep increasing across groups.

    Parameters
    ----------
    cleaned_df : pandas.DataFrame
        SV table. Must contain the columns used here for sorting and grouping:
        ``chr``, ``Type``, ``RefStartPos``, ``RefEndPos`` and ``SVsize``.
        The frame itself is left untouched; bookkeeping columns are added to a copy.

    Returns
    -------
    (df_in, df_out) : tuple of pandas.DataFrame
        Row-wise concatenation, over all groups, of the second and third values
        returned by ``cluster_refpos_single_nogenes`` (presumably the per-SV rows
        and the per-cluster summary rows - TODO confirm against the helper).
        Both are empty frames when ``cleaned_df`` has no rows.
    """
    # Bookkeeping columns: 'clustered' is False until an SV has been assigned to a
    # cluster, and -1 stands for "no cluster ID yet". `assign` returns a new frame,
    # so the caller's DataFrame does not silently gain these columns.
    work_df = cleaned_df.assign(clustered=False, cluster_ID=-1)

    # Sort by chromosome/type first, then by reference coordinates and size.
    work_df = work_df.sort_values(["chr", "Type", 'RefStartPos', 'RefEndPos', 'SVsize'])

    sv_parts = []
    cluster_parts = []
    prev_cluster_ID = 0

    # Process one (chr, Type) group at a time.
    for _, group_df in work_df.groupby(by=["chr", "Type"]):
        # NOTE(review): 5000.0 and 50 are forwarded positionally; 5000.0 is presumably
        # the helper's position window - confirm both against its signature.
        prev_cluster_ID, cluster_df_in, cluster_df_out = cluster_refpos_single_nogenes(prev_cluster_ID, group_df, 5000.0, 50)

        sv_parts.append(cluster_df_in)
        cluster_parts.append(cluster_df_out)

        # progress indicator: running cluster-ID counter after this group
        print(prev_cluster_ID)

    # pd.concat raises on an empty list, so handle "no groups" explicitly.
    if not sv_parts:
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once at the end instead of re-copying the accumulated frame per group.
    df_in = pd.concat(sv_parts, axis=0, ignore_index=True)
    df_out = pd.concat(cluster_parts, axis=0, ignore_index=True)

    return df_in, df_out

In [None]:
# Silence pandas' chained-assignment warning before running the clustering.
pd.options.mode.chained_assignment = None
# group_sv_by_gene(merged_df.head(5000))
# NOTE(review): the function defined in this notebook is `cluster_sv_single_cohort_nogenes`;
# `group_sv_by_gene_single_cohort_nogenes` is presumably supplied by `enrichment_util` -
# confirm the name resolves on a freshly restarted kernel.
# The call returns two frames, unpacked here as in_df_nogenes and out_df_nogenes.
in_df_nogenes, out_df_nogenes = group_sv_by_gene_single_cohort_nogenes(cleaned_df)

97
98
102
303
305
406
408
569
570
642
774
874
988
1062
1063
1191
1261
1262
1413
1484
1487
1490
1633
1634
1637
1699
1700
1701
1815
1816
1876
1878
2003
2051
2054
2183
2235
2377
2428
2549
2552
2589
2590
2682
2728
2729
2730
2804
2832
2834
2836
2912
2913
2935
2938
2939
3028
3064
3067
3069
3170
3200
3258
3298
3301
3303
3418
3444
3524
3525
3541
3598
3623
3625
3628
3704
3748
3749
3860
3862
3873
3876
3877
3878
3889
3890


In [None]:
# Save both clustering results as CSV (index column omitted).
# The location is built with pathlib so it resolves to the same nested folder on any OS,
# instead of relying on Windows-only backslash separators.
nogenes_out_dir = pathlib.Path("Output", "output_hg002_clustering_04Jul2023", "cohort_clustering_nogenes")
nogenes_out_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create missing folders
in_df_nogenes.to_csv(nogenes_out_dir / "GM24385_case_grouped_sv_out.csv", index=False)
out_df_nogenes.to_csv(nogenes_out_dir / "GM24385_case_grouped_cluster_out.csv", index=False)

Breakdown of SVs by type, shared with AP for double-checking.

In [None]:
# Displays a 2-tuple of Type frequency tables: first for in_df_nogenes, second for out_df_nogenes.
in_df_nogenes['Type'].value_counts(),out_df_nogenes['Type'].value_counts()

(insertion               4337
 deletion                1751
 duplication               59
 duplication_inverted      53
 inversion_paired          21
 inversion                  6
 duplication_split          6
 Name: Type, dtype: int64,
 insertion               2601
 deletion                1219
 duplication               27
 duplication_inverted      24
 inversion_paired          13
 inversion                  3
 duplication_split          3
 Name: Type, dtype: int64)

Compare Solve vs. enrichment clustering.

In [87]:
# List the distinct values of the Type column in solve_sv_df.
solve_sv_df['Type'].unique()

array(['inversion_paired', 'inversion', 'insertion', 'deletion',
       'duplication_inverted', 'duplication', 'duplication_split'],
      dtype=object)

In [89]:
# Run the same clustering call on solve_sv_df that was applied to cleaned_df,
# so the two results can be compared.
solve_in_df_nogenes, solve_out_df_nogenes = group_sv_by_gene_single_cohort_nogenes(solve_sv_df)

97
98
102
303
305
405
407
568
569
641
773
873
986
1060
1061
1188
1258
1259
1410
1481
1484
1487
1629
1630
1633
1695
1696
1697
1811
1812
1872
1874
1999
2047
2050
2179
2231
2373
2424
2545
2548
2584
2585
2677
2723
2724
2725
2799
2827
2829
2831
2907
2908
2930
2933
2934
3023
3059
3062
3064
3164
3194
3252
3292
3295
3297
3412
3438
3518
3519
3535
3592
3617
3619
3622
3698
3742
3743
3854
3856
3867
3870
3871
3872
3883
3884


In [90]:
# Displays a 2-tuple of Type frequency tables: first for solve_in_df_nogenes, second for solve_out_df_nogenes.
solve_in_df_nogenes['Type'].value_counts(),solve_out_df_nogenes['Type'].value_counts()

(insertion               4337
 deletion                1751
 duplication               59
 duplication_inverted      53
 inversion_paired          21
 inversion                  6
 duplication_split          6
 Name: Type, dtype: int64,
 insertion               2597
 deletion                1217
 duplication               27
 duplication_inverted      24
 inversion_paired          13
 inversion                  3
 duplication_split          3
 Name: Type, dtype: int64)

In [99]:
# Find every clusterId whose rows carry more than one cluster_ID value,
# flag those rows, and export them for manual review.
solve_cID = solve_in_df_nogenes['clusterId'].unique()

# Count distinct cluster_ID values per clusterId in a single pass;
# sort=False keeps the IDs in order of first appearance.
cluster_ids_per_solve_cluster = (
    solve_in_df_nogenes
    .groupby('clusterId', sort=False)['cluster_ID']
    .nunique(dropna=False)
)
discordant = cluster_ids_per_solve_cluster[cluster_ids_per_solve_cluster > 1].index.tolist()

print(len(discordant))

# mark rows with discordant cluster IDs
solve_in_df_nogenes['discordant'] = solve_in_df_nogenes['clusterId'].isin(discordant)
solve_in_df_nogenes.loc[solve_in_df_nogenes['discordant']].to_excel(r"Output\output_hg002_clustering_04Jul2023\solve_clustering\GM24385_solve_discordant_cluster.xlsx")

2


In [97]:
# Reverse direction of the discordance check: find every cluster_ID whose rows
# carry more than one clusterId value.
# NOTE: this cell assigns `solve_cID` and `discordant`, replacing any values those
# names held before it ran.
solve_cID = solve_in_df_nogenes['cluster_ID'].unique()

# Count distinct clusterId values per cluster_ID in a single pass;
# sort=False keeps the IDs in order of first appearance.
solve_ids_per_cluster = (
    solve_in_df_nogenes
    .groupby('cluster_ID', sort=False)['clusterId']
    .nunique(dropna=False)
)
discordant = solve_ids_per_cluster[solve_ids_per_cluster > 1].index.tolist()

len(discordant)

2

In [98]:
# Show the IDs currently stored in `discordant` (set by whichever discordance check ran last).
discordant

[981, 2728]

In [95]:
# Inspect every row whose cluster_ID is 981.
solve_in_df_nogenes.loc[solve_in_df_nogenes['cluster_ID'] == 981]

Unnamed: 0,clusterId,SmapID,QryContigID,chr,RefcontigID2,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Confidence,...,RawConfidenceCenter,SVsize,SVfreq,orientation,Sample_ID,has_gaps,OverlapGenes,clustered,cluster_ID,discordant
1365,2635,4889,192,4,4,364190.4,374066.1,189831786.5,189838877.0,0.99,...,70.88,2785.2,0.391,,GM24385,False,-,True,981,False
1366,2636,4897,2842,4,4,395375.2,403306.8,189845937.0,189849826.0,0.99,...,50.05,4042.6,0.268,,GM24385,False,-,True,981,True
1367,2636,4698,191,4,4,395760.8,403698.2,189845937.0,189849826.0,0.99,...,872.34,4048.3,0.32,,GM24385,False,-,True,981,True


In [None]:
# Inspect every row whose clusterId is '6712'.
# NOTE(review): clusterId is compared to a string here, whereas cluster_ID is compared
# to an int elsewhere in the notebook - confirm the clusterId column's dtype.
solve_in_df_nogenes.loc[solve_in_df_nogenes['clusterId'] == '6712']

Unnamed: 0,clusterId,SmapID,QryContigID,chr,RefcontigID2,QryStartPos,QryEndPos,RefStartPos,RefEndPos,Confidence,...,RawConfidenceCenter,SVsize,SVfreq,orientation,Sample_ID,has_gaps,OverlapGenes,clustered,cluster_ID,discordant
4397,6712,12581,1371,14,14,361934.6,378545.7,19009467.0,19022996.5,0.99,...,71.59,3081.6,0.062,,GM24385,False,-,False,2733,True
4398,6712,12590,1591,14,14,217718.6,224135.0,19008159.0,19013073.0,0.99,...,75.92,1502.4,0.023,,GM24385,False,-,False,2732,True
4399,6712,12585,1372,14,14,361934.6,376943.7,19009467.0,19022996.5,0.99,...,37.33,1479.6,0.05,,GM24385,False,-,False,2732,True
4400,6712,12566,1602,14,14,524392.1,539538.2,19009467.0,19022996.5,0.99,...,44.82,1616.6,0.017,,GM24385,False,-,False,2732,True
