/
top 10 percent calculation bladder & modify regions final.R
107 lines (65 loc) · 2.27 KB
/
top 10 percent calculation bladder & modify regions final.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
nmibc <- read.csv("E:\\Bladder Hotspot Paper\\NMIBC_Dataset.csv")
mibc <- read.csv("E:\\Bladder Hotspot Paper\\BLCA_Dataset.csv")
impact_regions <- read.csv("E:\\Bladder Hotspot Paper\\mskimpact_regions_new.csv")
colnames(impact_regions)[[1]] <- "chr"
chr_list <- unique(impact_regions$chr)
nmibc_new <- data.frame()
mibc_new <- data.frame()
for (c in chr_list){
pos_list <- c()
regions_sub <- subset(impact_regions, chr == c)
for (i in 1:nrow(regions_sub)){
pos_list <- c(pos_list, regions_sub$start[[i]]:regions_sub$stop[[i]])
}
nmibc_sub <- subset(nmibc, Chromosome == c & Start_Position %in% pos_list)
nmibc_new <- rbind(nmibc_new, nmibc_sub)
mibc_sub <- subset(mibc, Chromosome == c & Start_Position %in% pos_list)
mibc_new <- rbind(mibc_new, mibc_sub)
}
# identify top 10% of mutated regions panel length
mc <- nmibc_new[,c(1,5,6)]
colnames(mc) <- c("gene", "chr", "pos")
chr_list <- unique(mc$chr)
all_pos <- c()
for (c in chr_list){
mc_sub <- subset(mc, chr == c)
all_vec <- c()
for (p in mc_sub$pos){
vec <- (p - 50):(p + 50)
all_vec <- c(all_vec, vec)
}
all_vec <- unique(all_vec)
all_pos <- c(all_pos, all_vec)
}
nmibc_top_10 <- length(all_pos) * 0.1
### NMIBC 10552
mc <- mibc_new[,c(1,5,6)]
colnames(mc) <- c("gene", "chr", "pos")
chr_list <- unique(mc$chr)
all_pos <- c()
for (c in chr_list){
mc_sub <- subset(mc, chr == c)
all_vec <- c()
for (p in mc_sub$pos){
vec <- (p - 50):(p + 50)
all_vec <- c(all_vec, vec)
}
all_vec <- unique(all_vec)
all_pos <- c(all_pos, all_vec)
}
mibc_top_10 <- length(all_pos) * 0.1
### MIBC 36917
### average number of mutations per sample
mibc_samples <- unique(mibc_new$Tumor_Sample_Barcode)
nmibc_samples <- unique(nmibc_new$Tumor_Sample_Barcode)
mibc_count_list <- c()
for (m in mibc_samples){
mibc_count_list <- c(mibc_count_list, nrow(subset(mibc_new, Tumor_Sample_Barcode == m)))
}
nmibc_count_list <- c()
for (n in nmibc_samples){
nmibc_count_list <- c(nmibc_count_list, nrow(subset(nmibc_new, Tumor_Sample_Barcode == n)))
}
mibc_avg <- mean(mibc_count_list)
nmibc_avg <- mean(nmibc_count_list)
wilcox.test(mibc_count_list, nmibc_count_list, paired = FALSE)