Canceranalysis.r
(a) Clean your data (if required) before running any model. Create a C&R decision tree
to its full depth. How many leaves are in the tree? How do you get this information?
#Loading the packages used in this analysis (assumed to be installed)
library(partykit)   #ctree() conditional inference trees (the 'party' package also provides ctree)
library(rpart)      #rpart() decision trees
library(rpart.plot) #rpart.plot()
library(rattle)     #fancyRpartPlot()
library(arules)     #apriori(), inspect()
library(caret)      #confusionMatrix()
testX = read.csv("testX.csv", header = FALSE)
testY = read.csv("testY.csv", header = FALSE)
trainX = read.csv("trainX.csv", header = FALSE)
trainY = read.csv("trainY.csv", header = FALSE)
#Merging the Test Tables
testXY = cbind(testX, testY)
#Merging the Train Tables
trainXY = cbind(trainX, trainY)
#Column Names Added
colnames(testXY) = c("R_Mean", "T_Mean","P_Mean",
"A_Mean","S_Mean","CP_Mean","CC_Mean", "NC_Mean", "Sy_Mean", "F_Mean","R_SD",
"T_SD","P_SD", "A_SD","S_SD","CP_SD","CC_SD", "NC_SD", "Sy_SD",
"F_SD","R_LAR", "T_LAR","P_LAR", "A_LAR","S_LAR","CP_LAR","CC_LAR", "NC_LAR",
"Sy_LAR", "F_LAR","Tissue")
#Making Tissue a factor
testXY$Tissue = as.factor(testXY$Tissue)
#Adding Column names
colnames(trainXY)[1:31] = c("R_Mean", "T_Mean","P_Mean",
"A_Mean","S_Mean","CP_Mean","CC_Mean", "NC_Mean", "Sy_Mean", "F_Mean","R_SD",
"T_SD","P_SD", "A_SD","S_SD","CP_SD","CC_SD", "NC_SD", "Sy_SD",
"F_SD","R_LAR", "T_LAR","P_LAR", "A_LAR","S_LAR","CP_LAR","CC_LAR", "NC_LAR",
"Sy_LAR", "F_LAR","Tissues")
View(trainXY)
#cleaning the train data
#Outlier Detection: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced with NA
outlier = function(value)
{
  iqr = IQR(value)
  q1 = as.numeric(quantile(value, 0.25))
  q3 = as.numeric(quantile(value, 0.75))
  higher = q3 + 1.5 * iqr
  lower = q1 - 1.5 * iqr
  ifelse(value < higher & value > lower, value, NA)
}
trainXY_outlier = sapply(trainXY[,1:30],outlier)
View(trainXY_outlier)
train_clean = data.frame(trainXY_outlier,trainXY[31])
train_final = na.omit(train_clean)
train_final$Tissues = as.factor(train_final$Tissues)
summary(train_final)
## R_Mean T_Mean P_Mean A_Mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
## 1st Qu.:11.633 1st Qu.:15.80 1st Qu.: 74.69 1st Qu.: 413.3
## Median :12.920 Median :18.16 Median : 83.16 Median : 513.9
## Mean :13.242 Mean :18.60 Mean : 85.52 Mean : 558.2
## 3rd Qu.:14.588 3rd Qu.:21.00 3rd Qu.: 94.43 3rd Qu.: 656.8
## Max. :20.590 Max. :29.81 Max. :137.80 Max. :1320.0
## S_Mean CP_Mean CC_Mean NC_Mean
## Min. :0.06251 Min. :0.01938 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.08456 1st Qu.:0.05981 1st Qu.:0.02555 1st Qu.:0.01782
## Median :0.09254 Median :0.07921 Median :0.04541 Median :0.02775
## Mean :0.09313 Mean :0.08616 Mean :0.05936 Mean :0.03478
## 3rd Qu.:0.10160 3rd Qu.:0.10723 3rd Qu.:0.08150 3rd Qu.:0.04709
## Max. :0.12430 Max. :0.20220 Max. :0.24170 Max. :0.12590
## Sy_Mean F_Mean R_SD T_SD
## Min. :0.1167 Min. :0.04996 Min. :0.1144 Min. :0.3621
## 1st Qu.:0.1586 1st Qu.:0.05756 1st Qu.:0.2135 1st Qu.:0.7811
## Median :0.1724 Median :0.06102 Median :0.2720 Median :1.0240
## Mean :0.1735 Mean :0.06146 Mean :0.2975 Mean :1.0983
## 3rd Qu.:0.1886 3rd Qu.:0.06475 3rd Qu.:0.3581 3rd Qu.:1.3522
## Max. :0.2275 Max. :0.07818 Max. :0.7474 Max. :2.4260
## P_SD A_SD S_SD CP_SD
## Min. :0.757 Min. : 6.802 Min. :0.002838 Min. :0.002252
## 1st Qu.:1.480 1st Qu.:16.093 1st Qu.:0.005033 1st Qu.:0.011843
## Median :1.970 Median :20.695 Median :0.006059 Median :0.017660
## Mean :2.105 Mean :25.323 Mean :0.006532 Mean :0.019877
## 3rd Qu.:2.559 3rd Qu.:29.740 3rd Qu.:0.007803 3rd Qu.:0.025795
## Max. :5.216 Max. :81.230 Max. :0.012910 Max. :0.055920
## CC_SD NC_SD Sy_SD F_SD
## Min. :0.00000 Min. :0.000000 Min. :0.009539 Min. :0.0008948
## 1st Qu.:0.01336 1st Qu.:0.006808 1st Qu.:0.014778 1st Qu.:0.0020890
## Median :0.02043 Median :0.009245 Median :0.018435 Median :0.0027180
## Mean :0.02400 Mean :0.009776 Mean :0.018872 Mean :0.0030428
## 3rd Qu.:0.03172 3rd Qu.:0.012195 3rd Qu.:0.021785 3rd Qu.:0.0037360
## Max. :0.08158 Max. :0.022950 Max. :0.034640 Max. :0.0077310
## R_LAR T_LAR P_LAR A_LAR
## Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
## 1st Qu.:12.98 1st Qu.:20.51 1st Qu.: 83.70 1st Qu.: 512.9
## Median :14.24 Median :24.59 Median : 92.44 Median : 623.3
## Mean :14.90 Mean :24.79 Mean : 97.49 Mean : 709.6
## 3rd Qu.:16.38 3rd Qu.:28.61 3rd Qu.:108.60 3rd Qu.: 819.5
## Max. :24.56 Max. :39.34 Max. :166.40 Max. :1872.0
## S_LAR CP_LAR CC_LAR NC_LAR
## Min. :0.08125 Min. :0.03432 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11440 1st Qu.:0.13533 1st Qu.:0.1047 1st Qu.:0.05942
## Median :0.12885 Median :0.19260 Median :0.1811 Median :0.08337
## Mean :0.12902 Mean :0.21291 Mean :0.2118 Mean :0.09465
## 3rd Qu.:0.14235 3rd Qu.:0.26645 3rd Qu.:0.3070 3rd Qu.:0.12565
## Max. :0.17940 Max. :0.61100 Max. :0.7727 Max. :0.25430
## Sy_LAR F_LAR Tissues
## Min. :0.1783 Min. :0.05521 0:242
## 1st Qu.:0.2487 1st Qu.:0.07074 1: 72
## Median :0.2778 Median :0.07827
## Mean :0.2797 Mean :0.08002
## 3rd Qu.:0.3067 3rd Qu.:0.08761
## Max. :0.4128 Max. :0.12050
str(train_final)
## 'data.frame': 314 obs. of 31 variables:
## $ R_Mean : num 12.9 17.9 19.2 13.7 14.4 ...
## $ T_Mean : num 13.3 24.5 26.6 15.2 16.5 ...
## $ P_Mean : num 82.8 115.2 126.2 88.3 94.2 ...
## $ A_Mean : num 505 999 1138 581 641 ...
## $ S_Mean : num 0.1134 0.0886 0.102 0.0827 0.0975 ...
## $ CP_Mean: num 0.0883 0.0703 0.1453 0.0755 0.1139 ...
## $ CC_Mean: num 0.038 0.057 0.1921 0.0425 0.0801 ...
## $ NC_Mean: num 0.034 0.0474 0.0966 0.0247 0.0422 ...
## $ Sy_Mean: num 0.154 0.154 0.19 0.179 0.191 ...
## $ F_Mean : num 0.0648 0.0551 0.0622 0.059 0.0641 ...
## $ R_SD : num 0.221 0.421 0.636 0.14 0.349 ...
## $ T_SD : num 1.042 1.433 1.001 0.542 0.771 ...
## $ P_SD : num 1.61 2.77 4.32 1.1 2.68 ...
## $ A_SD : num 16.6 45.8 69.7 11.3 32.1 ...
## $ S_SD : num 0.00591 0.00544 0.00739 0.00521 0.00458 ...
## $ CP_SD : num 0.0202 0.0117 0.0245 0.0298 0.0305 ...
## $ CC_SD : num 0.019 0.0162 0.0399 0.0244 0.0384 ...
## $ NC_SD : num 0.01011 0.00852 0.01293 0.00836 0.01243 ...
## $ Sy_SD : num 0.012 0.0142 0.0143 0.0182 0.0187 ...
## $ F_SD : num 0.00311 0.00275 0.00345 0.00487 0.00337 ...
## $ R_LAR : num 14 20.9 23.7 14.5 16.7 ...
## $ T_LAR : num 21.1 34.7 35.9 19.6 21.5 ...
## $ P_LAR : num 92.8 135.1 159.8 98 111.4 ...
## $ A_LAR : num 600 1320 1724 657 862 ...
## $ S_LAR : num 0.155 0.132 0.178 0.128 0.129 ...
## $ CP_LAR : num 0.223 0.181 0.384 0.31 0.337 ...
## $ CC_LAR : num 0.179 0.208 0.575 0.257 0.376 ...
## $ NC_LAR : num 0.116 0.114 0.187 0.105 0.141 ...
## $ Sy_LAR : num 0.238 0.25 0.326 0.339 0.305 ...
## $ F_LAR : num 0.0855 0.0795 0.0972 0.0964 0.0876 ...
## $ Tissues: Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:141] 1 6 9 13 20 23 29 32 33 35 ...
## ..- attr(*, "names")= chr [1:141] "1" "6" "9" "13" ...
#Conditional inference tree (ctree); the rpart C&R tree follows below
C_Rtree = ctree(Tissues ~ ., data = train_final)
plot(C_Rtree)
#Create a simple decision tree using rpart using train data
#Decision Tree
C_R = rpart(Tissues~., data = train_final)
rpart.plot(C_R)
#Creating a full depth Decision tree
C_R_full = rpart(Tissues~., data = train_final, parms = list(split =
"information"), control = rpart.control(minsplit = 0, minbucket = 0, cp = -1))
rpart.plot(C_R_full)
print(C_R_full)
## n= 314
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 314 72 0 (0.770700637 0.229299363)
## 2) P_LAR< 105.95 227 5 0 (0.977973568 0.022026432)
## 4) CC_LAR< 0.40725 218 1 0 (0.995412844 0.004587156)
## 8) T_LAR< 33.27 208 0 0 (1.000000000 0.000000000) *
## 9) T_LAR>=33.27 10 1 0 (0.900000000 0.100000000)
## 18) R_LAR< 14.42 9 0 0 (1.000000000 0.000000000) *
## 19) R_LAR>=14.42 1 0 1 (0.000000000 1.000000000) *
## 5) CC_LAR>=0.40725 9 4 0 (0.555555556 0.444444444)
## 10) T_LAR< 26.9 5 0 0 (1.000000000 0.000000000) *
## 11) T_LAR>=26.9 4 0 1 (0.000000000 1.000000000) *
## 3) P_LAR>=105.95 87 20 1 (0.229885057 0.770114943)
## 6) T_LAR< 20.045 13 0 0 (1.000000000 0.000000000) *
## 7) T_LAR>=20.045 74 7 1 (0.094594595 0.905405405)
## 14) NC_Mean< 0.048785 15 6 1 (0.400000000 0.600000000)
## 28) CP_SD>=0.02017 5 0 0 (1.000000000 0.000000000) *
## 29) CP_SD< 0.02017 10 1 1 (0.100000000 0.900000000)
## 58) Sy_Mean< 0.14685 1 0 0 (1.000000000 0.000000000) *
## 59) Sy_Mean>=0.14685 9 0 1 (0.000000000 1.000000000) *
## 15) NC_Mean>=0.048785 59 1 1 (0.016949153 0.983050847)
## 30) Sy_SD< 0.01062 1 0 0 (1.000000000 0.000000000) *
## 31) Sy_SD>=0.01062 58 0 1 (0.000000000 1.000000000) *
summary(C_R_full)
## Call:
## rpart(formula = Tissues ~ ., data = train_final, parms = list(split = "information"),
##     control = rpart.control(minsplit = 0, minbucket = 0, cp = -1))
## n= 314
##
## CP nsplit rel error xerror xstd
## 1 0.652777778 0 1.00000000 1.0000000 0.10346099
## 2 0.180555556 1 0.34722222 0.4722222 0.07647529
## 3 0.034722222 2 0.16666667 0.3472222 0.06662260
## 4 0.027777778 4 0.09722222 0.3055556 0.06282109
## 5 0.013888889 6 0.04166667 0.3194444 0.06412289
## 6 0.006944444 8 0.01388889 0.3472222 0.06662260
## 7 -1.000000000 10 0.00000000 0.3472222 0.06662260
##
## Variable importance
##   P_LAR   A_LAR   R_LAR  P_Mean  A_Mean  R_Mean   T_LAR  T_Mean   CC_SD NC_Mean
##      14      14      13      11      11      11       5       3       2       2
## Sy_Mean  CC_LAR CC_Mean  NC_LAR   Sy_SD   NC_SD   CP_SD CP_Mean  CP_LAR    T_SD
##       2       2       1       1       1       1       1       1       1       1
##    A_SD  S_Mean   F_LAR    P_SD    R_SD
##       1       1       1       1       1
##
## Node number 1: 314 observations, complexity param=0.6527778
## predicted class=0 expected loss=0.2292994 P(node) =1
## class counts: 242 72
## probabilities: 0.771 0.229
## left son=2 (227 obs) right son=3 (87 obs)
## Primary splits:
## P_LAR < 105.95 to the left, improve=98.13947, (0 missing)
## A_LAR < 727.1 to the left, improve=92.26757, (0 missing)
## R_LAR < 15.515 to the left, improve=90.94375, (0 missing)
## NC_LAR < 0.11135 to the left, improve=87.62984, (0 missing)
## NC_Mean < 0.048785 to the left, improve=86.75491, (0 missing)
## Surrogate splits:
## A_LAR < 784.15 to the left, agree=0.978, adj=0.920, (0 split)
## R_LAR < 16.205 to the left, agree=0.971, adj=0.897, (0 split)
## P_Mean < 91.405 to the left, agree=0.939, adj=0.782, (0 split)
## R_Mean < 14.315 to the left, agree=0.933, adj=0.759, (0 split)
## A_Mean < 632.8 to the left, agree=0.933, adj=0.759, (0 split)
##
## Node number 2: 227 observations, complexity param=0.02777778
## predicted class=0 expected loss=0.02202643 P(node) =0.7229299
## class counts: 222 5
## probabilities: 0.978 0.022
## left son=4 (218 obs) right son=5 (9 obs)
## Primary splits:
## CC_LAR < 0.40725 to the left, improve=11.457230, (0 missing)
## NC_LAR < 0.13485 to the left, improve=11.457230, (0 missing)
## CC_Mean < 0.09752 to the left, improve=10.015610, (0 missing)
## NC_Mean < 0.045465 to the left, improve= 9.287370, (0 missing)
## CP_LAR < 0.4124 to the left, improve= 8.462249, (0 missing)
## Surrogate splits:
## NC_LAR < 0.1443 to the left, agree=0.987, adj=0.667, (0 split)
## CP_LAR < 0.399 to the left, agree=0.982, adj=0.556, (0 split)
## CC_Mean < 0.1075 to the left, agree=0.978, adj=0.444, (0 split)
## F_LAR < 0.1148 to the left, agree=0.974, adj=0.333, (0 split)
## S_LAR < 0.17765 to the left, agree=0.969, adj=0.222, (0 split)
##
## Node number 3: 87 observations, complexity param=0.1805556
## predicted class=1 expected loss=0.2298851 P(node) =0.2770701
## class counts: 20 67
## probabilities: 0.230 0.770
## left son=6 (13 obs) right son=7 (74 obs)
## Primary splits:
## T_LAR < 20.045 to the left, improve=23.73991, (0 missing)
## T_Mean < 16.95 to the left, improve=21.74517, (0 missing)
## NC_Mean < 0.06381 to the left, improve=14.85026, (0 missing)
## S_LAR < 0.13595 to the left, improve=14.38966, (0 missing)
## CC_Mean < 0.08556 to the left, improve=13.86440, (0 missing)
## Surrogate splits:
## T_Mean < 15.745 to the left, agree=0.977, adj=0.846, (0 split)
## T_SD < 0.47315 to the left, agree=0.885, adj=0.231, (0 split)
## Sy_Mean < 0.1384 to the left, agree=0.874, adj=0.154, (0 split)
## R_SD < 0.18995 to the left, agree=0.874, adj=0.154, (0 split)
## P_SD < 1.245 to the left, agree=0.874, adj=0.154, (0 split)
##
## Node number 4: 218 observations, complexity param=0.006944444
## predicted class=0 expected loss=0.004587156 P(node) =0.6942675
## class counts: 217 1
## probabilities: 0.995 0.005
## left son=8 (208 obs) right son=9 (10 obs)
## Primary splits:
## T_LAR < 33.27 to the left, improve=3.131368, (0 missing)
## T_Mean < 22.455 to the left, improve=2.183594, (0 missing)
## S_LAR < 0.14185 to the left, improve=1.705924, (0 missing)
## CC_LAR < 0.21395 to the left, improve=1.632717, (0 missing)
## NC_LAR < 0.093205 to the left, improve=1.542765, (0 missing)
## Surrogate splits:
## T_Mean < 25.175 to the left, agree=0.977, adj=0.5, (0 split)
## T_SD < 2.2195 to the left, agree=0.959, adj=0.1, (0 split)
##
## Node number 5: 9 observations, complexity param=0.02777778
## predicted class=0 expected loss=0.4444444 P(node) =0.02866242
## class counts: 5 4
## probabilities: 0.556 0.444
## left son=10 (5 obs) right son=11 (4 obs)
## Primary splits:
## T_LAR < 26.9 to the left, improve=6.182654, (0 missing)
## A_LAR < 708.7 to the left, improve=6.182654, (0 missing)
## S_Mean < 0.095875 to the left, improve=3.680642, (0 missing)
## NC_Mean < 0.04271 to the left, improve=3.680642, (0 missing)
## A_SD < 18.345 to the left, improve=3.680642, (0 missing)
## Surrogate splits:
## A_LAR < 708.7 to the left, agree=1.000, adj=1.00, (0 split)
## S_Mean < 0.095875 to the left, agree=0.889, adj=0.75, (0 split)
## NC_Mean < 0.04271 to the left, agree=0.889, adj=0.75, (0 split)
## A_SD < 18.345 to the left, agree=0.889, adj=0.75, (0 split)
## CC_SD < 0.039445 to the right, agree=0.889, adj=0.75, (0 split)
##
## Node number 6: 13 observations
## predicted class=0 expected loss=0 P(node) =0.04140127
## class counts: 13 0
## probabilities: 1.000 0.000
##
## Node number 7: 74 observations, complexity param=0.03472222
## predicted class=1 expected loss=0.09459459 P(node) =0.2356688
## class counts: 7 67
## probabilities: 0.095 0.905
## left son=14 (15 obs) right son=15 (59 obs)
## Primary splits:
## NC_Mean < 0.048785 to the left, improve=8.000851, (0 missing)
## R_LAR < 16.8 to the left, improve=7.528256, (0 missing)
## S_LAR < 0.13525 to the left, improve=7.419656, (0 missing)
## CC_Mean < 0.080135 to the left, improve=7.093591, (0 missing)
## A_LAR < 868.2 to the left, improve=7.093591, (0 missing)
## Surrogate splits:
## CC_Mean < 0.076355 to the left, agree=0.932, adj=0.667, (0 split)
## CP_Mean < 0.072635 to the left, agree=0.865, adj=0.333, (0 split)
## CC_SD < 0.01873 to the left, agree=0.865, adj=0.333, (0 split)
## NC_LAR < 0.12105 to the left, agree=0.865, adj=0.333, (0 split)
## NC_SD < 0.0085795 to the left, agree=0.851, adj=0.267, (0 split)
##
## Node number 8: 208 observations
## predicted class=0 expected loss=0 P(node) =0.6624204
## class counts: 208 0
## probabilities: 1.000 0.000
##
## Node number 9: 10 observations, complexity param=0.006944444
## predicted class=0 expected loss=0.1 P(node) =0.03184713
## class counts: 9 1
## probabilities: 0.900 0.100
## left son=18 (9 obs) right son=19 (1 obs)
## Primary splits:
## R_LAR < 14.42 to the left, improve=3.25083, (0 missing)
## T_LAR < 33.56 to the right, improve=3.25083, (0 missing)
## P_LAR < 91.665 to the left, improve=3.25083, (0 missing)
## A_LAR < 643.25 to the left, improve=3.25083, (0 missing)
## NC_LAR < 0.09229 to the left, improve=3.25083, (0 missing)
##
## Node number 10: 5 observations
## predicted class=0 expected loss=0 P(node) =0.01592357
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 11: 4 observations
## predicted class=1 expected loss=0 P(node) =0.01273885
## class counts: 0 4
## probabilities: 0.000 1.000
##
## Node number 14: 15 observations, complexity param=0.03472222
## predicted class=1 expected loss=0.4 P(node) =0.0477707
## class counts: 6 9
## probabilities: 0.400 0.600
## left son=28 (5 obs) right son=29 (10 obs)
## Primary splits:
## CP_SD < 0.02017 to the right, improve=6.844345, (0 missing)
## NC_SD < 0.0105105 to the right, improve=6.844345, (0 missing)
## R_LAR < 16.825 to the left, improve=6.844345, (0 missing)
## Sy_Mean < 0.1862 to the right, improve=4.879643, (0 missing)
## CC_SD < 0.030115 to the right, improve=4.879643, (0 missing)
## Surrogate splits:
## NC_SD < 0.0105105 to the right, agree=1.000, adj=1.0, (0 split)
## Sy_Mean < 0.18025 to the right, agree=0.933, adj=0.8, (0 split)
## CC_SD < 0.030115 to the right, agree=0.933, adj=0.8, (0 split)
## CP_Mean < 0.097055 to the right, agree=0.867, adj=0.6, (0 split)
## Sy_SD < 0.014925 to the right, agree=0.867, adj=0.6, (0 split)
##
## Node number 15: 59 observations, complexity param=0.01388889
## predicted class=1 expected loss=0.01694915 P(node) =0.1878981
## class counts: 1 58
## probabilities: 0.017 0.983
## left son=30 (1 obs) right son=31 (58 obs)
## Primary splits:
## Sy_SD < 0.01062 to the left, improve=5.069015, (0 missing)
## T_LAR < 20.645 to the left, improve=5.069015, (0 missing)
## CC_LAR < 0.1894 to the left, improve=5.069015, (0 missing)
## NC_LAR < 0.1057 to the left, improve=5.069015, (0 missing)
## Sy_LAR < 0.21975 to the left, improve=5.069015, (0 missing)
##
## Node number 18: 9 observations
## predicted class=0 expected loss=0 P(node) =0.02866242
## class counts: 9 0
## probabilities: 1.000 0.000
##
## Node number 19: 1 observations
## predicted class=1 expected loss=0 P(node) =0.003184713
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 28: 5 observations
## predicted class=0 expected loss=0 P(node) =0.01592357
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 29: 10 observations, complexity param=0.01388889
## predicted class=1 expected loss=0.1 P(node) =0.03184713
## class counts: 1 9
## probabilities: 0.100 0.900
## left son=58 (1 obs) right son=59 (9 obs)
## Primary splits:
## Sy_Mean < 0.14685 to the left, improve=3.25083, (0 missing)
## F_SD < 0.001342 to the left, improve=3.25083, (0 missing)
## R_LAR < 16.675 to the left, improve=3.25083, (0 missing)
## P_LAR < 107.3 to the left, improve=3.25083, (0 missing)
## A_LAR < 839.85 to the left, improve=3.25083, (0 missing)
##
## Node number 30: 1 observations
## predicted class=0 expected loss=0 P(node) =0.003184713
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 31: 58 observations
## predicted class=1 expected loss=0 P(node) =0.1847134
## class counts: 0 58
## probabilities: 0.000 1.000
##
## Node number 58: 1 observations
## predicted class=0 expected loss=0 P(node) =0.003184713
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 59: 9 observations
## predicted class=1 expected loss=0 P(node) =0.02866242
## class counts: 0 9
## probabilities: 0.000 1.000
#Calculating Leaf nodes
printcp(C_R_full)
##
## Classification tree:
## rpart(formula = Tissues ~ ., data = train_final, parms = list(split = "information"),
##     control = rpart.control(minsplit = 0, minbucket = 0, cp = -1))
##
## Variables actually used in tree construction:
## [1] CC_LAR CP_SD NC_Mean P_LAR R_LAR Sy_Mean Sy_SD T_LAR
##
## Root node error: 72/314 = 0.2293
##
## n= 314
##
## CP nsplit rel error xerror xstd
## 1 0.6527778 0 1.000000 1.00000 0.103461
## 2 0.1805556 1 0.347222 0.47222 0.076475
## 3 0.0347222 2 0.166667 0.34722 0.066623
## 4 0.0277778 4 0.097222 0.30556 0.062821
## 5 0.0138889 6 0.041667 0.31944 0.064123
## 6 0.0069444 8 0.013889 0.34722 0.066623
## 7 -1.0000000 10 0.000000 0.34722 0.066623
As the output shows, the tree has 11 terminal nodes (leaves); these are the nodes marked
with '*' in print(C_R_full). The tree was grown to full depth by setting minsplit = 0,
minbucket = 0 and cp = -1 in rpart.control, so that no candidate split is rejected.
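One way to get this count directly from the fitted object (a minimal sketch, not part of the
original output) is to count the rows of the rpart frame that are terminal nodes:
#Count leaves: rows of the rpart frame whose split variable is "<leaf>"
sum(C_R_full$frame$var == "<leaf>")
This returns the number of terminal nodes and should agree with the 11 leaves marked with '*' above.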
(b) What are the major predictors of Diagnosis? Please justify your reasoning.
summary(C_R_full)
## Call:
## rpart(formula = Tissues ~ ., data = train_final, parms = list(split = "information"),
##     control = rpart.control(minsplit = 0, minbucket = 0, cp = -1))
## n= 314
##
## CP nsplit rel error xerror xstd
## 1 0.652777778 0 1.00000000 1.0000000 0.10346099
## 2 0.180555556 1 0.34722222 0.4722222 0.07647529
## 3 0.034722222 2 0.16666667 0.3472222 0.06662260
## 4 0.027777778 4 0.09722222 0.3055556 0.06282109
## 5 0.013888889 6 0.04166667 0.3194444 0.06412289
## 6 0.006944444 8 0.01388889 0.3472222 0.06662260
## 7 -1.000000000 10 0.00000000 0.3472222 0.06662260
##
## Variable importance
##   P_LAR   A_LAR   R_LAR  P_Mean  A_Mean  R_Mean   T_LAR  T_Mean   CC_SD NC_Mean
##      14      14      13      11      11      11       5       3       2       2
## Sy_Mean  CC_LAR CC_Mean  NC_LAR   Sy_SD   NC_SD   CP_SD CP_Mean  CP_LAR    T_SD
##       2       2       1       1       1       1       1       1       1       1
##    A_SD  S_Mean   F_LAR    P_SD    R_SD
##       1       1       1       1       1
##
## Node number 1: 314 observations, complexity param=0.6527778
## predicted class=0 expected loss=0.2292994 P(node) =1
## class counts: 242 72
## probabilities: 0.771 0.229
## left son=2 (227 obs) right son=3 (87 obs)
## Primary splits:
## P_LAR < 105.95 to the left, improve=98.13947, (0 missing)
## A_LAR < 727.1 to the left, improve=92.26757, (0 missing)
## R_LAR < 15.515 to the left, improve=90.94375, (0 missing)
## NC_LAR < 0.11135 to the left, improve=87.62984, (0 missing)
## NC_Mean < 0.048785 to the left, improve=86.75491, (0 missing)
## Surrogate splits:
## A_LAR < 784.15 to the left, agree=0.978, adj=0.920, (0 split)
## R_LAR < 16.205 to the left, agree=0.971, adj=0.897, (0 split)
## P_Mean < 91.405 to the left, agree=0.939, adj=0.782, (0 split)
## R_Mean < 14.315 to the left, agree=0.933, adj=0.759, (0 split)
## A_Mean < 632.8 to the left, agree=0.933, adj=0.759, (0 split)
##
## Node number 2: 227 observations, complexity param=0.02777778
## predicted class=0 expected loss=0.02202643 P(node) =0.7229299
## class counts: 222 5
## probabilities: 0.978 0.022
## left son=4 (218 obs) right son=5 (9 obs)
## Primary splits:
## CC_LAR < 0.40725 to the left, improve=11.457230, (0 missing)
## NC_LAR < 0.13485 to the left, improve=11.457230, (0 missing)
## CC_Mean < 0.09752 to the left, improve=10.015610, (0 missing)
## NC_Mean < 0.045465 to the left, improve= 9.287370, (0 missing)
## CP_LAR < 0.4124 to the left, improve= 8.462249, (0 missing)
## Surrogate splits:
## NC_LAR < 0.1443 to the left, agree=0.987, adj=0.667, (0 split)
## CP_LAR < 0.399 to the left, agree=0.982, adj=0.556, (0 split)
## CC_Mean < 0.1075 to the left, agree=0.978, adj=0.444, (0 split)
## F_LAR < 0.1148 to the left, agree=0.974, adj=0.333, (0 split)
## S_LAR < 0.17765 to the left, agree=0.969, adj=0.222, (0 split)
##
## Node number 3: 87 observations, complexity param=0.1805556
## predicted class=1 expected loss=0.2298851 P(node) =0.2770701
## class counts: 20 67
## probabilities: 0.230 0.770
## left son=6 (13 obs) right son=7 (74 obs)
## Primary splits:
## T_LAR < 20.045 to the left, improve=23.73991, (0 missing)
## T_Mean < 16.95 to the left, improve=21.74517, (0 missing)
## NC_Mean < 0.06381 to the left, improve=14.85026, (0 missing)
## S_LAR < 0.13595 to the left, improve=14.38966, (0 missing)
## CC_Mean < 0.08556 to the left, improve=13.86440, (0 missing)
## Surrogate splits:
## T_Mean < 15.745 to the left, agree=0.977, adj=0.846, (0 split)
## T_SD < 0.47315 to the left, agree=0.885, adj=0.231, (0 split)
## Sy_Mean < 0.1384 to the left, agree=0.874, adj=0.154, (0 split)
## R_SD < 0.18995 to the left, agree=0.874, adj=0.154, (0 split)
## P_SD < 1.245 to the left, agree=0.874, adj=0.154, (0 split)
##
## Node number 4: 218 observations, complexity param=0.006944444
## predicted class=0 expected loss=0.004587156 P(node) =0.6942675
## class counts: 217 1
## probabilities: 0.995 0.005
## left son=8 (208 obs) right son=9 (10 obs)
## Primary splits:
## T_LAR < 33.27 to the left, improve=3.131368, (0 missing)
## T_Mean < 22.455 to the left, improve=2.183594, (0 missing)
## S_LAR < 0.14185 to the left, improve=1.705924, (0 missing)
## CC_LAR < 0.21395 to the left, improve=1.632717, (0 missing)
## NC_LAR < 0.093205 to the left, improve=1.542765, (0 missing)
## Surrogate splits:
## T_Mean < 25.175 to the left, agree=0.977, adj=0.5, (0 split)
## T_SD < 2.2195 to the left, agree=0.959, adj=0.1, (0 split)
##
## Node number 5: 9 observations, complexity param=0.02777778
## predicted class=0 expected loss=0.4444444 P(node) =0.02866242
## class counts: 5 4
## probabilities: 0.556 0.444
## left son=10 (5 obs) right son=11 (4 obs)
## Primary splits:
## T_LAR < 26.9 to the left, improve=6.182654, (0 missing)
## A_LAR < 708.7 to the left, improve=6.182654, (0 missing)
## S_Mean < 0.095875 to the left, improve=3.680642, (0 missing)
## NC_Mean < 0.04271 to the left, improve=3.680642, (0 missing)
## A_SD < 18.345 to the left, improve=3.680642, (0 missing)
## Surrogate splits:
## A_LAR < 708.7 to the left, agree=1.000, adj=1.00, (0 split)
## S_Mean < 0.095875 to the left, agree=0.889, adj=0.75, (0 split)
## NC_Mean < 0.04271 to the left, agree=0.889, adj=0.75, (0 split)
## A_SD < 18.345 to the left, agree=0.889, adj=0.75, (0 split)
## CC_SD < 0.039445 to the right, agree=0.889, adj=0.75, (0 split)
##
## Node number 6: 13 observations
## predicted class=0 expected loss=0 P(node) =0.04140127
## class counts: 13 0
## probabilities: 1.000 0.000
##
## Node number 7: 74 observations, complexity param=0.03472222
## predicted class=1 expected loss=0.09459459 P(node) =0.2356688
## class counts: 7 67
## probabilities: 0.095 0.905
## left son=14 (15 obs) right son=15 (59 obs)
## Primary splits:
## NC_Mean < 0.048785 to the left, improve=8.000851, (0 missing)
## R_LAR < 16.8 to the left, improve=7.528256, (0 missing)
## S_LAR < 0.13525 to the left, improve=7.419656, (0 missing)
## CC_Mean < 0.080135 to the left, improve=7.093591, (0 missing)
## A_LAR < 868.2 to the left, improve=7.093591, (0 missing)
## Surrogate splits:
## CC_Mean < 0.076355 to the left, agree=0.932, adj=0.667, (0 split)
## CP_Mean < 0.072635 to the left, agree=0.865, adj=0.333, (0 split)
## CC_SD < 0.01873 to the left, agree=0.865, adj=0.333, (0 split)
## NC_LAR < 0.12105 to the left, agree=0.865, adj=0.333, (0 split)
## NC_SD < 0.0085795 to the left, agree=0.851, adj=0.267, (0 split)
##
## Node number 8: 208 observations
## predicted class=0 expected loss=0 P(node) =0.6624204
## class counts: 208 0
## probabilities: 1.000 0.000
##
## Node number 9: 10 observations, complexity param=0.006944444
## predicted class=0 expected loss=0.1 P(node) =0.03184713
## class counts: 9 1
## probabilities: 0.900 0.100
## left son=18 (9 obs) right son=19 (1 obs)
## Primary splits:
## R_LAR < 14.42 to the left, improve=3.25083, (0 missing)
## T_LAR < 33.56 to the right, improve=3.25083, (0 missing)
## P_LAR < 91.665 to the left, improve=3.25083, (0 missing)
## A_LAR < 643.25 to the left, improve=3.25083, (0 missing)
## NC_LAR < 0.09229 to the left, improve=3.25083, (0 missing)
##
## Node number 10: 5 observations
## predicted class=0 expected loss=0 P(node) =0.01592357
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 11: 4 observations
## predicted class=1 expected loss=0 P(node) =0.01273885
## class counts: 0 4
## probabilities: 0.000 1.000
##
## Node number 14: 15 observations, complexity param=0.03472222
## predicted class=1 expected loss=0.4 P(node) =0.0477707
## class counts: 6 9
## probabilities: 0.400 0.600
## left son=28 (5 obs) right son=29 (10 obs)
## Primary splits:
## CP_SD < 0.02017 to the right, improve=6.844345, (0 missing)
## NC_SD < 0.0105105 to the right, improve=6.844345, (0 missing)
## R_LAR < 16.825 to the left, improve=6.844345, (0 missing)
## Sy_Mean < 0.1862 to the right, improve=4.879643, (0 missing)
## CC_SD < 0.030115 to the right, improve=4.879643, (0 missing)
## Surrogate splits:
## NC_SD < 0.0105105 to the right, agree=1.000, adj=1.0, (0 split)
## Sy_Mean < 0.18025 to the right, agree=0.933, adj=0.8, (0 split)
## CC_SD < 0.030115 to the right, agree=0.933, adj=0.8, (0 split)
## CP_Mean < 0.097055 to the right, agree=0.867, adj=0.6, (0 split)
## Sy_SD < 0.014925 to the right, agree=0.867, adj=0.6, (0 split)
##
## Node number 15: 59 observations, complexity param=0.01388889
## predicted class=1 expected loss=0.01694915 P(node) =0.1878981
## class counts: 1 58
## probabilities: 0.017 0.983
## left son=30 (1 obs) right son=31 (58 obs)
## Primary splits:
## Sy_SD < 0.01062 to the left, improve=5.069015, (0 missing)
## T_LAR < 20.645 to the left, improve=5.069015, (0 missing)
## CC_LAR < 0.1894 to the left, improve=5.069015, (0 missing)
## NC_LAR < 0.1057 to the left, improve=5.069015, (0 missing)
## Sy_LAR < 0.21975 to the left, improve=5.069015, (0 missing)
##
## Node number 18: 9 observations
## predicted class=0 expected loss=0 P(node) =0.02866242
## class counts: 9 0
## probabilities: 1.000 0.000
##
## Node number 19: 1 observations
## predicted class=1 expected loss=0 P(node) =0.003184713
## class counts: 0 1
## probabilities: 0.000 1.000
##
## Node number 28: 5 observations
## predicted class=0 expected loss=0 P(node) =0.01592357
## class counts: 5 0
## probabilities: 1.000 0.000
##
## Node number 29: 10 observations, complexity param=0.01388889
## predicted class=1 expected loss=0.1 P(node) =0.03184713
## class counts: 1 9
## probabilities: 0.100 0.900
## left son=58 (1 obs) right son=59 (9 obs)
## Primary splits:
## Sy_Mean < 0.14685 to the left, improve=3.25083, (0 missing)
## F_SD < 0.001342 to the left, improve=3.25083, (0 missing)
## R_LAR < 16.675 to the left, improve=3.25083, (0 missing)
## P_LAR < 107.3 to the left, improve=3.25083, (0 missing)
## A_LAR < 839.85 to the left, improve=3.25083, (0 missing)
##
## Node number 30: 1 observations
## predicted class=0 expected loss=0 P(node) =0.003184713
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 31: 58 observations
## predicted class=1 expected loss=0 P(node) =0.1847134
## class counts: 0 58
## probabilities: 0.000 1.000
##
## Node number 58: 1 observations
## predicted class=0 expected loss=0 P(node) =0.003184713
## class counts: 1 0
## probabilities: 1.000 0.000
##
## Node number 59: 9 observations
## predicted class=1 expected loss=0 P(node) =0.02866242
## class counts: 0 9
## probabilities: 0.000 1.000
Based on the full-depth decision tree, the major predictors of diagnosis are:
- P_LAR (largest perimeter), the root-node split variable; this single split already reduces the relative error from 1.000 to 0.347.
- More generally, the variable importance table in summary() ranks P_LAR, A_LAR (largest area) and R_LAR (largest radius) highest (importance 14, 14 and 13), followed by the corresponding mean features P_Mean, A_Mean and R_Mean (11 each), so the largest-value measurements are the major predictors of the diagnosis.
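The importance scores can also be read straight from the fitted object instead of the full
summary() output (a small sketch; variable.importance is a standard component of an rpart fit
with at least one split):
#Variable importance scores stored in the fitted tree, highest first
round(C_R_full$variable.importance, 2)
The largest-value features (P_LAR, A_LAR, R_LAR) head this ranking, followed by the corresponding
mean features, which supports the choice of major predictors above.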
(c) Give two strong rules that describe who is likely to have cancer. Please justify your
choices.
rules = apriori(data = train_final, parameter = list(supp = 0.1, conf = 0.8))
## Warning: Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
## 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 not logical or factor.
## Applying default discretization (see '? discretizeDF').
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 31
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[92 item(s), 314 transaction(s)] done [0.00s].
## sorting and recoding items ... [92 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(data = train_final, parameter = list(supp = 0.1, conf =
## 0.8)): Mining stopped (maxlen reached). Only patterns up to a length of 10
## returned!
## done [0.15s].
## writing ... [1377687 rule(s)] done [0.46s].
## creating S4 object ... done [0.62s].
inspect(rules[1:10])
##      lhs            rhs                      support   confidence coverage  lift     count
## [1]  {Tissues=1} => {CC_LAR=[0.252,0.773]}   0.2038217 0.8888889  0.2292994 2.658201 64
## [2]  {Tissues=1} => {NC_LAR=[0.109,0.254]}   0.2197452 0.9583333  0.2292994 2.865873 69
## [3]  {Tissues=1} => {R_Mean=[13.8,20.6]}     0.2006369 0.8750000  0.2292994 2.616667 63
## [4]  {Tissues=1} => {P_Mean=[88.9,138]}      0.2038217 0.8888889  0.2292994 2.658201 64
## [5]  {Tissues=1} => {A_Mean=[586,1.32e+03]}  0.1974522 0.8611111  0.2292994 2.575132 62
## [6]  {Tissues=1} => {NC_Mean=[0.0374,0.126]} 0.2101911 0.9166667  0.2292994 2.741270 66
## [7]  {Tissues=1} => {CC_Mean=[0.0666,0.242]} 0.2038217 0.8888889  0.2292994 2.658201 64
## [8]  {Tissues=1} => {A_LAR=[738,1.87e+03]}   0.2229299 0.9722222  0.2292994 2.907407 70
## [9]  {Tissues=1} => {P_LAR=[102,166]}        0.2261146 0.9861111  0.2292994 2.948942 71
## [10] {Tissues=1} => {R_LAR=[15.5,24.6]}      0.2229299 0.9722222  0.2292994 2.907407 70
Using the output above, two strong rules are:
1) {Tissues=1} => {P_LAR=[102,166]}: 98.6% of the malignant cases have their largest perimeter value in the range [102, 166] (confidence 0.986, lift 2.95, count 71).
2) {Tissues=1} => {R_LAR=[15.5,24.6]}: 97.2% of the malignant cases have their largest radius value in the range [15.5, 24.6] (confidence 0.972, lift 2.91, count 70).
Both rules combine very high confidence with a lift close to 3, meaning these feature ranges are about three times as common among malignant cases as in the data overall, which justifies choosing them.
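Rather than reading only the first ten rules, the rules involving the malignant class can be
filtered and ranked explicitly (a sketch using the arules object mined above; the name
rules_malignant is only for illustration):
#Keep rules whose antecedent contains the malignant class, then rank by confidence
rules_malignant = subset(rules, subset = lhs %in% "Tissues=1")
inspect(head(sort(rules_malignant, by = "confidence"), 5))
This puts the strongest associations with malignant tissue at the top of the listing, which is
how rules like the two above can be singled out.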
(d) What is the accuracy of your decision tree model on the training data? What is the
accuracy of this model on the test data?
#prediction for train data
train_predicted = predict(C_R_full,train_final,type = "class")
print(train_predicted)
## 2 3 4 5 7 8 10 11 12 14 15 16 17 18 19 21 22 24
25 26
## 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0
1 1
## 27 28 30 31 34 37 38 40 41 43 45 46 48 49 50 52 53 54
56 59
## 1 0 1 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0
0 0
## 60 61 63 64 65 71 73 74 75 76 78 79 80 82 83 86 88 89
90 91
## 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0
0 0
## 92 94 95 96 100 102 104 105 106 107 109 111 112 115 117 118 119 121
122 124
## 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0
1 0
## 126 127 128 130 131 132 133 134 135 136 138 139 140 142 143 145 146 147
150 151
## 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
0 0
## 152 154 155 156 158 160 162 163 164 166 167 168 169 171 172 173 174 175
176 177
## 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
0 0
## 178 180 181 182 183 187 188 189 190 191 192 194 195 198 201 202 204 205
207 208
## 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1
0 1
## 212 213 215 216 217 218 221 222 223 224 225 227 228 229 230 231 232 233
234 235
## 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0
## 236 237 238 239 240 242 243 244 245 248 249 250 252 253 254 255 256 257
260 261
## 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1
0 1
## 266 268 272 274 275 276 277 279 282 283 284 285 286 287 289 291 292 293
294 296
## 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1
0 1
## 297 300 301 302 303 305 306 307 310 311 312 314 319 321 322 323 324 328
329 330
## 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0
## 332 333 335 336 338 339 340 341 342 343 345 346 347 348 350 351 352 354
355 356
## 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0
0 0
## 357 358 359 360 362 363 364 365 366 369 370 371 372 373 374 375 377 378
379 380
## 0 0 0 0 1 0 1 0 1 1 0 0 1 0 0 1 1 0
0 0
## 381 382 383 384 385 386 387 388 389 390 392 394 396 398 399 400 401 402
403 405
## 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
0 0
## 410 411 413 414 415 416 417 418 420 421 422 423 425 426 427 428 431 432
433 434
## 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0
1 0
## 435 437 438 439 444 445 446 447 448 449 451 453 454 455
## 1 1 0 0 0 0 0 0 0 0 1 0 0 1
## Levels: 0 1
mean(train_final$Tissues == train_predicted)
## [1] 1
#Confusion Matrix
confusionMatrix(train_predicted,train_final$Tissues)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 242 0
## 1 0 72
##
## Accuracy : 1
## 95% CI : (0.9883, 1)
## No Information Rate : 0.7707
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.7707
## Detection Rate : 0.7707
## Detection Prevalence : 0.7707
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
#Prediction for test data
test_predicted_class = predict(C_R_full,testXY, type = "class")
print(test_predicted_class)
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
##  1  0  0  0  0  1  0  1  0  0  0  0  0  1  1  0  0  0  1  0  0  0  0  0  0  0
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
##  0  0  1  1  0  0  0  1  1  1  0  0  0  0  1  1  1  0  0  0  1  0  1  1  0  0
## 53 54 55 56 57
##  0  1  1  0  0
## Levels: 0 1
#Confusion Matrix
confusionMatrix(test_predicted_class, testXY$Tissue)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 30 8
## 1 2 17
##
## Accuracy : 0.8246
## 95% CI : (0.7009, 0.9125)
## No Information Rate : 0.5614
## P-Value [Acc > NIR] : 2.515e-05
##
## Kappa : 0.6341
##
## Mcnemar's Test P-Value : 0.1138
##
## Sensitivity : 0.9375
## Specificity : 0.6800
## Pos Pred Value : 0.7895
## Neg Pred Value : 0.8947
## Prevalence : 0.5614
## Detection Rate : 0.5263
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.8088
##
## 'Positive' Class : 0
##
(e) Is it possible to improve the performance of your model?
Yes. The full-depth tree fits the training data perfectly (accuracy = 1), which points to
overfitting, while its accuracy on the test data is only about 82.5%. Pruning the tree, which was
fit on the training data, should reduce the overfitting and improve performance on the test data.
#Prune the full tree and evaluate it on the test data
printcp(C_R_full)
##
## Classification tree:
## rpart(formula = Tissues ~ ., data = train_final, parms = list(split = "information"),
##     control = rpart.control(minsplit = 0, minbucket = 0, cp = -1))
##
## Variables actually used in tree construction:
## [1] CC_LAR CP_SD NC_Mean P_LAR R_LAR Sy_Mean Sy_SD T_LAR
##
## Root node error: 72/314 = 0.2293
##
## n= 314
##
## CP nsplit rel error xerror xstd
## 1 0.6527778 0 1.000000 1.00000 0.103461
## 2 0.1805556 1 0.347222 0.47222 0.076475
## 3 0.0347222 2 0.166667 0.34722 0.066623
## 4 0.0277778 4 0.097222 0.30556 0.062821
## 5 0.0138889 6 0.041667 0.31944 0.064123
## 6 0.0069444 8 0.013889 0.34722 0.066623
## 7 -1.0000000 10 0.000000 0.34722 0.066623
pruned = prune(C_R_full,cp = 0.65)
fancyRpartPlot(pruned)
pruned.pred = predict(pruned, testXY, type = 'class')
confusionMatrix(testXY$Tissue, pruned.pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 28 4
## 1 3 22
##
## Accuracy : 0.8772
## 95% CI : (0.7632, 0.9492)
## No Information Rate : 0.5439
## P-Value [Acc > NIR] : 7.681e-08
##
## Kappa : 0.7517
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9032
## Specificity : 0.8462
## Pos Pred Value : 0.8750
## Neg Pred Value : 0.8800
## Prevalence : 0.5439
## Detection Rate : 0.4912
## Detection Prevalence : 0.5614
## Balanced Accuracy : 0.8747
##
## 'Positive' Class : 0
##
print(pruned.pred)
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
##  0  0  0  1  0  1  0  1  0  0  1  1  0  0  1  1  0  0  1  0  1  0  0  0  0  0
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
##  1  0  1  1  0  0  0  1  1  1  0  0  0  0  1  1  1  1  0  0  1  1  1  1  0  0
## 53 54 55 56 57
##  0  1  1  1  0
## Levels: 0 1
View(pruned.pred)
Pruning increased the test accuracy from about 82.5% to 87.7%.
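The cp value of 0.65 used above was chosen by hand from the cp table and prunes the tree back very
aggressively. A more automatic alternative (a sketch, not what was run above; best_cp and
pruned_cv are illustrative names) is to prune at the cp with the lowest cross-validated error:
#Pick the cp with the smallest cross-validated error (xerror) and prune there
best_cp = C_R_full$cptable[which.min(C_R_full$cptable[, "xerror"]), "CP"]
pruned_cv = prune(C_R_full, cp = best_cp)
confusionMatrix(predict(pruned_cv, testXY, type = "class"), testXY$Tissue)
For the cp table above this selects cp = 0.0278 (the 4-split tree with xerror = 0.30556), a middle
ground between the full tree and the heavily pruned tree used above.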
(f) Construct the best possible decision tree to predict the Y labels. Explain how you
construct such a tree.
C_R_full_gini = rpart(Tissues~., data = train_final, parms = list(split =
"gini"), control = rpart.control(minsplit = 0, minbucket = 0, cp = -1))
summary(C_R_full_gini)
## Call:
## rpart(formula = Tissues ~ ., data = train_final, parms = list(split = "gini"),
##     control = rpart.control(minsplit = 0, minbucket = 0, cp = -1))
## n= 314
## n= 314
##
## CP nsplit rel error xerror xstd
## 1 0.652777778 0 1.00000000 1.0000000 0.10346099
## 2 0.180555556 1 0.34722222 0.4722222 0.07647529
## 3 0.034722222 2 0.16666667 0.3194444 0.06412289
## 4 0.027777778 4 0.09722222 0.2777778 0.06010233
## 5 0.013888889 5 0.06944444 0.2638889 0.05868005
## 6 0.006944444 8 0.02777778 0.2638889 0.05868005
## 7 -1.000000000 12 0.00000000 0.2638889 0.05868005
##