In [96]:
# Library 
import pandas as pd
import numpy as np

# Activity 1-4

In [97]:
# Load the mutation table from a CSV file at a path relative to the
# notebook's working directory. The first column is used as the
# sample identifier by the cells below.
df = pd.read_csv('./data/mutationc.csv')

In [98]:
# Ground-truth labels derived from the sample identifier (first column):
# identifiers of the form "C<digits>" are labelled 1 (Cancer); every other
# identifier is labelled 0 (Non-Cancer).
identifier_col = df.columns[0]
is_cancer_id = df[identifier_col].str.match(r'^C\d+$')
actual = is_cancer_id.astype(int)
print(actual)

0      0
1      0
2      0
3      0
4      1
5      1
6      0
7      1
8      1
9      1
10     1
11     1
12     0
13     1
14     0
15     0
16     0
17     0
18     1
19     0
20     1
21     1
22     0
23     0
24     0
25     1
26     0
27     1
28     1
29     0
30     0
31     1
32     0
33     1
34     0
35     1
36     0
37     0
38     0
39     1
40     1
41     0
42     1
43     0
44     0
45     0
46     1
47     1
48     1
49     1
50     1
51     0
52     0
53     1
54     1
55     1
56     0
57     1
58     1
59     0
60     1
61     1
62     1
63     0
64     1
65     1
66     1
67     1
68     1
69     1
70     0
71     1
72     1
73     1
74     0
75     1
76     1
77     1
78     0
79     0
80     0
81     1
82     0
83     0
84     1
85     0
86     1
87     0
88     1
89     1
90     0
91     1
92     0
93     0
94     0
95     1
96     0
97     1
98     1
99     0
100    1
101    0
102    0
103    1
104    1
105    1
106    1
107    0
108    0
109    1
110    0
1

In [99]:
# Build the binary feature matrix: drop the identifier column, then map
# every value > 0 to 1 and everything else to 0.
mutation_data = df.drop(columns=[identifier_col]).gt(0).astype(int)
features = list(mutation_data.columns)
print(features)
print(mutation_data[features[0]])

['HEPACAM_GRCh37_11:124794736-124794736_Silent_SNP_G-G-A_G-G-T', 'LRR1_GRCh37_14:50074349-50074349_Missense-Mutation_SNP_C-C-T', 'KRT34_GRCh37_17:39538253-39538253_Silent_SNP_C-C-T', 'DNAH2_GRCh37_17:7721317-7721317_Silent_SNP_C-C-T', 'RSPH1_GRCh37_21:43897435-43897435_Silent_SNP_C-C-T', 'APC_GRCh37_5:112174117-112174117_Frame-Shift-Del_DEL_T-T--', 'HDAC9_GRCh37_7:18767220-18767220_Silent_SNP_G-G-A', 'THYN1_GRCh37_11:134121181-134121181_Missense-Mutation_SNP_C-C-T', 'TENM4_GRCh37_11:78383150-78383150_Silent_SNP_G-G-A', 'KCNA10_GRCh37_1:111060872-111060872_Missense-Mutation_SNP_G-G-A', 'TNN_GRCh37_1:175104993-175104993_Missense-Mutation_SNP_C-C-T', 'APC_GRCh37_5:112175676-112175677_Frame-Shift-Del_DEL_AG-AG--', 'TRPC7_GRCh37_5:135551948-135551948_Missense-Mutation_SNP_G-G-A', 'SULT6B1_GRCh37_2:37406712-37406712_Nonsense-Mutation_SNP_G-G-A', 'SOX11_GRCh37_2:5832900-5832900_Missense-Mutation_SNP_G-G-A_G-G-T', 'NME9_GRCh37_3:138037034-138037034_Missense-Mutation_SNP_C-C-T', 'FBXW7_GRCh37_4

In [100]:
def find_best_feature_by_tpMinusfp(df, actual):
    """Return the column of ``df`` with the highest (TP - FP) score.

    Each column is treated as a one-feature classifier: a value > 0 is a
    positive prediction. The score of a column is the number of true
    positives minus the number of false positives against ``actual``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns to evaluate; rows correspond to samples.
    actual : pandas.Series
        Labels aligned with the rows of ``df`` (1 = positive, 0 = negative).

    Returns
    -------
    tuple
        ``(best_feature, best_score)``. On ties the first column in
        ``df.columns`` order wins. If ``df`` has no columns the result is
        ``(None, -1)``.
    """
    # The label masks do not depend on the feature, so compute them once.
    is_positive = actual == 1
    is_negative = actual == 0

    best_feature = None
    # None means "no candidate seen yet". Starting from a numeric sentinel
    # such as -1 would wrongly reject every feature whose score is <= -1.
    best_score = None
    for feature in df.columns:
        predicted_positive = df[feature] > 0  # Treat >0 as a positive call
        tp = (predicted_positive & is_positive).sum()  # True Positives
        fp = (predicted_positive & is_negative).sum()  # False Positives
        score = tp - fp
        if best_score is None or score > best_score:
            best_score = score
            best_feature = feature

    if best_score is None:
        # No columns to evaluate: keep the historical sentinel result.
        return None, -1
    return best_feature, best_score

In [101]:
# Find the best feature using the TP - FP criterion over the full binary
# feature matrix. This feature becomes the root split of the decision tree
# built in the following cells.
root_feature, root_score = find_best_feature_by_tpMinusfp(mutation_data, actual)
print(f"Best feature by (TP - FP): {root_feature} with TP - FP = {root_score}")

Best feature by (TP - FP): BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T with TP - FP = 23


In [102]:
# Divide the samples into two groups based on the root feature.
# The split is made on the binarized matrix (mutation_data), which uses the
# same "> 0" rule as classify_sample. Splitting the raw frame with == 1 / == 0
# would leave any sample whose raw value is neither 0 nor 1 (e.g. a count of
# 2, or a missing value) out of both groups.
has_root_mutation = mutation_data[root_feature] == 1
groupA = mutation_data[has_root_mutation].drop(columns=[root_feature])
groupB = mutation_data[~has_root_mutation].drop(columns=[root_feature])

# Labels for each group, selected with the same masks so rows stay aligned
actual_A = actual[has_root_mutation]
actual_B = actual[~has_root_mutation]

# Find best feature for each group
best_feature_A, best_score_A = find_best_feature_by_tpMinusfp(groupA, actual_A)
best_feature_B, best_score_B = find_best_feature_by_tpMinusfp(groupB, actual_B)
print(f"Best feature in Group A by (TP - FP): {best_feature_A} with TP - FP = {best_score_A}")
print(f"Best feature in Group B by (TP - FP): {best_feature_B} with TP - FP = {best_score_B}")

Best feature in Group A by (TP - FP): DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C with TP - FP = 17
Best feature in Group B by (TP - FP): KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G with TP - FP = 8


In [103]:
# decision tree rule
def classify_sample(sample_row):
    """Classify one sample with the two-level decision tree.

    The root feature selects which second-level feature to inspect:
    ``best_feature_A`` when the root feature is > 0, otherwise
    ``best_feature_B``. The sample is labelled 'C' (Cancer) when the selected
    feature is > 0 and 'NC' (Non-Cancer) otherwise.

    Reads the notebook-level names ``root_feature``, ``best_feature_A`` and
    ``best_feature_B``.
    """
    branch_feature = best_feature_A if sample_row[root_feature] > 0 else best_feature_B
    return 'C' if sample_row[branch_feature] > 0 else 'NC'

# Activity 5

In [104]:
# Accuracy of each feature used alone as a classifier: the fraction of
# samples whose binary feature value equals the actual label.
accuracies = {
    feature: (mutation_data[feature] == actual).mean()
    for feature in features
}

# Rank (feature, accuracy) pairs from highest to lowest accuracy; ties keep
# their original feature order because the sort is stable.
ranked_accuracies = sorted(accuracies.items(), key=lambda pair: pair[1], reverse=True)
sorted_accuracies = dict(ranked_accuracies)

# Display the ten most accurate features
for feature, accuracy in ranked_accuracies[:10]:
    print(f'Feature: {feature}, Accuracy: {accuracy:.4f}')

Feature: BRAF_GRCh37_7:140453136-140453136_Missense-Mutation_SNP_A-A-T, Accuracy: 0.5970
Feature: DRD5_GRCh37_4:9785349-9785349_3'UTR_SNP_G-G-C, Accuracy: 0.5672
Feature: ZBTB20_GRCh37_3:114058003-114058003_Frame-Shift-Del_DEL_G-G--, Accuracy: 0.5373
Feature: DOCK3_GRCh37_3:51417604-51417604_Frame-Shift-Del_DEL_C-C--, Accuracy: 0.5274
Feature: DRD5_GRCh37_4:9785421-9785421_3'UTR_SNP_G-G-T, Accuracy: 0.5274
Feature: KRAS_GRCh37_12:25398284-25398284_Missense-Mutation_SNP_C-C-A_C-C-T_C-C-G, Accuracy: 0.5224
Feature: SVIL_GRCh37_10:29760116-29760116_Frame-Shift-Del_DEL_C-C--, Accuracy: 0.5174
Feature: KRAS_GRCh37_12:25398281-25398281_Missense-Mutation_SNP_C-C-T, Accuracy: 0.5174
Feature: RAB28_GRCh37_4:13485808-13485808_5'UTR_DEL_G-G--, Accuracy: 0.5174
Feature: PLEKHA6_GRCh37_1:204228411-204228411_Frame-Shift-Del_DEL_C-C--, Accuracy: 0.5174


In [105]:
# Run the decision tree over every sample (one row at a time) and show the
# predicted label next to each sample identifier.
pd.set_option('display.max_rows', None)      # Show all rows
predictions = mutation_data.apply(classify_sample, axis=1)
result_df = df[[identifier_col]].assign(Prediction=predictions)
print(result_df)

    Unnamed: 0 Prediction
0          NC0         NC
1          NC1         NC
2          NC2          C
3          NC3         NC
4           C0          C
5           C1          C
6          NC4         NC
7           C2          C
8           C3         NC
9           C4          C
10          C5         NC
11          C6          C
12         NC5          C
13          C7          C
14         NC6         NC
15         NC7         NC
16         NC8         NC
17         NC9         NC
18          C8         NC
19        NC10         NC
20          C9          C
21         C10         NC
22        NC11         NC
23        NC12         NC
24        NC13         NC
25         C11         NC
26        NC14         NC
27         C12          C
28         C13          C
29        NC15         NC
30        NC16          C
31         C14         NC
32        NC17         NC
33         C15          C
34        NC18         NC
35         C16         NC
36        NC19         NC
37        NC

In [106]:
def confusion_matrix(predictions, actual):
    """Count confusion-matrix cells for 'C'/'NC' predictions against 1/0 labels.

    Parameters
    ----------
    predictions : sequence of 'C' (Cancer) / 'NC' (Non-Cancer) labels.
    actual : matching sequence of 1 (Cancer) / 0 (Non-Cancer) labels.

    Returns
    -------
    tuple
        ``(tp, tn, fp, fn)`` counts.
    """
    called_cancer = predictions == 'C'
    called_non_cancer = predictions == 'NC'
    truly_cancer = actual == 1
    truly_non_cancer = actual == 0

    tp = (called_cancer & truly_cancer).sum()          # True Positives
    tn = (called_non_cancer & truly_non_cancer).sum()  # True Negatives
    fp = (called_cancer & truly_non_cancer).sum()      # False Positives
    fn = (called_non_cancer & truly_cancer).sum()      # False Negatives
    return tp, tn, fp, fn

In [107]:
def metric_reports(tp, tn, fp, fn):
    """Compute classification metrics from confusion-matrix counts.

    Parameters
    ----------
    tp, tn, fp, fn : true positive, true negative, false positive and
        false negative counts.

    Returns
    -------
    tuple
        ``(accuracy, recall, specificity, precision, miss_rate, fdr,
        for_rate)``. Any metric whose denominator is not positive is
        reported as 0.
    """
    def guarded_ratio(numerator, denominator):
        # Avoid dividing by zero when a class or prediction is absent.
        return numerator / denominator if denominator > 0 else 0

    # Accuracy
    accuracy = guarded_ratio(tp + tn, tp + tn + fp + fn)
    # Sensitivity/Recall/True Positive Rate (TPR)
    recall = guarded_ratio(tp, tp + fn)
    # Specificity/Selectivity/True Negative Rate (TNR)
    specificity = guarded_ratio(tn, tn + fp)
    # Precision/Positive Predictive Value (PPV)
    precision = guarded_ratio(tp, tp + fp)
    # Miss Rate/False Negative Rate (FNR)
    miss_rate = guarded_ratio(fn, fn + tp)
    # False Discovery Rate (FDR)
    fdr = guarded_ratio(fp, fp + tp)
    # False Omission Rate (FOR)
    for_rate = guarded_ratio(fn, fn + tn)

    return accuracy, recall, specificity, precision, miss_rate, fdr, for_rate

In [108]:
# Metrics for classification tree
binary_predictions = (predictions == 'C').astype(int)  # Convert 'C'/'NC' to 1/0
tp, tn, fp, fn = confusion_matrix(predictions, actual)

print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")

# Confusion matrix table (rows = actual class, columns = predicted class).
# It is stored under its own name so the confusion_matrix() function is not
# overwritten; rebinding that name to a DataFrame would make this cell fail
# with "'DataFrame' object is not callable" when it is run a second time.
confusion_df = pd.DataFrame({
    'Predicted_C': [tp, fp],
    'Predicted_NC': [fn, tn]
}, index=['Actual_C', 'Actual_NC'])
print("Confusion Matrix:")
print(confusion_df)

# Summary metrics derived from the four confusion-matrix counts
accuracy, sensitivity, specificity, precision, miss_rate, fdr, for_rate = metric_reports(tp, tn, fp, fn)
print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall, TPR): {sensitivity:.4f}")
print(f"Specificity (TNR): {specificity:.4f}")
print(f"Precision (PPV): {precision:.4f}")
print(f"Miss Rate (FNR): {miss_rate:.4f}")
print(f"False Discovery Rate (FDR): {fdr:.4f}")
print(f"False Omission Rate (FOR): {for_rate:.4f}")

TP: 44, TN: 78, FP: 19, FN: 60
Confusion Matrix:
           Predicted_C  Predicted_NC
Actual_C            44            60
Actual_NC           19            78
Accuracy: 0.6070
Sensitivity (Recall, TPR): 0.4231
Specificity (TNR): 0.8041
Precision (PPV): 0.6984
Miss Rate (FNR): 0.5769
False Discovery Rate (FDR): 0.3016
False Omission Rate (FOR): 0.4348
