In [1]:
import numpy as np
import efficient_cancer_data as ecd

In [2]:
# Load the training design matrix and labels through the project's ecd helper.
A, b = ecd.read_training_data('train.data')

# Fit the linear model: ecd.gram_schmidt_qr returns the least-squares
# coefficient vector for the system A x ~ b.
x = ecd.gram_schmidt_qr(A, b)

# List the fitted coefficients, numbered from 1, with four decimal places.
for idx, coef in enumerate(x, start=1):
    print(f"{idx}. {coef:.4f}")


1. -0.8699
2. 0.0243
3. 0.0627
4. 0.0033
5. 8.7903
6. -1.7471
7. 0.2028
8. 6.5064
9. -5.0618
10. -49.1675
11. 0.9566
12. 0.0821
13. 0.0079
14. -0.0050
15. 27.8419
16. -3.3015
17. -4.9860
18. 16.3189
19. -10.3163
20. 21.3322
21. 0.4086
22. 0.0033
23. 0.0007
24. -0.0025
25. -4.5314
26. -0.5901
27. 0.7194
28. 2.1590
29. 3.8035
30. 12.2984


In [3]:
def _error_stats(predicted_labels, true_labels):
    """Score predicted labels against the true labels.

    The two sequences are walked pairwise and compared with ``!=``.

    Returns a tuple ``(count, percentage, accuracy)`` where ``count`` is the
    number of mismatching pairs, ``percentage`` is ``100 * count`` divided by
    ``len(true_labels)``, and ``accuracy`` is ``100 - percentage``.
    """
    wrong = sum(1 for guess, truth in zip(predicted_labels, true_labels) if guess != truth)
    pct_wrong = 100 * wrong / len(true_labels)
    return wrong, pct_wrong, 100 - pct_wrong


# --- Validation data: load, apply the linear model, classify at threshold 0 ---
A_val, b_val = ecd.read_validation_data('validate.data')
predictions = A_val @ x
classifications = ecd.classify(predictions, threshold=0)
misclassifications_val, percentage_misclassified_val, accuracy_val = _error_stats(
    classifications, b_val
)

# --- Training data: same pipeline, reusing A and b from the training cell ---
# NOTE(review): the recorded run reports 48.67% misclassification here versus
# 3.08% on validation, which is unusual for a model scored on the data it was
# fitted to. Confirm that ecd.gram_schmidt_qr leaves A and b unmodified and
# that the training labels use the same encoding as ecd.classify's output.
predictions_train = A @ x
classifications_train = ecd.classify(predictions_train, threshold=0)
misclassifications_train, percentage_misclassified_train, accuracy_train = _error_stats(
    classifications_train, b
)

In [4]:
# Report the misclassification percentage and accuracy rate for the training
# and validation data sets.

print(f"Percentage of misclassifications on training data: {percentage_misclassified_train:.2f}%, Accuracy rate: {accuracy_train:.2f}%")

print(f"Percentage of misclassifications on validation data: {percentage_misclassified_val:.2f}%, Accuracy rate: {accuracy_val:.2f}%")

# Compare the validation misclassification percentage with the training one.
# Both operands are misclassification percentages (not success rates), so the
# messages name that quantity on both sides of the comparison.
if percentage_misclassified_val > percentage_misclassified_train:
    print("Percentage of misclassifications on validation data is greater than on training data.")
elif percentage_misclassified_val < percentage_misclassified_train:
    print("Percentage of misclassifications on validation data is smaller than on training data.")
else:
    print("Percentage of misclassifications on validation data is equal to that on training data.")

Percentage of misclassifications on training data: 48.67%, Accuracy rate: 51.33%
Percentage of misclassifications on validation data: 3.08%, Accuracy rate: 96.92%
Percentage of misclassifications on validation data is smaller than success rate on training data.
