In [1]:
import numpy as np
import efficient_cancer_data as ecd

In [7]:
# Load the training set through the project helper. A and b are whatever
# read_training_data returns (presumably the feature matrix and the label
# vector -- confirm in efficient_cancer_data).
A, b = ecd.read_training_data('train.data')

# Fit the linear model: the helper's Gram-Schmidt QR routine is used to obtain
# the least-squares coefficient vector for A and b.
x = ecd.gram_schmidt_qr(A, b)

# Report every fitted coefficient, numbered from 1, with four decimal places.
for position, weight in enumerate(x, start=1):
    print(f"Coefficient for x{position}: {weight:.4f}")


Coefficient for x1: -0.8699
Coefficient for x2: 0.0243
Coefficient for x3: 0.0627
Coefficient for x4: 0.0033
Coefficient for x5: 8.7903
Coefficient for x6: -1.7471
Coefficient for x7: 0.2028
Coefficient for x8: 6.5064
Coefficient for x9: -5.0618
Coefficient for x10: -49.1675
Coefficient for x11: 0.9566
Coefficient for x12: 0.0821
Coefficient for x13: 0.0079
Coefficient for x14: -0.0050
Coefficient for x15: 27.8419
Coefficient for x16: -3.3015
Coefficient for x17: -4.9860
Coefficient for x18: 16.3189
Coefficient for x19: -10.3163
Coefficient for x20: 21.3322
Coefficient for x21: 0.4086
Coefficient for x22: 0.0033
Coefficient for x23: 0.0007
Coefficient for x24: -0.0025
Coefficient for x25: -4.5314
Coefficient for x26: -0.5901
Coefficient for x27: 0.7194
Coefficient for x28: 2.1590
Coefficient for x29: 3.8035
Coefficient for x30: 12.2984


In [9]:
def evaluate_linear_classifier(features, labels, coefficients, threshold=0):
    """Apply the fitted linear model to a data set and measure its error.

    Args:
        features: Data matrix; it is multiplied by ``coefficients`` with ``@``.
        labels: True labels, one per row of ``features``.
        coefficients: Coefficient vector of the linear model.
        threshold: Cut-off forwarded to ``ecd.classify`` (default 0).

    Returns:
        A tuple ``(scores, predicted, n_wrong, pct_wrong)`` where ``scores`` is
        ``features @ coefficients``, ``predicted`` is the result of
        ``ecd.classify`` on those scores, ``n_wrong`` is the number of
        positions where the predicted label differs from the true label, and
        ``pct_wrong`` is ``n_wrong`` expressed as a percentage of
        ``len(labels)``.
    """
    scores = features @ coefficients
    predicted = ecd.classify(scores, threshold=threshold)

    # Walk the (predicted, true) label pairs and count those that disagree:
    # the generator yields a 1 for every mismatch and `sum` adds them up.
    n_wrong = sum(1 for p, q in zip(predicted, labels) if p != q)
    pct_wrong = 100 * n_wrong / len(labels)
    return scores, predicted, n_wrong, pct_wrong


# read validation data
A_val, b_val = ecd.read_validation_data('validate.data')

# Evaluate the model on the validation data (classification threshold of 0).
(predictions, classifications,
 misclassifications_val, percentage_misclassified_val) = evaluate_linear_classifier(A_val, b_val, x)
accuracy_val = 100 - percentage_misclassified_val

# Do the same process for the training data.
# NOTE(review): A was previously passed to ecd.gram_schmidt_qr; if that routine
# modifies its argument in place, the training figures below would be computed
# from altered data -- confirm against the helper's implementation.
(predictions_train, classifications_train,
 misclassifications_train, percentage_misclassified_train) = evaluate_linear_classifier(A, b, x)
accuracy_train = 100 - percentage_misclassified_train

In [4]:
# Report the misclassification percentage and accuracy rate for the training
# and validation data, then state how the two error percentages compare.
#
# NOTE(review): this cell carries execution count 4 while the cells that define
# the variables used here carry counts 7 and 9, so any output stored with the
# notebook may reflect an earlier kernel state. Re-run the notebook from a
# fresh kernel (Restart Kernel -> Run All) before relying on the numbers.

print(f"Percentage of misclassifications on training data: {percentage_misclassified_train:.2f}%, Accuracy rate: {accuracy_train:.2f}%")

print(f"Percentage of misclassifications on validation data: {percentage_misclassified_val:.2f}%, Accuracy rate: {accuracy_val:.2f}%")

# Both operands of the comparison are misclassification percentages, so the
# message names that same quantity on both sides (like compared with like).
if percentage_misclassified_val > percentage_misclassified_train:
    relation = "greater than"
elif percentage_misclassified_val < percentage_misclassified_train:
    relation = "smaller than"
else:
    relation = "equal to"

print(f"Percentage of misclassifications on validation data is {relation} the percentage of misclassifications on training data.")

Percentage of misclassifications on training data: 48.67%, Accuracy rate: 51.33%
Percentage of misclassifications on validation data: 3.08%, Accuracy rate: 96.92%
Percentage of misclassifications on validation data is smaller than success rate on training data.
