In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the data points as strings and the class labels as integers
# from the comma-separated homework files.
X = np.genfromtxt("hw01_data_points.csv", delimiter=",", dtype=str)
y = np.genfromtxt("hw01_class_labels.csv", delimiter=",", dtype=int)

In [3]:
def train_test_split(X, y, train_size=50000):
    """Split X and y sequentially into a training part and a test part.

    The first ``train_size`` entries go to the training split and all
    remaining entries go to the test split. The data is not shuffled.

    Parameters
    ----------
    X : sliceable sequence (e.g. numpy array)
        Data points, one per row.
    y : sliceable sequence (e.g. numpy array)
        Labels, sliced at the same position as X.
    train_size : int, optional
        Number of leading entries assigned to the training split.
        Defaults to 50000, the split point that was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    # Slicing reminder:
    # array[row_start:row_stop:row_step, col_start:col_stop:col_step]
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    return X_train, y_train, X_test, y_test

X_train, y_train, X_test, y_test = train_test_split(X, y)

# Report the shape of each split, one per line:
# training data, training labels, test data, test labels.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(50000, 21)
(50000,)
(44727, 21)
(44727,)


In [4]:
def estimate_prior_probabilities(y):
    """Return the fraction of labels in y equal to 1 and equal to 2.

    Parameters
    ----------
    y : numpy array of class labels

    Returns
    -------
    list
        Two values: the share of entries equal to 1, then the share of
        entries equal to 2.
    """
    priors = []
    for label in (1, 2):
        # Mean of a boolean mask is the fraction of matching entries.
        matches = (y == label)
        priors.append(np.mean(matches))
    return priors

# Estimate the class priors from the training labels and print them.
class_priors = estimate_prior_probabilities(y_train)
print(class_priors)

[0.04466, 0.95534]


In [5]:
def estimate_nucleotide_probabilities(X, y):
    """Estimate per-position nucleotide frequencies for each class.

    For every base in A, C, G, T and every class label in (1, 2), computes
    the fraction of samples of that class that have the base at each
    column of X.

    Parameters
    ----------
    X : numpy array of strings
        One row per sample; entries are compared elementwise against the
        base letters.
    y : numpy array of class labels
        One label per row of X.

    Returns
    -------
    tuple
        ``(pAcd, pCcd, pGcd, pTcd)``. Each element is a list with one
        array per class (class 1 first, then class 2) holding one
        frequency per column of X.

    Raises
    ------
    ZeroDivisionError
        Raised by ``np.average`` when a class has no samples in y, because
        its weights then sum to zero.
    """
    bases = ("A", "C", "G", "T")
    class_labels = (1, 2)
    # Weighting by the class mask restricts the average to that class's rows,
    # so each entry is the within-class frequency of the base at that column.
    return tuple(
        [
            np.average(X == base, axis=0, weights=(y == label))
            for label in class_labels
        ]
        for base in bases
    )

In [6]:
# Estimate the per-class nucleotide frequency tables from the training split.
pAcd, pCcd, pGcd, pTcd = estimate_nucleotide_probabilities(X_train, y_train)
# Print the A, C, G and T tables in that order, one table per print line.
print(pAcd, pCcd, pGcd, pTcd, sep="\n")

[array([0.18674429, 0.17913121, 0.1437528 , 0.13390058, 0.11912226,
       0.11374832, 0.10523959, 0.10076131, 0.08687864, 0.07613077,
       0.06941335, 0.08687864, 0.10120914, 0.09628303, 0.08105687,
       0.07926556, 0.23510972, 0.05015674, 0.24540976, 0.23734886,
       0.2539185 ]), array([0.26832332, 0.27719974, 0.2723219 , 0.25808613, 0.28153328,
       0.27784872, 0.27219629, 0.26087048, 0.25488308, 0.26618795,
       0.27385015, 0.26955848, 0.28913266, 0.28486193, 0.27694852,
       0.25260117, 0.27766031, 0.24981682, 0.28335462, 0.28025624,
       0.29279628])]
[array([0.28616211, 0.28526646, 0.28571429, 0.29422302, 0.31168831,
       0.31751008, 0.31034483, 0.31258397, 0.29735781, 0.28034035,
       0.30541872, 0.31840573, 0.33811017, 0.36945813, 0.38916256,
       0.33990148, 0.2955665 , 0.67845947, 0.14464845, 0.21988356,
       0.2539185 ]), array([0.22821195, 0.22961459, 0.22318756, 0.22536479, 0.21688613,
       0.22747922, 0.22278979, 0.22431804, 0.24516926, 0.2261812

In [7]:
# Scratch check of the score-matrix layout: one row per training sample
# and two columns, allocated directly in the target shape.
scores = np.zeros((len(X_train), 2))
print(scores.shape)

def calculate_score_values(X, pAcd, pCcd, pGcd, pTcd, class_priors):
    """Compute a log-score for every sample and every class.

    The score of sample i for class c is the sum, over the columns of X,
    of the log-probability of the observed base at that column under
    class c, plus the log of the class prior.

    Parameters
    ----------
    X : 2-D array of strings
        One row per sample, one base letter per column. Entries other
        than "A", "C", "G", "T" contribute nothing to the score.
    pAcd, pCcd, pGcd, pTcd : sequence of arrays
        For each class, the per-column probability of A, C, G and T
        respectively (indexed by class, then by column).
    class_priors : sequence of float
        Prior probability of each class; its length sets the number of
        score columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), len(class_priors))``; column c holds the
        scores for the class at index c of ``class_priors``.
    """
    num_classes = len(class_priors)
    # Size the result by the number of classes instead of a fixed 2, so the
    # output always has one column per entry of class_priors.
    scores = np.zeros((len(X), num_classes))
    if len(X) == 0:
        return scores

    X = np.asarray(X)
    base_tables = (("A", pAcd), ("C", pCcd), ("G", pGcd), ("T", pTcd))

    for c in range(num_classes):
        # Log-probability of the observed base at every (sample, column).
        # Positions matching none of the four bases stay at 0.
        per_position = np.zeros(X.shape)
        for base, probabilities in base_tables:
            # np.where only picks the log value where the base is observed,
            # so a zero probability for an unobserved base has no effect.
            per_position = np.where(
                X == base, np.log(probabilities[c]), per_position
            )
        scores[:, c] = per_position.sum(axis=1) + np.log(class_priors[c])
    return scores

(50000, 2)


In [8]:
# Score the training samples with the estimated nucleotide tables and priors.
scores_train = calculate_score_values(X_train, pAcd, pCcd, pGcd, pTcd, class_priors)
print(scores_train)

# Score the test samples with the same tables and priors.
scores_test = calculate_score_values(X_test, pAcd, pCcd, pGcd, pTcd, class_priors)
print(scores_test)

[[-32.29602984 -28.67631805]
 [-35.36510932 -29.06687849]
 [-33.1594779  -28.50829296]
 ...
 [-37.17901126 -29.28659414]
 [-35.6365549  -29.75138901]
 [-28.72885394 -28.68471489]]
[[-31.88852108 -28.73182527]
 [-40.83809258 -29.40573888]
 [-30.6177392  -29.98270774]
 ...
 [-38.49757139 -28.9923932 ]
 [-24.40343148 -29.115305  ]
 [-37.58089652 -28.27846954]]


In [9]:
def calculate_confusion_matrix(y, scores):
    """Cross-tabulate predicted labels against the labels in y.

    A sample is predicted as class 1 when its first score is strictly
    greater than its second score; otherwise (ties included) it is
    predicted as class 2.

    Parameters
    ----------
    y : numpy array of labels, one per row of scores
    scores : sequence of rows, each holding the class-1 score at index 0
        and the class-2 score at index 1

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.crosstab`` with the predicted labels (stored as
        floats) along the rows and the values of y along the columns.
    """
    # Float literals keep the predictions in a float array, so the row
    # labels of the table are 1.0 and 2.0.
    predictions = np.array(
        [1.0 if row[0] > row[1] else 2.0 for row in scores]
    )
    return pd.crosstab(predictions, y.T)

In [10]:
# Confusion matrix for the training split: rows are predicted labels,
# columns are the labels in y_train.
confusion_train = calculate_confusion_matrix(y_train, scores_train)
print(confusion_train)

col_0     1      2
row_0             
1.0    1489   1460
2.0     744  46307


In [11]:
# Confusion matrix for the test split: rows are predicted labels,
# columns are the labels in y_test.
confusion_test = calculate_confusion_matrix(y_test, scores_test)
print(confusion_test)

col_0     1      2
row_0             
1.0    1314   1300
2.0     686  41427
