In [None]:
from humpsClassifier import HumpsClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import scipy.signal as scisignal
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
import math, time

In [None]:
# Classifier helper whose create_rf_upsample_feature_matrix() drives the simulation.
clf_maker = HumpsClassifier()

# One-letter amino-acid codes in a fixed order, plus the reverse lookup
# (code -> position in that order).
index_to_aa = list('CSAGTVNQMILYWFPHRKDE')
aa_to_index = dict(zip(index_to_aa, range(len(index_to_aa))))

#### Accuracy metrics

In [None]:
def accuracy_without_repeats(y_test, test_pred_proba, classes):
    """Fraction of samples whose top-scoring class equals the true label.

    Parameters
    ----------
    y_test : sequence
        True label of every test sample.
    test_pred_proba : sequence of array-like
        ``test_pred_proba[i]`` holds one score per class for sample ``i``.
    classes : sequence
        ``classes[k]`` is the label that column ``k`` of a score row stands for.

    Returns
    -------
    float
        Number of correctly predicted samples divided by ``len(y_test)``.
    """
    hits = sum(
        truth == classes[np.argmax(scores)]
        for truth, scores in zip(y_test, test_pred_proba)
    )
    return hits / len(y_test)

def _accuracy_with_n_repeats(y_test, test_pred_proba, classes, n_repeats):
    """Accuracy when every prediction is the average of ``n_repeats`` reads.

    For each distinct label in ``y_test`` the samples carrying that label are
    taken in order of appearance and split into consecutive, non-overlapping
    groups of ``n_repeats``. The score rows of a group are averaged and the
    class with the highest mean score is compared with the label. Samples left
    over after the last complete group of a label are ignored.

    Parameters
    ----------
    y_test : array-like
        True label of every test sample.
    test_pred_proba : sequence of array-like
        ``test_pred_proba[i]`` holds one score per class for sample ``i``.
    classes : sequence
        ``classes[k]`` is the label that column ``k`` of a score row stands for.
    n_repeats : int
        Number of reads averaged into a single prediction.

    Returns
    -------
    float
        Fraction of complete groups whose averaged prediction equals the label.

    Raises
    ------
    AssertionError
        If no label has at least ``n_repeats`` samples, i.e. nothing was scored.
    """
    # A plain list would make ``y_test == val`` a scalar comparison, so
    # normalise to an array before looking up the positions of each label.
    y_test = np.asarray(y_test)
    correct = 0
    test_cnt = 0
    for val in np.unique(y_test):
        y_indices = np.where(y_test == val)[0]
        # Only complete groups are scored; the trailing remainder is dropped.
        n_grouped = len(y_indices) - len(y_indices) % n_repeats
        for start in range(0, n_grouped, n_repeats):
            group = y_indices[start:start + n_repeats]
            # Accumulate row by row, left to right, then divide once.
            summed = np.asarray(test_pred_proba[group[0]])
            for idx in group[1:]:
                summed = summed + np.asarray(test_pred_proba[idx])
            correct += val == classes[np.argmax(summed / n_repeats)]
            test_cnt += 1
    assert test_cnt != 0, f"no label has {n_repeats} or more test samples"
    return correct / test_cnt


def accuracy_with_2_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 2 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 2)


def accuracy_with_3_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 3 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 3)


def accuracy_with_4_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 4 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 4)


def accuracy_with_5_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 5 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 5)


def accuracy_with_6_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 6 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 6)


def accuracy_with_7_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 7 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 7)


def accuracy_with_8_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 8 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 8)


def accuracy_with_9_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 9 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 9)


def accuracy_with_10_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 10 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 10)


def accuracy_with_11_repeats(y_test, test_pred_proba, classes):
    """Accuracy when predictions are averaged over groups of 11 same-label reads."""
    return _accuracy_with_n_repeats(y_test, test_pred_proba, classes, 11)

#### Amino acid sets to test. These were determined by non-exhaustive tests of which sets give the best accuracy, starting from the known best pairwise discrimination accuracies.

In [None]:
# Amino-acid set to test for each set size: the key is the number of amino
# acids, the value holds their one-letter codes.
best_teplets = {
    2: tuple('YD'),
    3: tuple('GYD'),
    4: tuple('AWRD'),
    5: tuple('GVWRD'),
    6: tuple('CGLYRD'),
    7: tuple('GQWFRDE'),
    8: tuple('ATIYWRDE'),
    9: tuple('GVNLYWRDE'),
    10: tuple('AGVNYWFRDE'),
    11: tuple('CAGVNYWFRDE'),
    12: tuple('CAGTVQIYWRDE'),
    13: tuple('AGTVNQMYWFRDE'),
    14: tuple('CAGTVNQMYWFRDE'),
    15: tuple('CSGTVNQMYWFRKDE'),
    16: tuple('CAGTVNQMLYWFRKDE'),
    17: tuple('CSAGTVNQMIYWFHRDE'),
    18: tuple('CSAGTNMILYWFPHRKDE'),
    19: tuple('CSAGTVNQMILYWFPRKDE'),
    # The full 20-letter set is a list rather than a tuple, unlike the others.
    20: list('CSAGTVNQMILYWFPHRKDE'),
}



#### Perform the simulation (~90 min) & save results

In [None]:
# One metric per number of reads averaged together; column j of `accuracies`
# is the accuracy under metrics[j].
metrics = [accuracy_without_repeats, accuracy_with_2_repeats, accuracy_with_3_repeats, accuracy_with_4_repeats,
           accuracy_with_5_repeats, accuracy_with_6_repeats, accuracy_with_7_repeats, accuracy_with_8_repeats,
           accuracy_with_9_repeats, accuracy_with_10_repeats, accuracy_with_11_repeats]

# One row per amino-acid set. The keys of best_teplets start at 2, so the set
# with `i` amino acids is stored in row i - 2.
accuracies = np.zeros(shape=(len(best_teplets), len(metrics)))
s = time.time()
for i, acids in best_teplets.items():
    for j, metric in enumerate(metrics):
        accuracies[i - 2, j] = clf_maker.create_rf_upsample_feature_matrix(
            acids, use_proba=True, n_repeats=100, needs_classes=True,
            include_matrix=True, accuracy_metric=metric,
            classifiers=[RandomForestClassifier(n_estimators=200)])[0]
        # Checkpoint after every result so an interrupted run keeps its progress.
        # Passing the path lets np.save open and close the file itself instead
        # of leaving one unclosed handle behind per iteration.
        np.save('rereads_acc.npy', accuracies)
    e = time.time()
    print(f"{i} acids, {e-s} time")
    s = e

# Draw on an explicitly created Axes instead of plotting through pyplot's
# implicit state and fetching the axes afterwards with plt.subplot(111).
fig, ax = plt.subplots()

# Column j of `accuracies` is plotted at x = j; one line per row.
reread_ticks = list(range(len(accuracies[0])))
for row in accuracies:
    ax.plot(reread_ticks, [r*100 for r in row])

# Row k is labelled k + 2 (the smallest amino-acid set has 2 members).
ax.legend([str(i+2) for i in range(len(accuracies))], title="Number of amino acids", bbox_to_anchor=(1, 1.1))
ax.set_xticks(reread_ticks)
ax.set_ylabel('Accuracy (%)')
ax.set_xlabel("Number of rereads")
ax.set_ylim([0,100])
ax.set_title("Accuracy with rereads")
ax.spines[['right', 'top']].set_visible(False)
