In [12]:
import numpy as np
import pandas as pd
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import sklearn
from sklearn.metrics import precision_score, accuracy_score
from tqdm import trange
import pickle

In [13]:
def same_seed(seed):
    '''Seed the NumPy and PyTorch generators and pin the cuDNN flags.

    Args:
        seed: Value forwarded to ``np.random.seed``, ``torch.manual_seed``
            and, when CUDA is available, ``torch.cuda.manual_seed_all``.
    '''
    # Seed every generator this helper is responsible for.
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # cuDNN: benchmark mode off, deterministic algorithms on.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

def amino_encode_table_6():
    '''Build the residue -> six standardised descriptors lookup table.

    Reads the space-separated file ``./6-pc`` (first column used as the
    index) and z-scores the columns H1, V, P1, Pl, PKa and NCI with the
    sample standard deviation (``ddof=1``).

    Rows are matched to residues purely by position, so the file is assumed
    to list the 20 amino acids in the order A, C, D, ..., W, Y -- TODO
    confirm against the data file.

    Returns:
        dict: one-letter code -> list of six ``np.float32`` values, plus the
        padding symbol ``'X'`` mapped to six integer zeros.
    '''
    raw = pd.read_csv('./6-pc', sep=' ', index_col=0)

    def zscore(column):
        # Standardise one descriptor column: (x - mean) / sample std.
        values = raw[column]
        return (values - np.mean(values)) / (np.std(values, ddof=1))

    descriptor_names = ('H1', 'V', 'P1', 'Pl', 'PKa', 'NCI')
    # Shape (6, n_rows): one row per descriptor, one column per file row.
    scaled = np.array([zscore(name) for name in descriptor_names], dtype=np.float32)

    residues = 'ACDEFGHIKLMNPQRSTVWY'
    lookup = {
        residue: list(scaled[0:6, position])
        for position, residue in enumerate(residues)
    }
    # Padding symbol contributes an all-zero descriptor vector.
    lookup['X'] = [0, 0, 0, 0, 0, 0]
    return lookup

# Module-level lookup consumed by seq_to_features and seq_to_features_ml.
# Building it reads ./6-pc, so this line raises if that file is not present.
table = amino_encode_table_6()

def padding_seq(original_seq, length=50, pad_value='X'):
    '''Right-pad ``original_seq`` with ``pad_value`` up to ``length`` characters.

    A sequence that is already ``length`` characters or longer is returned
    unchanged, because ``str.ljust`` never truncates.
    '''
    return original_seq.ljust(length, pad_value)

def seq_to_features(seq):
    '''Encode a sequence as a tensor with one row of descriptors per residue.

    Every character of ``seq`` is looked up in the module-level ``table``;
    a character that is not a key of ``table`` raises ``KeyError``.
    '''
    return torch.Tensor([table[residue] for residue in seq])

def seq_to_features_ml(seq, conc):
    '''Flatten a sequence into a single float32 vector.

    For each residue the descriptor values from the module-level ``table``
    are emitted, immediately followed by ``conc``; the per-residue groups are
    concatenated in sequence order. ``table`` itself is not modified.

    Returns:
        np.ndarray: 1-D array of dtype ``float32``.
    '''
    flat = [
        value
        for residue in seq
        for value in (*table[residue], conc)
    ]
    return np.array(flat, dtype=np.float32)

# Passed as random_state to the SVC, random-forest, AdaBoost, MLP and XGBoost
# classifiers built in the model-selection cell.
seed = 10902128+8403014


In [14]:
from itertools import chain, combinations

import xgboost as xgb
from sklearn import model_selection
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Width of one flattened sample: 50 padded residues (padding_seq default) x
# (6 descriptors + the concentration value appended by seq_to_features_ml).
N_FEATURES = 50 * 7


def powerset(s :list):
    '''Return an iterator over every subset of ``s`` (as tuples), empty tuple first.'''
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))


def build_feature_matrix(frame):
    '''Turn the ``sequence`` / ``concentration`` columns into a 2-D feature matrix.

    Args:
        frame: DataFrame with a ``sequence`` column (strings) and a
            ``concentration`` column.

    Returns:
        np.ndarray: float64 array of shape ``(len(frame), N_FEATURES)``, one
        row per input row in the original order.

    Raises:
        ValueError: if an encoded row does not have exactly ``N_FEATURES``
            entries (e.g. a sequence longer than the padding length).
    '''
    # Preallocate once instead of np.vstack-ing row by row (which re-copies
    # the whole matrix on every iteration). float64 matches what stacking
    # onto an empty float64 array produced before.
    matrix = np.empty((len(frame), N_FEATURES), dtype=np.float64)
    pairs = zip(frame['sequence'], frame['concentration'])
    for row_idx, (sequence, concentration) in enumerate(pairs):
        matrix[row_idx] = seq_to_features_ml(padding_seq(sequence), concentration)
    return matrix


# For every threshold: fit a VotingClassifier on each non-empty subset of the
# six base classifiers, keep the subset with the highest held-out accuracy and
# pickle that fitted ensemble to '<threshold>_clf.pickle'.
for threshold in trange(10, 100, 10):
    df = pd.read_parquet(f'./train{threshold}.parquet')
    Y = df['label'].to_numpy(dtype=int)
    X = build_feature_matrix(df)

    # NOTE: the held-out data is the *valid* split, although the variables
    # are named test_*; the accuracy printed below is the one the subset was
    # selected on.
    test_df = pd.read_parquet(f'./valid{threshold}.parquet')
    test_Y = test_df['label'].to_numpy(dtype=int)
    test_X = build_feature_matrix(test_df)

    # Fresh, identically seeded base estimators for every threshold.
    svc_clf = svm.SVC(random_state=seed)
    rf_clf = RandomForestClassifier(random_state=seed)
    ada_clf = AdaBoostClassifier(random_state=seed)
    mlp_clf = MLPClassifier(max_iter=5000, random_state=seed)
    knn_clf = KNeighborsClassifier()
    xgb_clf = xgb.XGBClassifier(random_state=seed)

    max_acc = -math.inf
    # Precision of the best-accuracy ensemble (not the maximum precision seen).
    max_prec = -math.inf
    best_choices = []
    all_estimators = [('svc',svc_clf), ('rf',rf_clf), ('ada',ada_clf), ('mlp',mlp_clf), ('knn',knn_clf), ('xgb',xgb_clf)]
    # [1:] drops the empty subset, which powerset yields first.
    for estimators in list(powerset(all_estimators))[1:]:
        estimators = list(estimators)
        ensemble = VotingClassifier(estimators)
        ensemble.fit(X,Y)
        ensembled = ensemble.predict(test_X)
        acc = accuracy_score(test_Y, ensembled)
        # Strict '>' keeps the first subset that reaches a given accuracy.
        if acc > max_acc:
            max_acc = acc
            max_prec = precision_score(test_Y, ensembled)
            best_choices = estimators
            # Context manager guarantees the pickle file is flushed and closed.
            with open(f'{threshold}_clf.pickle', 'wb') as model_file:
                pickle.dump(ensemble, model_file)

    print(f'Threshold {threshold} Best acc: {max_acc}, prec: {max_prec}')
    print([x[0] for x in best_choices])

 11%|█         | 1/9 [03:03<24:29, 183.69s/it]

Threshold 10 Best acc: 0.8431372549019608, prec: 0.8548387096774194
['rf', 'ada', 'xgb']


 22%|██▏       | 2/9 [05:53<20:28, 175.55s/it]

Threshold 20 Best acc: 0.8072289156626506, prec: 0.8347826086956521
['rf']


 33%|███▎      | 3/9 [09:28<19:20, 193.36s/it]

Threshold 30 Best acc: 0.8434782608695652, prec: 0.8434782608695652
['svc', 'rf', 'knn', 'xgb']


 44%|████▍     | 4/9 [12:18<15:21, 184.23s/it]

Threshold 40 Best acc: 0.8136363636363636, prec: 0.8285714285714286
['rf']


 56%|█████▌    | 5/9 [13:10<09:06, 136.54s/it]

Threshold 50 Best acc: 0.8387096774193549, prec: 0.8888888888888888
['rf', 'xgb']


 67%|██████▋   | 6/9 [14:06<05:27, 109.21s/it]

Threshold 60 Best acc: 0.7307692307692307, prec: 0.7142857142857143
['rf', 'ada', 'knn']


 78%|███████▊  | 7/9 [14:47<02:53, 86.96s/it] 

Threshold 70 Best acc: 0.8222222222222222, prec: 0.8260869565217391
['svc', 'rf', 'ada', 'knn']


 89%|████████▉ | 8/9 [15:31<01:13, 73.28s/it]

Threshold 80 Best acc: 0.6486486486486487, prec: 0.6875
['rf', 'ada', 'xgb']


100%|██████████| 9/9 [16:05<00:00, 107.28s/it]

Threshold 90 Best acc: 0.7407407407407407, prec: 0.7058823529411765
['ada', 'knn', 'xgb']





In [15]:
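# NOTE(review): these "final" figures differ from the In [14] output above for
# thresholds 10, 30, 60, 80 and 90 -- presumably from a different run; confirm provenance.
# threshold=10: final acc: 0.8196078431372549,  prec: 0.8660714285714286    (rf, xgb)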
# threshold=20: final acc: 0.8072289156626506,  prec: 0.8347826086956521    (rf)
# threshold=30: final acc: 0.8347826086956521,  prec: 0.8130081300813008    (svm, rf, xgb)
# threshold=40: final acc: 0.8136363636363636,  prec: 0.8285714285714286    (rf)
# threshold=50: final acc: 0.8387096774193549,  prec: 0.8888888888888888    (rf, xgb)
# threshold=60: final acc: 0.7115384615384616,  prec: 0.72                  (rf, xgb)
# threshold=70: final acc: 0.8222222222222222,  prec: 0.8260869565217391    (svm, knn, rf, ada)
# threshold=80: final acc: 0.8095238095238095,  prec: 0.8095238095238095    (All without mlp, svm)
# threshold=90: final acc: 0.8387096774193549,  prec: 0.9166666666666666    (Only xgb, ada)

In [None]:
# Copy of the results printed by the model-selection cell (In [14]) above.
# Threshold 10 Best acc: 0.8431372549019608, prec: 0.8548387096774194
# ['rf', 'ada', 'xgb']
# Threshold 20 Best acc: 0.8072289156626506, prec: 0.8347826086956521
# ['rf']
# Threshold 30 Best acc: 0.8434782608695652, prec: 0.8434782608695652
# ['svc', 'rf', 'knn', 'xgb']
# Threshold 40 Best acc: 0.8136363636363636, prec: 0.8285714285714286
# ['rf']
# Threshold 50 Best acc: 0.8387096774193549, prec: 0.8888888888888888
# ['rf', 'xgb']
# Threshold 60 Best acc: 0.7307692307692307, prec: 0.7142857142857143
# ['rf', 'ada', 'knn']
# Threshold 70 Best acc: 0.8222222222222222, prec: 0.8260869565217391
# ['svc', 'rf', 'ada', 'knn']
# Threshold 80 Best acc: 0.6486486486486487, prec: 0.6875
# ['rf', 'ada', 'xgb']
# Threshold 90 Best acc: 0.7407407407407407, prec: 0.7058823529411765
# ['ada', 'knn', 'xgb']