In [231]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import tensorflow as tf
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Dropout

In [66]:
# load training dataset
with open('datasets/train.dataset.6mer.npy', 'rb') as open_file:
    df = np.load(open_file)
df = pd.DataFrame(df)

In [70]:
six_mers = pd.read_table('datasets/6mer_columns.txt', header=None)
df.columns = six_mers[:-1]
df.columns = [col[0] for col in df.columns]

In [71]:
df.head()

Unnamed: 0,AAAAAA,AAAAAT,AAAAAG,AAAAAC,AAAATA,AAAATT,AAAATG,AAAATC,AAAAGA,AAAAGT,...,CCAAGG,CCATGG,CCAGGG,CCACGG,CCTAGG,CCCAGG,CCGAGG,CCCCGG,CCGCGG,CCCGGG
0,0.00247,0.004528,0.003292,0.000823,0.003704,0.00288,0.001646,0.001646,0.001646,0.001646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001818,0.002857,0.002077,0.001558,0.003635,0.002338,0.002338,0.001039,0.001818,0.001039,...,0.0,0.000519,0.0,0.0,0.000519,0.00026,0.0,0.0,0.0,0.0
2,0.003702,0.003084,0.001234,0.001851,0.002468,0.003084,0.003084,0.001851,0.001234,0.000617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001102,0.002756,0.003584,0.001378,0.003307,0.002481,0.002481,0.001102,0.001654,0.001378,...,0.000276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.004318,0.003534,0.002748,0.001701,0.003534,0.002356,0.002224,0.002617,0.003271,0.00144,...,0.000262,0.0,0.0,0.0,0.000262,0.000131,0.0,0.0,0.0,0.0


In [72]:
labels = pd.read_csv('datasets/train_labels.csv')
df['genome_label'] = labels
df.head()

Unnamed: 0,AAAAAA,AAAAAT,AAAAAG,AAAAAC,AAAATA,AAAATT,AAAATG,AAAATC,AAAAGA,AAAAGT,...,CCATGG,CCAGGG,CCACGG,CCTAGG,CCCAGG,CCGAGG,CCCCGG,CCGCGG,CCCGGG,genome_label
0,0.00247,0.004528,0.003292,0.000823,0.003704,0.00288,0.001646,0.001646,0.001646,0.001646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,staphylococcus_aureus
1,0.001818,0.002857,0.002077,0.001558,0.003635,0.002338,0.002338,0.001039,0.001818,0.001039,...,0.000519,0.0,0.0,0.000519,0.00026,0.0,0.0,0.0,0.0,staphylococcus_aureus
2,0.003702,0.003084,0.001234,0.001851,0.002468,0.003084,0.003084,0.001851,0.001234,0.000617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,staphylococcus_aureus
3,0.001102,0.002756,0.003584,0.001378,0.003307,0.002481,0.002481,0.001102,0.001654,0.001378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,staphylococcus_aureus
4,0.004318,0.003534,0.002748,0.001701,0.003534,0.002356,0.002224,0.002617,0.003271,0.00144,...,0.0,0.0,0.0,0.000262,0.000131,0.0,0.0,0.0,0.0,staphylococcus_aureus


In [None]:
df['genome_label'].value_counts()

In [None]:
df['genome_label'].value_counts().sum()

In [None]:
len(df['genome_label'].unique())

In [None]:
df[df.isna().any(axis=1)]

There are 30 different pathogens and one decoy label. There are 2080 features and 505536 samples, as well as no missing values, in the training set.

In [73]:
# subsample with random seed for reproducibility
sample_size = df.genome_label.value_counts().to_frame().min().values[0]
df_train = df.groupby('genome_label').apply(lambda x: x.sample(sample_size, random_state=4))
# df_train = df # use all the training samples

In [None]:
# eda
pd.plotting.scatter_matrix(df_train, figsize=[8, 8], s=150, marker='D')
# sns.pairplot(df_train)
# plt.show()

Feature selection: Remove highly correlated features

In [7]:
df_train.corr().abs()

  df_train.corr().abs()


Unnamed: 0,AAAAAT,AAAAAG,AAAAAC,AAAATA,AAAATT,AAAATG,AAAATC,AAAAGA,AAAAGT,AAAAGG,...,CCATGG,CCAGGG,CCACGG,CCTAGG,CCCAGG,CCGAGG,CCCCGG,CCGCGG,CCCGGG,IGNORE
AAAAAT,1.000000,0.769758,0.664791,0.581132,0.680353,0.678322,0.533906,0.502745,0.618094,0.550115,...,0.124568,0.167085,0.271391,0.390979,0.046168,0.181678,0.358870,0.336093,0.352248,0.240443
AAAAAG,0.769758,1.000000,0.659485,0.557788,0.848764,0.844474,0.684899,0.658493,0.670451,0.619597,...,0.164832,0.199750,0.348194,0.446284,0.002650,0.334705,0.423970,0.396722,0.407136,0.306171
AAAAAC,0.664791,0.659485,1.000000,0.521356,0.609439,0.612794,0.544853,0.534992,0.760886,0.671049,...,0.123991,0.196805,0.365378,0.439851,0.017494,0.356154,0.439575,0.410603,0.421185,0.325031
AAAATA,0.581132,0.557788,0.521356,1.000000,0.507518,0.510143,0.472369,0.499721,0.485565,0.448622,...,0.130830,0.179204,0.346977,0.377693,0.002741,0.341641,0.416508,0.358177,0.405856,0.297802
AAAATT,0.680353,0.848764,0.609439,0.507518,1.000000,0.728463,0.580295,0.529986,0.648512,0.605940,...,0.176256,0.198680,0.333347,0.437933,0.000213,0.320852,0.411653,0.385005,0.391793,0.298320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CCGAGG,0.181678,0.334705,0.356154,0.341641,0.320852,0.302869,0.352116,0.375128,0.315892,0.314651,...,0.125222,0.214050,0.585631,0.347284,0.047251,1.000000,0.432100,0.375942,0.353094,0.325415
CCCCGG,0.358870,0.423970,0.439575,0.416508,0.411653,0.376278,0.445111,0.415643,0.408173,0.389600,...,0.144515,0.202064,0.474835,0.415690,0.011588,0.432100,1.000000,0.451809,0.512733,0.373638
CCGCGG,0.336093,0.396722,0.410603,0.358177,0.385005,0.355296,0.400295,0.366193,0.392626,0.365887,...,0.047112,0.153317,0.410235,0.405766,0.049242,0.375942,0.451809,1.000000,0.501926,0.543237
CCCGGG,0.352248,0.407136,0.421185,0.405856,0.391793,0.354968,0.423543,0.408663,0.394824,0.371658,...,0.057133,0.144192,0.367922,0.402518,0.049057,0.353094,0.512733,0.501926,1.000000,0.434817


In [74]:
mask = np.triu(np.ones_like(df_train.corr(), dtype=bool)) # remove duplicate upper triangle correlation values
corr_matrix = df_train.corr().abs() # create positive correlation matrix
tri_df = corr_matrix.mask(mask) # create and apply upper triangle mask

  mask = np.triu(np.ones_like(df_train.corr(), dtype=bool)) # remove duplicate upper triangle correlation values
  corr_matrix = df_train.corr().abs() # create positive correlation matrix


In [75]:
to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.85)]
len(to_drop)

15

In [76]:
df_train.drop(columns=to_drop, inplace=True)

Remove features with high intraclass variance and low interclass variance

In [33]:
# interclass variance
df_train.groupby('genome_label').apply(lambda x: x.median()).var() # can try with mean afterwards

AAAAAT    3.516674e-06
AAAAAC    1.132488e-06
AAAATA    5.960464e-07
AAAATT    2.086163e-06
AAAATG    2.145767e-06
              ...     
CCGAGG    1.788139e-07
CCCCGG    2.384186e-07
CCGCGG    1.192093e-07
CCCGGG    7.152557e-07
IGNORE    1.192093e-07
Length: 2065, dtype: float16

In [34]:
# intraclass variance
df_train.groupby('genome_label').apply(lambda x: x.var()).median()

AAAAAT    1.668930e-06
AAAAAC    6.556511e-07
AAAATA    6.556511e-07
AAAATT    1.132488e-06
AAAATG    5.364418e-07
              ...     
CCGAGG    1.192093e-07
CCCCGG    1.192093e-07
CCGCGG    1.192093e-07
CCCGGG    1.788139e-07
IGNORE    1.192093e-07
Length: 2065, dtype: float16

In [105]:
df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,AAAAAA,AAAATT,AAAAGA,AAATTA,AAATTT,AATTAA,AATTTA,ATAAAA,ATATAA,ATACTA,...,CCGACG,CGCCCG,CCGCCG,CGGCCG,CCAGCG,CCCGCG,CGCGCG,CCGGCG,CCGCGG,genome_label
genome_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
acinetobacter_baumannii,50199,0.001087,0.000362,0.000362,0.000725,0.001450,0.000725,0.001087,0.000725,0.000362,0.000000,...,0.000000,0.000000,0.000362,0.000000,0.000725,0.000000,0.000000,0.000000,0.000000,acinetobacter_baumannii
acinetobacter_baumannii,49240,0.002443,0.003197,0.002068,0.001880,0.002632,0.002632,0.002256,0.002821,0.001128,0.000376,...,0.000000,0.000000,0.000188,0.000000,0.000376,0.000000,0.000000,0.000000,0.000000,acinetobacter_baumannii
acinetobacter_baumannii,49495,0.006226,0.004395,0.003296,0.001831,0.003662,0.001465,0.001465,0.002563,0.000000,0.000366,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000732,0.000000,0.000000,acinetobacter_baumannii
acinetobacter_baumannii,49212,0.003307,0.003967,0.001323,0.001984,0.004631,0.002316,0.002975,0.004631,0.000661,0.000331,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,acinetobacter_baumannii
acinetobacter_baumannii,49659,0.000734,0.003304,0.001835,0.002386,0.002569,0.001652,0.001468,0.000734,0.000734,0.000551,...,0.000000,0.000000,0.000000,0.000367,0.000000,0.000000,0.000000,0.000183,0.000000,acinetobacter_baumannii
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yersinia_enterocolitica,74378,0.005455,0.000000,0.000909,0.001818,0.000000,0.000909,0.001818,0.002728,0.000909,0.001818,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,yersinia_enterocolitica
yersinia_enterocolitica,74567,0.000812,0.002436,0.001623,0.000000,0.000000,0.000000,0.000000,0.000812,0.000000,0.000000,...,0.000812,0.000812,0.000000,0.000000,0.001623,0.000000,0.001623,0.000812,0.000000,yersinia_enterocolitica
yersinia_enterocolitica,73280,0.000543,0.001221,0.000543,0.001356,0.001899,0.000407,0.001763,0.001763,0.000949,0.000136,...,0.000136,0.000678,0.001221,0.000271,0.000949,0.000407,0.000271,0.000678,0.000543,yersinia_enterocolitica
yersinia_enterocolitica,73370,0.002220,0.001110,0.001110,0.001110,0.003330,0.002220,0.001110,0.002775,0.000555,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.001110,0.000000,0.000000,0.000555,0.001110,yersinia_enterocolitica


In [78]:
# calculate F-score = intergroup variance / intragroup variance
inter_var = df_train.groupby('genome_label').apply(lambda x: x.median()).var()
intra_var = df_train.groupby('genome_label').apply(lambda x: x.var()).median()
f_score = inter_var / intra_var

In [79]:
# keep top 100 features with the highest F-score
f_score.sort_values(ascending=False, inplace=True)

In [291]:
f_score.iloc[100:]

CTTTTA    2.0
AGGCGC    2.0
CGACCG    2.0
CGCTGC    2.0
CCGAGG    2.0
         ... 
AACGAG    0.0
GTGCTA    0.0
CTAGTC    NaN
GGACCC    NaN
CCCTAG    NaN
Length: 1965, dtype: float16

In [81]:
df_train.drop(columns=f_score.iloc[100:].index, inplace=True)

Label processing: Encode categorical labels

In [104]:
labels = []
for name, group in df_train.groupby('genome_label'):
    labels += [name] * 910
df_train['genome_label'] = labels

In [106]:
df_train = pd.get_dummies(df_train, columns=['genome_label'], prefix='', prefix_sep='')
df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,AAAAAA,AAAATT,AAAAGA,AAATTA,AAATTT,AATTAA,AATTTA,ATAAAA,ATATAA,ATACTA,...,staphylococcus_pseudintermedius,staphylococcus_pyogenes,stenotrophomonas_maltophilia,streptococcus_agalactiae,streptococcus_equi,streptococcus_pneumoniae,streptococcus_suis,vibrio_cholerae,vibrio_parahaemolyticus,yersinia_enterocolitica
genome_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
acinetobacter_baumannii,50199,0.001087,0.000362,0.000362,0.000725,0.001450,0.000725,0.001087,0.000725,0.000362,0.000000,...,0,0,0,0,0,0,0,0,0,0
acinetobacter_baumannii,49240,0.002443,0.003197,0.002068,0.001880,0.002632,0.002632,0.002256,0.002821,0.001128,0.000376,...,0,0,0,0,0,0,0,0,0,0
acinetobacter_baumannii,49495,0.006226,0.004395,0.003296,0.001831,0.003662,0.001465,0.001465,0.002563,0.000000,0.000366,...,0,0,0,0,0,0,0,0,0,0
acinetobacter_baumannii,49212,0.003307,0.003967,0.001323,0.001984,0.004631,0.002316,0.002975,0.004631,0.000661,0.000331,...,0,0,0,0,0,0,0,0,0,0
acinetobacter_baumannii,49659,0.000734,0.003304,0.001835,0.002386,0.002569,0.001652,0.001468,0.000734,0.000734,0.000551,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yersinia_enterocolitica,74378,0.005455,0.000000,0.000909,0.001818,0.000000,0.000909,0.001818,0.002728,0.000909,0.001818,...,0,0,0,0,0,0,0,0,0,1
yersinia_enterocolitica,74567,0.000812,0.002436,0.001623,0.000000,0.000000,0.000000,0.000000,0.000812,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,1
yersinia_enterocolitica,73280,0.000543,0.001221,0.000543,0.001356,0.001899,0.000407,0.001763,0.001763,0.000949,0.000136,...,0,0,0,0,0,0,0,0,0,1
yersinia_enterocolitica,73370,0.002220,0.001110,0.001110,0.001110,0.003330,0.002220,0.001110,0.002775,0.000555,0.000000,...,0,0,0,0,0,0,0,0,0,1


In [113]:
df_train.iloc[:, :100].values

array([[0.001087 , 0.0003624, 0.0003624, ..., 0.       , 0.       ,
        0.       ],
       [0.002443 , 0.003197 , 0.002068 , ..., 0.       , 0.       ,
        0.       ],
       [0.006226 , 0.004395 , 0.003296 , ..., 0.0007324, 0.       ,
        0.       ],
       ...,
       [0.0005426, 0.001221 , 0.0005426, ..., 0.0002713, 0.000678 ,
        0.0005426],
       [0.00222  , 0.00111  , 0.00111  , ..., 0.       , 0.000555 ,
        0.00111  ],
       [0.001229 , 0.0008783, 0.0003512, ..., 0.0003512, 0.0001756,
        0.0003512]], dtype=float16)

Dimensionality reduction using PCA (We can try UMAP?) - don't use PCA

In [11]:
pca = PCA(n_components=50, svd_solver='full')
pca.fit_transform(df_train.iloc[:, :2024].values)


array([[-0.004815  , -0.00411713, -0.0013386 , ...,  0.0005676 ,
        -0.00022195, -0.00050624],
       [-0.01187492,  0.00110101,  0.00152402, ...,  0.00109905,
         0.00027716, -0.00052278],
       [-0.01218702,  0.00010893, -0.00149988, ..., -0.00120418,
        -0.00022736, -0.00039659],
       ...,
       [-0.00396328, -0.00249141,  0.00054212, ...,  0.00051752,
        -0.00050016,  0.00052863],
       [-0.00106742, -0.0036179 , -0.00099751, ...,  0.00051922,
        -0.00064791,  0.00019041],
       [ 0.00600982, -0.00499511, -0.0002203 , ..., -0.00033927,
         0.00099336,  0.00226937]])

In [12]:
X_train = pca.fit_transform(df_train.iloc[:, :2024].values) # filter by explained variance ratio?
y_train = df_train.iloc[:, 2025:].values
X_train.shape, y_train.shape

((6200, 50), (6200, 31))

Prepare training set

In [111]:
X_train = df_train.iloc[:, :100].values
y_train = df_train.iloc[:, 100:].values
X_train.shape, y_train.shape

((28210, 100), (28210, 31))

Use neural network as model

In [229]:
# set random seed for reproducibility
np.random.seed(4220)
tf.random.set_seed(4220)

In [277]:
# define the model
n_inputs = 100 # no. of features
n_outputs = 31 # no. of classes
model = Sequential()

# input layer
model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))

# dropout regularisation
# model.add(Dropout(rate=0.5))

# hidden layer
model.add(Dense(32, activation='relu')) # units represents no. of neurons in layer, more units can increase capacity but risks overfitting

# output layer
model.add(Dense(n_outputs, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')

# updated neural network architecture
model.summary()

Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_82 (Dense)            (None, 20)                2020      
                                                                 
 dense_83 (Dense)            (None, 32)                672       
                                                                 
 dense_84 (Dense)            (None, 31)                1023      
                                                                 
Total params: 3,715
Trainable params: 3,715
Non-trainable params: 0
_________________________________________________________________


In [278]:
# fit model
model.fit(X_train, y_train, verbose=0, epochs=100) # do cross-validation to optimise
model.save('model.h5')

In [130]:
def precision_per_patient(patient_id, preds):
    df_true = pd.read_csv(f'datasets/validation/patient{patient_id}_labels.txt')
    tp, fp, tp_labels = 0, 0, df_true['true_label'].shape[0]
    print(f'my prediction(s) for patient {patient_id}: {preds}')
    truth = df_true['true_label'].values
    print(f'true pathogen(s): {truth}')
    #if don't predict any pathogen, it means there is only decoy in the test dataset (your prediction)
    if len(preds) == 0:
        preds = ['decoy']
    for item in np.unique(preds):
        if item in df_true['true_label'].values:
            tp+=1
        else:
            fp+=1
    #you have to predict all labels correctly, but you are penalized for any false positive
    return tp/(tp_labels+fp)

In [279]:
# load trained model
model = load_model('model.h5')

In [126]:
feat = df_train.columns[:100].values

In [280]:
threshold = 0.99
classes = df['genome_label'].unique()

all_precision = []
for patient_id in range(1, 11):
    print(f'predicting for patient {patient_id}')
    
    with open(f'datasets/validation/patient{patient_id}.6mer.npy', 'rb') as read_file:
        df_test = np.load(read_file)
    df_test = pd.DataFrame(df_test)
    df_test.columns = six_mers[:-1]
    df_test.columns = [col[0] for col in df_test.columns]
    X_test = df_test.loc[:, feat].values
        
    # predict test set
    y_pred = model.predict(X_test) # each column is the probability that the row is a particular pathogen

    final_predictions = [classes[lab] for lab in np.unique([np.argmax(item) for item in y_pred if np.max(item) >= threshold])]
    
    # my pathogens detected, decoy will be ignored
    final_predictions = [item for item in final_predictions if item !='decoy']
    
    precision = precision_per_patient(patient_id, final_predictions)
    print(f'precision: {precision}')
    all_precision.append(precision)

predicting for patient 1
my prediction(s) for patient 1: ['mycobacterium_ulcerans', 'corynebacterium_ulcerans', 'mycobacterium_tuberculosis', 'clostridioides_difficile', 'streptococcus_suis']
true pathogen(s): ['staphylococcus_aureus']
precision: 0.0
predicting for patient 2
my prediction(s) for patient 2: ['staphylococcus_pyogenes', 'mycobacterium_ulcerans', 'mycobacterium_tuberculosis', 'streptococcus_suis']
true pathogen(s): ['staphylococcus_pyogenes']
precision: 0.25
predicting for patient 3
my prediction(s) for patient 3: ['staphylococcus_pyogenes', 'mycobacterium_ulcerans', 'corynebacterium_ulcerans', 'mycobacterium_tuberculosis', 'streptococcus_suis']
true pathogen(s): ['burkholderia_pseudomallei' 'corynebacterium_ulcerans']
precision: 0.16666666666666666
predicting for patient 4
my prediction(s) for patient 4: ['corynebacterium_ulcerans', 'mycobacterium_tuberculosis', 'clostridioides_difficile', 'streptococcus_suis']
true pathogen(s): ['pseudomonas_aeruginosa']
precision: 0.0
p

In [281]:
# performance per patient and its final average
print([f'patient {c}: {item}' for c, item in enumerate(all_precision, start=1)])
print(f'avg: {np.mean(all_precision)}')

['patient 1: 0.0', 'patient 2: 0.25', 'patient 3: 0.16666666666666666', 'patient 4: 0.0', 'patient 5: 0.0', 'patient 6: 0.0', 'patient 7: 0.0', 'patient 8: 0.16666666666666666', 'patient 9: 0.0', 'patient 10: 0.0']
avg: 0.05833333333333333
