In [None]:
import os
import timeit
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from keras import metrics
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import ModelCheckpoint
from numpy.random import seed
seed(1)
from tensorflow.random import set_seed
set_seed(2)

# part 1

In [None]:
# Read the 6-mer training matrix from its .npy file and wrap it in a DataFrame.
df = pd.DataFrame(np.load('datasets/train.dataset.6mer.npy'))

In [None]:
# Label the feature columns with their 6-mer strings.
six_mers = pd.read_table('datasets/6mer_columns.txt', header=None)
# Pick the first 2080 entries of the file's first column as plain strings,
# rather than assigning a DataFrame slice to `df.columns` and unpacking the
# resulting tuples in a second pass.
df.columns = six_mers.iloc[:2080, 0].tolist()

In [None]:
# Attach the genome name of every sample as the target column.
labels = pd.read_csv('datasets/train_labels.csv')
# Assign the label column itself (a Series) instead of the whole one-column
# DataFrame, so the assignment does not depend on implicit frame-to-column
# coercion.
df['genome_label'] = labels['genome_name']
df.head()

Unnamed: 0,AAAAAA,AAAAAT,AAAAAG,AAAAAC,AAAATA,AAAATT,AAAATG,AAAATC,AAAAGA,AAAAGT,...,CCATGG,CCAGGG,CCACGG,CCTAGG,CCCAGG,CCGAGG,CCCCGG,CCGCGG,CCCGGG,genome_label
0,0.00247,0.004528,0.003292,0.000823,0.003704,0.00288,0.001646,0.001646,0.001646,0.001646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,staphylococcus_aureus
1,0.001818,0.002857,0.002077,0.001558,0.003635,0.002338,0.002338,0.001039,0.001818,0.001039,...,0.000519,0.0,0.0,0.000519,0.00026,0.0,0.0,0.0,0.0,staphylococcus_aureus
2,0.003702,0.003084,0.001234,0.001851,0.002468,0.003084,0.003084,0.001851,0.001234,0.000617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,staphylococcus_aureus
3,0.001102,0.002756,0.003584,0.001378,0.003307,0.002481,0.002481,0.001102,0.001654,0.001378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,staphylococcus_aureus
4,0.004318,0.003534,0.002748,0.001701,0.003534,0.002356,0.002224,0.002617,0.003271,0.00144,...,0.0,0.0,0.0,0.000262,0.000131,0.0,0.0,0.0,0.0,staphylococcus_aureus


In [None]:
# Number of samples per genome label in the full training set.
df.genome_label.value_counts()

decoy                              446209
burkholderia_pseudomallei            3787
pseudomonas_aeruginosa               3342
klebsiella_michiganensis             3167
mycobacterium_ulcerans               2999
klebsiella_pneumoniae                2840
serratia_liquefaciens                2832
citrobacter_freundii                 2718
salmonella_enterica_typhimurium      2595
salmonella_enterica_paratyphi        2579
yersinia_enterocolitica              2416
stenotrophomonas_maltophilia         2388
mycobacterium_tuberculosis           2354
clostridioides_difficile             2249
acinetobacter_baumannii              2133
legionella_pneumophila               1814
vibrio_parahaemolyticus              1743
listeria_monocytogenes               1588
vibrio_cholerae                      1564
staphylococcus_aureus                1493
staphylococcus_pseudintermedius      1381
corynebacterium_ulcerans             1306
corynebacterium_diphtheriae          1274
neisseria_meningitidis            

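The training set contains 505,536 samples and 2,080 features, with no missing values. The labels span 31 classes: 30 pathogens plus one decoy class. The decoy class alone accounts for 446,209 samples (about 88%), so the classes are heavily imbalanced.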

In [None]:
# Sanity checks: number of labelled samples, number of distinct labels,
# and (as the cell output) every row that contains a missing value.
print(df['genome_label'].count())
print(df['genome_label'].unique().size)
df.loc[df.isna().any(axis=1)]

505536
31


Unnamed: 0,AAAAAA,AAAAAT,AAAAAG,AAAAAC,AAAATA,AAAATT,AAAATG,AAAATC,AAAAGA,AAAAGT,...,CCATGG,CCAGGG,CCACGG,CCTAGG,CCCAGG,CCGAGG,CCCCGG,CCGCGG,CCCGGG,genome_label


In [None]:
# subsample with random seed for reproducibility
# sample_size = df.genome_label.value_counts().to_frame().min().values[0]
def sampling(x, n):
    """Return a random subset of at most ``n`` rows of ``x``.

    When ``x`` has fewer than ``n`` rows, all of its rows are returned
    (in shuffled order). The draw uses a fixed ``random_state`` of 4, so
    repeated calls on the same input give the same rows.
    """
    return x.sample(min(n, x.shape[0]), random_state=4)

# Cap every class at 3787 rows (the size of the largest non-decoy class in the
# value counts above); classes with fewer rows are kept whole.
df_train = df.groupby('genome_label').apply(sampling, 3787)
labels = df_train['genome_label']

In [None]:
# Class sizes after capping.
df_train.genome_label.value_counts()

burkholderia_pseudomallei          3787
decoy                              3787
pseudomonas_aeruginosa             3342
klebsiella_michiganensis           3167
mycobacterium_ulcerans             2999
klebsiella_pneumoniae              2840
serratia_liquefaciens              2832
citrobacter_freundii               2718
salmonella_enterica_typhimurium    2595
salmonella_enterica_paratyphi      2579
yersinia_enterocolitica            2416
stenotrophomonas_maltophilia       2388
mycobacterium_tuberculosis         2354
clostridioides_difficile           2249
acinetobacter_baumannii            2133
legionella_pneumophila             1814
vibrio_parahaemolyticus            1743
listeria_monocytogenes             1588
vibrio_cholerae                    1564
staphylococcus_aureus              1493
staphylococcus_pseudintermedius    1381
corynebacterium_ulcerans           1306
corynebacterium_diphtheriae        1274
neisseria_meningitidis             1196
streptococcus_equi                 1187


In [None]:
# Share of cells in the sub-sampled table that are exactly zero, obtained once
# by summing per-row zero counts and once by summing per-column zero counts.
# Both totals cover the same cells, so the two printed values coincide.
zero_mask = df_train == 0
print('rows')
print(zero_mask.sum(axis=1).sum() / zero_mask.size)
print('cols')
print(zero_mask.sum(axis=0).sum() / zero_mask.size)

rows
0.3776588403613252
cols
0.3776588403613252


In [None]:
# Zero-valued cells per row, expressed as a share of all cells in the table.
# The original line had misplaced parentheses and did not parse.
x = df_train[df_train == 0].count(axis=1) / (len(df_train.columns) * len(df_train.index))
x = x.to_frame()

## Feature preprocessing

Feature selection: remove highly correlated features (any feature whose absolute pairwise correlation with another feature exceeds 0.7).

In [None]:
# Pairwise correlations between the feature columns. Only numeric columns are
# passed to corr(): df_train still carries the string 'genome_label' column,
# which triggers a warning or an error (depending on the pandas version) when
# included.
corr_matrix = df_train.select_dtypes(include='number').corr()

# Hide the upper triangle *and* the diagonal (np.triu keeps both), so each
# feature pair appears once and self-correlations of 1.0 are ignored.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
corr_matrix = corr_matrix.abs() # sign is irrelevant for redundancy
tri_df = corr_matrix.mask(mask) # masked cells become NaN
# Left disabled: the next cell reads a previously saved 'corr_matrix.csv'.
# tri_df.to_csv('corr_matrix.csv')

  corr_matrix = df_train.corr()


In [None]:
# Load the saved lower-triangle correlation table and drop every feature whose
# column holds at least one absolute correlation above 0.7.
tri_df = pd.read_csv('corr_matrix.csv', index_col=0)
to_drop = tri_df.columns[(tri_df > 0.7).any(axis=0)].tolist()
df_train.drop(columns=to_drop, inplace=True)
len(to_drop)

162

Label processing: Encode categorical labels

In [None]:
# One-hot encode the label column; the empty prefix and separator make each new
# column carry the bare genome name.
df_train = pd.get_dummies(
    df_train,
    columns=['genome_label'],
    prefix='',
    prefix_sep='',
)
df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,AAAAAA,AAAAAT,AAAAAG,AAAAAC,AAAATA,AAAATT,AAAATG,AAAATC,AAAAGA,AAAAGT,...,staphylococcus_pseudintermedius,staphylococcus_pyogenes,stenotrophomonas_maltophilia,streptococcus_agalactiae,streptococcus_equi,streptococcus_pneumoniae,streptococcus_suis,vibrio_cholerae,vibrio_parahaemolyticus,yersinia_enterocolitica
genome_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
acinetobacter_baumannii,50199,0.001087,0.000000,0.001812,0.001812,0.000725,0.000362,0.000362,0.000725,0.000362,0.001087,...,0,0,0,0,0,0,0,0,0,0
acinetobacter_baumannii,49240,0.002443,0.001880,0.001504,0.001880,0.001692,0.003197,0.001128,0.001316,0.002068,0.000940,...,0,0,0,0,0,0,0,0,0,0
acinetobacter_baumannii,49495,0.006226,0.004028,0.004761,0.001831,0.001465,0.004395,0.000732,0.002930,0.003296,0.001099,...,0,0,0,0,0,0,0,0,0,0
acinetobacter_baumannii,49212,0.003307,0.005623,0.002316,0.002316,0.005291,0.003967,0.002645,0.002645,0.001323,0.000661,...,0,0,0,0,0,0,0,0,0,0
acinetobacter_baumannii,49659,0.000734,0.002203,0.001101,0.001101,0.001285,0.003304,0.001101,0.001652,0.001835,0.000183,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yersinia_enterocolitica,74378,0.005455,0.000909,0.003635,0.003635,0.000909,0.000000,0.001818,0.000909,0.000909,0.003635,...,0,0,0,0,0,0,0,0,0,1
yersinia_enterocolitica,74567,0.000812,0.000000,0.001623,0.002436,0.000812,0.002436,0.001623,0.000812,0.001623,0.000812,...,0,0,0,0,0,0,0,0,0,1
yersinia_enterocolitica,73280,0.000543,0.000814,0.001356,0.001221,0.001492,0.001221,0.001356,0.001221,0.000543,0.001085,...,0,0,0,0,0,0,0,0,0,1
yersinia_enterocolitica,73370,0.002220,0.002775,0.002775,0.002220,0.004440,0.001110,0.000555,0.001665,0.001110,0.000000,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Feature columns only: everything except the trailing 31 one-hot label columns.
df_train.iloc[:, :df_train.shape[1] - 31]

Unnamed: 0_level_0,Unnamed: 1_level_0,AAAAAC,AAAAGT,AAAAGG,AAAAGC,AAAACT,AAAACG,AAAACC,AAATAG,AAATAC,AAATTG,...,CCAAGG,CCATGG,CCAGGG,CCACGG,CCTAGG,CCCAGG,CCGAGG,CCCCGG,CCGCGG,CCCGGG
genome_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
acinetobacter_baumannii,50199,0.001812,0.001087,0.001087,0.002537,0.000725,0.001087,0.001812,0.000725,0.000362,0.001450,...,0.000362,0.000000,0.000000,0.000362,0.000000,0.000000,0.000725,0.000000,0.0,0.000000
acinetobacter_baumannii,49240,0.001880,0.000940,0.000564,0.001316,0.002443,0.000564,0.000376,0.001128,0.000564,0.001692,...,0.000188,0.000376,0.000000,0.000188,0.000000,0.000000,0.000188,0.000000,0.0,0.000000
acinetobacter_baumannii,49495,0.001831,0.001099,0.001465,0.001831,0.000732,0.001465,0.000000,0.000366,0.001099,0.001465,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
acinetobacter_baumannii,49212,0.002316,0.000661,0.000661,0.003307,0.000992,0.000331,0.001323,0.001984,0.001323,0.000992,...,0.000000,0.000000,0.000331,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
acinetobacter_baumannii,49659,0.001101,0.000183,0.000367,0.000367,0.000734,0.000734,0.000734,0.000367,0.000917,0.001835,...,0.000000,0.000367,0.000000,0.000183,0.000000,0.000000,0.000183,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yersinia_enterocolitica,75462,0.002151,0.000615,0.001229,0.000615,0.001843,0.000922,0.001536,0.001536,0.000307,0.000922,...,0.000307,0.000000,0.000307,0.000307,0.000000,0.000307,0.000307,0.000615,0.0,0.000000
yersinia_enterocolitica,74975,0.000000,0.000429,0.000000,0.000000,0.000000,0.000000,0.000858,0.000000,0.000000,0.000429,...,0.000000,0.000000,0.000000,0.000429,0.000000,0.000858,0.000000,0.000000,0.0,0.000858
yersinia_enterocolitica,73763,0.001320,0.001056,0.000000,0.001056,0.000792,0.001848,0.001848,0.000264,0.000528,0.000264,...,0.000528,0.000000,0.000264,0.001320,0.000528,0.000000,0.000528,0.000264,0.0,0.000000
yersinia_enterocolitica,73228,0.001079,0.001079,0.000000,0.000719,0.001079,0.000360,0.001798,0.000360,0.000360,0.001079,...,0.000719,0.000000,0.000719,0.000000,0.000000,0.001438,0.000000,0.000000,0.0,0.000000


In [None]:
# Fit a 1918-component PCA on the feature columns (the trailing 31 one-hot
# label columns are excluded) and keep the projected data.
new_pca = PCA(random_state=4220, n_components=1918)
pca_data = new_pca.fit_transform(df_train.iloc[:, :-31])

# Persist the fitted PCA.
with open('pca_n1918.pkl', 'wb') as pca_file:
    pca_file.write(pkl.dumps(new_pca))

In [None]:
from sklearn.decomposition import TruncatedSVD
# Fit a 500-component truncated SVD on the same feature columns as the PCA.
svd = TruncatedSVD(random_state=4220, n_components=500)
svd_data = svd.fit_transform(df_train.iloc[:, :-31])

# Persist the fitted SVD.
with open('svd_n500.pkl', 'wb') as svd_file:
    svd_file.write(pkl.dumps(svd))

In [None]:
# Display the fitted TruncatedSVD estimator.
svd

In [None]:
# Accumulate the explained-variance ratio of the fitted PCA component by
# component and report where the walk stops.
lim = 0.9  # target share of total variance
ACC_VAR = 0
for i, var in enumerate(new_pca.explained_variance_ratio_):
    ACC_VAR += var
    # Hard stop at 900 components: report the variance reached so far.
    if i >= 899:
        print(f"{i+1} components explained {ACC_VAR} of total var")
        break
    # Otherwise stop as soon as the target share is exceeded.
    if ACC_VAR > lim:
        # The original message had a stray "S" after the threshold value.
        print(f"{i+1} components explained {lim} of total var")
        break
ACC_VAR

900 components explained 0.8859885877176671 of total var


0.8859885877176671

# part 2

In [None]:
# Load the 6-mer training matrix and report its dimensions.
df = pd.DataFrame(np.load('datasets/train.dataset.6mer.npy'))
df.shape

(505536, 2080)

In [None]:
# loading the 6mer column names
six_mers = pd.read_table('datasets/6mer_columns.txt', header=None)
# Use the k-mer strings directly instead of assigning a DataFrame slice to
# `df.columns` and unpacking tuples afterwards.
df.columns = six_mers.iloc[:2080, 0].tolist()

# loading the training labels
df_y = pd.read_csv('datasets/train_labels.csv')
# Encode each genome name as an integer code. `le` is kept so that codes can
# be mapped back to names later via `le.classes_`.
le = preprocessing.LabelEncoder()
y_index = le.fit_transform(df_y['genome_name'])
df['genome_label'] = y_index
df.head()

Unnamed: 0,AAAAAA,AAAAAT,AAAAAG,AAAAAC,AAAATA,AAAATT,AAAATG,AAAATC,AAAAGA,AAAAGT,...,CCATGG,CCAGGG,CCACGG,CCTAGG,CCCAGG,CCGAGG,CCCCGG,CCGCGG,CCCGGG,genome_label
0,0.00247,0.004528,0.003292,0.000823,0.003704,0.00288,0.001646,0.001646,0.001646,0.001646,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
1,0.001818,0.002857,0.002077,0.001558,0.003635,0.002338,0.002338,0.001039,0.001818,0.001039,...,0.000519,0.0,0.0,0.000519,0.00026,0.0,0.0,0.0,0.0,20
2,0.003702,0.003084,0.001234,0.001851,0.002468,0.003084,0.003084,0.001851,0.001234,0.000617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
3,0.001102,0.002756,0.003584,0.001378,0.003307,0.002481,0.002481,0.001102,0.001654,0.001378,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
4,0.004318,0.003534,0.002748,0.001701,0.003534,0.002356,0.002224,0.002617,0.003271,0.00144,...,0.0,0.0,0.0,0.000262,0.000131,0.0,0.0,0.0,0.0,20


In [None]:
# Shape of the label table, number of distinct genome names, and the size of
# every class.
print(df_y.shape)
print(df_y['genome_name'].nunique(dropna=False))
df_y['genome_name'].value_counts()

(505536, 1)
31


decoy                              446209
burkholderia_pseudomallei            3787
pseudomonas_aeruginosa               3342
klebsiella_michiganensis             3167
mycobacterium_ulcerans               2999
klebsiella_pneumoniae                2840
serratia_liquefaciens                2832
citrobacter_freundii                 2718
salmonella_enterica_typhimurium      2595
salmonella_enterica_paratyphi        2579
yersinia_enterocolitica              2416
stenotrophomonas_maltophilia         2388
mycobacterium_tuberculosis           2354
clostridioides_difficile             2249
acinetobacter_baumannii              2133
legionella_pneumophila               1814
vibrio_parahaemolyticus              1743
listeria_monocytogenes               1588
vibrio_cholerae                      1564
staphylococcus_aureus                1493
staphylococcus_pseudintermedius      1381
corynebacterium_ulcerans             1306
corynebacterium_diphtheriae          1274
neisseria_meningitidis            

In [None]:
# Select the rows of one genome. 'genome_label' holds the integer codes
# produced by `le`, so comparing it with the genome *name* never matches (the
# original selected 0 rows). Translate the name into its code(s) first; an
# unknown name simply yields an empty selection.
target_codes = np.flatnonzero(le.classes_ == 'campylobacter_jejuni')
x = df.loc[df['genome_label'].isin(target_codes)]
print(x.shape)
# NOTE(review): both sums below also include the integer 'genome_label' column.
kmer = x.sum(axis=0).to_frame()
read = x.sum(axis=1).to_frame()

(0, 2081)


# data cleaning

In [None]:
# Total k-mer frequency per sample (label column excluded).
# The sums are accumulated in float64: the summary printed for the original
# cell (mean NaN, std 0) shows that the statistics were computed in a dtype
# too narrow for this many rows. Row totals may therefore differ marginally
# from the low-precision ones, and so may the count below the 0.9 cut-off.
# Missing values are not skipped here; the dataset was checked to have none.
sum_row = pd.Series(
    df.iloc[:, :-1].to_numpy().sum(axis=1, dtype=np.float64),
    index=df.index,
)
print(sum_row.describe())
print((sum_row < 0.9).sum())

count    505536.000000
mean               NaN
std           0.000000
min           0.000000
25%           0.975098
50%           1.010742
75%           1.039062
max           1.311523
dtype: float64
22011


In [None]:
# Drop samples whose k-mer frequencies sum to less than 0.9, and filter the
# encoded label array with the same mask so both stay aligned.
keep_mask = sum_row >= 0.9
print(df.shape)
df = df.loc[keep_mask, :]
print(df.shape)
y_index = y_index[keep_mask]

(505536, 2081)
(483525, 2081)


In [None]:
def sampling(x, n):
    """Draw up to ``n`` rows from ``x`` with a fixed seed.

    If ``x`` holds fewer than ``n`` rows, every row is returned. The fixed
    ``random_state`` of 4 makes the selection repeatable.
    """
    available = x.shape[0]
    take = available if n > available else n
    return x.sample(take, random_state=4)
# Cap every class at 3787 rows. group_keys=False keeps the original row index:
# otherwise 'genome_label' becomes an index level as well as a column, and a
# later groupby('genome_label') on this frame is ambiguous.
df = df.groupby('genome_label', group_keys=False).apply(lambda x: sampling(x, n=3787))

In [None]:
# Balance the classes: draw as many rows from every class as the smallest
# class contains.
sample_size = df['genome_label'].value_counts().min()
# sample_size = 500
# Grouping by the label *values* avoids any ambiguity if 'genome_label' is
# also an index level, and the fixed random_state makes the draw repeatable
# (the original per-group sample() call was unseeded).
df_train = df.groupby(df['genome_label'].to_numpy()).sample(n=sample_size, random_state=4)
# df_train['genome_label'].value_counts()

In [None]:
# Correlation table created from running the full data; columns are labelled
# with the k-mer names.
tri_df = pd.read_csv('corr_matrix.csv', index_col=0)
# Drop every feature whose column holds a correlation above 0.7.
to_drop = tri_df.columns[(tri_df > 0.7).any(axis=0)].tolist()
df_train.drop(columns=to_drop, inplace=True)
df_train.shape

(28210, 1919)

In [None]:
### projecting the features with the saved decomposition
# 'svd_n500.pkl' is the path the 500-component TruncatedSVD is pickled to in
# part 1 (it is not a PCA, despite the original heading).
# NOTE: unpickling runs arbitrary code; only load files you created yourself.
with open('svd_n500.pkl', 'rb') as pickle_file:
    preprocess = pkl.load(pickle_file)
y_train = df_train['genome_label'].astype(int).values
# Select the features by dropping the label column by name rather than
# relying on it being the last column.
x_train = preprocess.transform(df_train.drop(columns='genome_label'))
print(x_train.shape, y_train.shape)

### training the model
# RBF-kernel SVM; probability=True enables predict_proba on the fitted model.
clf = SVC(kernel='rbf', probability=True)
clf.fit(x_train, y_train)

(28210, 500) (28210,)


In [None]:
from joblib import dump, load
# Persist the trained SVM classifier to disk with joblib.
dump(clf,'svm_svdfull_910.joblib') # saving the model
# Display the decomposition object that was loaded from 'svd_n500.pkl'.
preprocess