# Classification problem of severe COVID-19 patients

## Library import

In [1]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

In [2]:
# Path to the tab-separated MAESTRO export that holds the identified variants.
INPUT_MAESTRO_DATA = "data/MAESTRO-d6178bdd-identified_variants_merged_protein_regions-main.tsv"

# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass. With low_memory=True the file is parsed in chunks, which can
# yield columns with mixed dtypes (and a DtypeWarning) on wide/heterogeneous
# tables such as this one.
variants = pd.read_csv(INPUT_MAESTRO_DATA, sep='\t', low_memory=False)

  variants = pd.read_csv(INPUT_MAESTRO_DATA, sep='\t', low_memory=True)


In [3]:
# Preview the first five rows of the raw variants table.
variants.head(n=5)

Unnamed: 0,rowid,ccms_row_id,Algorithm,Filename,Cluster_index,Peptide,Unmodified_sequence,Charge,_dyn_#Intensity_for_cluster,_dyn_#Intensity_for_unmodified_sequence,...,PSP_site_match,DrugBank_drugs,Parent_mass,Num_PSP_Drugbank_events,Start_AA_1_based,End_AA_1_based,Num_spectra_for_cluster,Num_spectra_for_unmodified_sequence,Num_spectra_for_peptide_variant,Internal_ref_orig_intensity
0,1,1,.MODA.,specs_ms.mgf,960991,"K.[304.207]GARLIPEMDQIFTEVEMTTLE(K,304.207).V",.GARLIPEMDQIFTEVEMTTLEK.,4,36.905893,36.905893,...,,,1580.81,0,,,1,1,1,8204.159
1,2,2,.MODA.,specs_ms.mgf,763982,"I.[304.207]FTEVEMTTLE(K,304.207).V",.FTEVEMTTLEK.,3,11.686782,11.686782,...,,,1934.91,0,,,1,2,2,493689.4
2,3,3,.MSGFPLUS.,specs_ms.mgf,902201,K.[304.207]LYQPEYQEVSTEEQR.E,.LYQPEYQEVSTEEQR.,3,15.690234,15.690234,...,,,2203.09,0,,,5,6,6,195156.6
3,4,4,.MSGFPLUS.,specs_ms.mgf,935503,"K.[304.207]AANSLEAFIFETQD(K,304.207).L",.AANSLEAFIFETQDK.,3,15.016824,15.016824,...,,,2292.24,0,,,3,4,4,2877781.0
4,5,5,.MODA.,specs_ms.mgf,297961,"R.[304.207]YSHDF(N,-56.985)FH.I",.YSHDFNFH.,3,33.768015,33.768015,...,,,1313.66,0,,,3,3,3,70884.4


In [4]:
def _sample_id(column_name):
    """Return the first two dot-separated parts of an intensity column name."""
    return '.'.join(column_name.split('.')[:2])


def _condition_of(sample_id):
    """Return the text before the first dot, minus its first 6 characters.

    The sample ids in the index start with the 6-character prefix '_dyn_#',
    so what remains is the condition name (e.g. 'Healthy').
    """
    return sample_id.partition('.')[0][6:]


# Keep the peptide identifier plus every per-sample intensity column.
intensity_columns = [
    name for name in variants.columns if 'intensity_for_peptide_variant' in name
]

# Zero intensities are turned into NaN (treated as missing). After the
# transpose each row is one sample and each column one peptide variant.
variants_processed = (
    variants[['Peptide'] + intensity_columns]
    .replace(0.0, np.nan)
    .set_index('Peptide')
    .T
)

# Shorten the row labels to "<prefix+condition>.<sample>" and derive the
# condition label of every sample from them.
variants_processed.index = variants_processed.index.map(_sample_id)
variants_processed['Condition'] = variants_processed.index.map(_condition_of)

# Only these four conditions are kept for the classification problem.
kept_conditions = [
    'Non-severe-COVID-19',
    'Symptomatic-non-COVID-19',
    'Healthy',
    'Severe-COVID-19',
]
variants_processed = variants_processed[
    variants_processed['Condition'].isin(kept_conditions)
]

print(variants_processed['Condition'].value_counts())
print(variants_processed.shape)

Condition
Non-severe-COVID-19         25
Symptomatic-non-COVID-19    25
Healthy                     22
Severe-COVID-19             18
Name: count, dtype: int64
(90, 101462)


In [5]:
# for col in variants_processed.columns:
#     print(col, '$$$$', variants_processed[col].count(),'$$$$',  '[', variants_processed[col].min(), ',',variants_processed[col].max(), ']')

In [6]:
# Preview the first five samples of the processed (samples x peptides) table.
variants_processed.head()

Peptide,"K.[304.207]GARLIPEMDQIFTEVEMTTLE(K,304.207).V","I.[304.207]FTEVEMTTLE(K,304.207).V",K.[304.207]LYQPEYQEVSTEEQR.E,"K.[304.207]AANSLEAFIFETQD(K,304.207).L","R.[304.207]YSHDF(N,-56.985)FH.I","R.[304.207](P,143.096)SV(C,57.021)REAGPQAHMQQVTSSL(K,304.207).G",K.[304.207]QGSTGEEFHFQTGGR.D,"K.[304.207]HGTDDGVVW(M,15.995)NW(K,304.207).G","K.[304.207](H,100.027)GTDDGVVWMNW(K,304.207).G","K.[304.207]H(G,304.213)TDDGVVWMNW(K,304.207).G",...,"K.[304.207]YLGEE(Y,-57.005)V(K,304.207).A","K.[304.207]YLGEE(Y,-58.064)V(K,304.207).A","K.[304.207]YLGEE(Y,-60.599)V(K,304.207).A","K.[304.207]YLGEE(Y,-63.608)V(K,304.207).A","K.[304.207]YLGE(E,-68.078)YV(K,304.207).A","K.[304.207]YL(G,55.921)EEYV(K,304.207).A","K.{187.018}[304.207]YLGEEYV(K,304.207).A","R.[304.207]NTYE(K,361.237)YLGEEYV(K,304.207).A","K.[304.207]YLGE(E,125.898)YV(K,304.207).A",Condition
_dyn_#Healthy.HC1,,,0.935916,,,,,,,0.288821,...,1.958825,,,1.174198,,,,0.37004,,Healthy
_dyn_#Healthy.HC10,6.645649,,,,,,,,,0.026024,...,2.544654,,,,,,,,,Healthy
_dyn_#Healthy.HC12,3.391896,,,,,,,,,0.009033,...,0.913199,,,,,,,,,Healthy
_dyn_#Healthy.HC13,1.919552,,,,,,,,,0.028415,...,1.035449,,,,,,,,,Healthy
_dyn_#Healthy.HC17,,,0.28653,1.259306,,,,,,,...,0.110892,0.81455,0.863862,,,,,0.791905,,Healthy


In [7]:
# Every column except the last one is a feature; the last column holds the
# condition label of each sample.
feature_frame = variants_processed.iloc[:, :-1]
condition_values = variants_processed.iloc[:, -1].to_numpy()

X = feature_frame.to_numpy()

# `labels` are the sorted distinct condition names; `Y` is the integer code of
# each sample, i.e. its position in `labels`.
labels, Y = np.unique(condition_values, return_inverse=True)

print(X.shape, Y.shape, sep='\n')

(90, 101461)
(90,)


In [8]:
# Show the first five feature rows followed by the encoded labels.
print(X[:5], Y, sep='\n')

[[       nan        nan 0.93591588 ...        nan 0.37003983        nan]
 [6.64564884        nan        nan ...        nan        nan        nan]
 [3.39189595        nan        nan ...        nan        nan        nan]
 [1.91955205        nan        nan ...        nan        nan        nan]
 [       nan        nan 0.28653002 ...        nan 0.79190531        nan]]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]


In [9]:
# Hold out 20% of the samples as a test set. The split is stratified on the
# class labels and seeded so that it is reproducible.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Print the train/test shapes, then the train/test label vectors.
for train_part, test_part in ((X_train.shape, X_test.shape), (y_train, y_test)):
    print(train_part, test_part)

(72, 101461) (18, 101461)
[2 3 1 1 1 2 0 2 0 3 3 0 3 2 0 1 1 0 3 1 1 0 0 1 3 2 2 3 2 1 3 1 0 0 3 3 2
 1 1 3 1 3 1 3 1 0 3 3 0 1 3 2 2 0 1 3 1 3 0 2 0 1 0 2 0 0 3 2 0 1 2 3] [3 1 0 0 1 3 0 3 2 1 1 2 2 3 0 3 2 1]


In [11]:
# Number of MISSING (NaN) feature values in every training sample (row).
# Computed in one vectorised pass instead of a Python loop over the rows.
# The result is kept as float so the printed output matches the previous
# float-array formatting.
missing_per_sample = np.isnan(X_train).sum(axis=1).astype(float)

# NOTE: despite its name, `valid_per_sample` has always held the count of
# missing values (total features minus non-NaN values), not the count of valid
# ones. The name is kept as an alias in case later cells reference it.
valid_per_sample = missing_per_sample

print(missing_per_sample)
print(
    np.min(missing_per_sample),
    np.max(missing_per_sample),
    np.mean(missing_per_sample),
    np.median(missing_per_sample),
)

[66120. 56135. 67715. 66306. 56229. 67592. 56238. 59893. 59921. 56206.
 65190. 59815. 56168. 65170. 58912. 66120. 65179. 64835. 59109. 57137.
 56084. 67616. 56250. 65108. 66043. 57010. 58863. 67731. 33313. 58969.
 56035. 57049. 58934. 67500. 46053. 59088. 66158. 59907. 66244. 57197.
 59934. 58803. 59950. 66100. 59002. 66190. 56909. 67624. 66133. 65240.
 56049. 56926. 66146. 56142. 57030. 67564. 59009. 59898. 41778. 58860.
 67515. 56238. 56950. 56056. 56127. 65109. 59810. 56211. 65103. 56281.
 56207. 56990.]
33313.0 67731.0 60069.805555555555 59048.5


In [12]:
# Number of training samples in which each feature (column) is MISSING (NaN).
# Computed in one vectorised pass instead of a Python loop over ~100k columns.
# The result is kept as float so the printed output matches the previous
# float-array formatting.
missing_per_feature = np.isnan(X_train).sum(axis=0).astype(float)

# NOTE: despite its name, `valid_per_feature` has always held the count of
# missing values (number of samples minus non-NaN values), not the count of
# valid ones. The name is kept as an alias in case later cells reference it.
valid_per_feature = missing_per_feature

print(missing_per_feature)
print(
    np.min(missing_per_feature),
    np.max(missing_per_feature),
    np.mean(missing_per_feature),
    np.median(missing_per_feature),
)

[64. 61. 25. ... 52. 46. 53.]
0.0 72.0 42.62747262494949 53.0


In [13]:
# Standardise every feature (column) of the training matrix. NaNs are left out
# of the mean / standard deviation and stay NaN in the result.
# NOTE: zscore returns only the transformed array, so the per-feature mean and
# standard deviation used here are not kept for re-use on other data.
X_train_normalized = stats.zscore(
    X_train,
    axis=0,
    ddof=0,
    nan_policy='omit',
)
X_train_normalized

array([[        nan, -0.85755533, -0.3811363 , ...,         nan,
                nan,         nan],
       [        nan,         nan,  2.62295481, ...,         nan,
        -0.54622186,         nan],
       [        nan,         nan,         nan, ..., -0.91798291,
                nan, -0.79241789],
       ...,
       [        nan,         nan, -0.19347276, ...,         nan,
        -0.5285681 ,         nan],
       [        nan,         nan,  0.68180655, ...,         nan,
        -0.25351343,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,  0.25227816]])

In [14]:
# Fill the remaining NaNs: each missing entry is imputed from the 2 nearest
# training samples.
imputer = KNNImputer(n_neighbors=2)
X_knn = imputer.fit_transform(X_train_normalized)

# KNNImputer drops, by default, every feature that is entirely NaN at fit
# time, so X_knn can have fewer columns than X_train_normalized. Record which
# original columns survived so that the columns of X_knn can still be mapped
# back to the original feature (peptide) positions.
imputed_feature_mask = ~np.isnan(X_train_normalized).all(axis=0)
assert X_knn.shape[1] == int(imputed_feature_mask.sum()), (
    "X_knn column count does not match the number of non-empty features"
)

X_knn

array([[-0.4203426 , -0.85755533, -0.3811363 , ..., -0.89175026,
         0.12673614, -0.76335994],
       [-0.04935335,  0.14508982,  2.62295481, ..., -0.55211438,
        -0.54622186, -0.82683088],
       [-0.27264962, -1.04228745, -0.4809648 , ..., -0.91798291,
        -0.12186765, -0.79241789],
       ...,
       [ 0.39692085, -0.24125504, -0.19347276, ...,  0.07586786,
        -0.5285681 , -0.48023079],
       [-0.04935335,  0.33868829,  0.68180655, ...,  0.04874033,
        -0.25351343, -0.4764932 ],
       [-0.14937939,  0.33868829, -0.7236341 , ...,  0.07586786,
        -0.44974761,  0.25227816]])