# Classification of severe COVID-19 patients

## Library import

In [None]:
%matplotlib inline

import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer

In [None]:
# Path of the tab-separated input table (relative to the notebook's working directory).
INPUT_MAESTRO_DATA = "data/MAESTRO-d6178bdd-identified_variants_merged_protein_regions-main.tsv"

# Load the whole table into a DataFrame.
variants = pd.read_csv(
    INPUT_MAESTRO_DATA,
    sep='\t',
    low_memory=False,  # read in one pass instead of chunks, so dtype inference sees whole columns
)

In [None]:
# Display the first rows of the loaded table (bare expression, rendered by the notebook).
variants.head()

In [None]:
# Build a samples x peptides intensity matrix with a trailing 'Condition' column.


def _sample_label(column_name):
    """Keep only the first two dot-separated parts of an intensity column name."""
    return '.'.join(column_name.split('.')[:2])


def _condition_of(sample_label):
    """Return the first dot-separated part of the label minus its first six characters."""
    # NOTE(review): the six dropped characters are presumably a fixed prefix in
    # the column names -- confirm against the input file.
    return sample_label.split('.')[0][6:]


# Peptide identifier plus every per-sample intensity column.
intensity_columns = [
    name for name in variants.columns if 'intensity_for_peptide_variant' in name
]

variants_processed = (
    variants[['Peptide'] + intensity_columns]
    .replace(0.0, np.nan)   # a zero intensity is treated as a missing measurement
    .set_index('Peptide')
    .T                      # rows become samples, columns become peptides
)

# Shorten the row labels, then derive each row's condition from its label.
variants_processed.index = variants_processed.index.map(_sample_label)
variants_processed['Condition'] = variants_processed.index.map(_condition_of)

# Keep only the rows belonging to one of the four conditions of interest.
conditions_of_interest = [
    'Non-severe-COVID-19',
    'Symptomatic-non-COVID-19',
    'Healthy',
    'Severe-COVID-19',
]
variants_processed = variants_processed[
    variants_processed['Condition'].isin(conditions_of_interest)
]

# Report the class balance and the final table shape.
print(variants_processed['Condition'].value_counts())
print(variants_processed.shape)

In [None]:
# for col in variants_processed.columns:
#     print(col, '$$$$', variants_processed[col].count(),'$$$$',  '[', variants_processed[col].min(), ',',variants_processed[col].max(), ']')

In [None]:
# Display the first five rows of the processed samples x peptides table.
variants_processed.head(5)

In [None]:
# Split the table into a feature matrix and an integer-encoded target.
# The last column is the target; every column before it is a feature.
feature_frame = variants_processed.iloc[:, :-1]
target_column = variants_processed.iloc[:, -1]

X = feature_frame.to_numpy()

# `labels` holds the sorted unique target values; `Y[i]` is the position in
# `labels` of the i-th row's target value.
labels, Y = np.unique(target_column.to_numpy(), return_inverse=True)

print(X.shape, Y.shape, sep='\n')

In [None]:
# Inspect the first five rows of the feature matrix and the full vector of class codes.
print(X[:5])
print(Y)

In [None]:
# Generate 5 stratified random train/test splits of (X, Y) with a fixed seed.
# X_folded[k] is the (train, test) pair of feature matrices for split k and
# Y_folded[k] is the matching (train, test) pair of class-code vectors.
spliter = StratifiedShuffleSplit(n_splits=5, random_state=42)
X_folded = []
Y_folded = []
# split() must be called on the configured instance, not on the class,
# otherwise n_splits/random_state are never used and the call fails.
for train_index, test_index in spliter.split(X, Y):
    # list.append takes exactly one argument, so store each pair as a tuple.
    X_folded.append((X[train_index], X[test_index]))
    Y_folded.append((Y[train_index], Y[test_index]))

In [None]:
# Count the NaN entries in every row (sample) of X.
# NOTE: despite its name, `valid_per_sample` holds the number of MISSING
# values per sample (row length minus the non-NaN count), stored as floats.
valid_per_sample = np.isnan(X).sum(axis=1).astype(float)
print(valid_per_sample)

# Summary statistics of the per-sample missing counts: min, max, mean, median.
print(
    valid_per_sample.min(),
    valid_per_sample.max(),
    valid_per_sample.mean(),
    np.median(valid_per_sample),
)

In [None]:
# Count the NaN entries in every column (feature) of X.
# NOTE: despite its name, `valid_per_feature` holds the number of MISSING
# values per feature (column length minus the non-NaN count), stored as floats.
valid_per_feature = np.isnan(X).sum(axis=0).astype(float)
print(valid_per_feature)

# Summary statistics of the per-feature missing counts: min, max, mean, median.
print(
    valid_per_feature.min(),
    valid_per_feature.max(),
    valid_per_feature.mean(),
    np.median(valid_per_feature),
)

In [None]:
# Standardise each feature (column) to zero mean and unit variance.
# nan_policy='omit' leaves NaNs out of the mean/std computation; the NaN
# entries themselves stay NaN in the result.
X_normalized = stats.zscore(X, axis=0, nan_policy='omit')

# Bare expression so the notebook displays the standardised matrix.
X_normalized

In [None]:
# Fill the NaNs of the standardised matrix by k-nearest-neighbours imputation
# with 2 neighbours. The imputer is fitted on, and applied to, the full
# X_normalized matrix.
imputer = KNNImputer(n_neighbors=2)
X_knn = imputer.fit_transform(X_normalized)
# Bare expression so the notebook displays the imputed matrix.
X_knn

In [None]:
# imputer = IterativeImputer(random_state=42)
# X_iter = imputer.fit_transform(X_normalized)
# X_iter