In [7]:
import pandas as pd
import re

In [8]:
# Constants
# Name of the column in the cross-inhibition table that holds each row's label;
# it is used below to filter rows by antigen name and to look up group/order.
labelled_column_name = 'Labelled'

### Comparative binding to wild type vs mutant SARS-CoV-2 NP antigens by different epitope groups

In [None]:
# Load the tab-separated epitope-group table and discard incomplete rows.
# reset_index() keeps the pre-dropna row labels as a regular 'index' column,
# which later cells use as the sorting order.
antigens_by_epitope_groups = (
    pd.read_csv("antigens_by_epitope_groups.csv", sep='\t')
    .dropna()
    .reset_index()
)
antigens_by_epitope_groups

In [None]:
# Antigen names as a plain list, taken from the table's 'blank' column in table order.
antigen_names = antigens_by_epitope_groups.loc[:, 'blank'].to_list()

### Cross inhibition raw data

In [15]:
# Read the tab-separated cross-inhibition readings and drop incomplete rows.
ci_raw_data = pd.read_csv("../Data/cross_inhibitor_raw_data.csv", sep='\t').dropna()

# Keep only rows whose label is a known antigen name, renumber them from 0,
# and discard the 'Neat' column.
is_known_antigen = ci_raw_data[labelled_column_name].isin(antigen_names)
ci_raw_data_filtered = (
    ci_raw_data.loc[is_known_antigen]
    .reset_index(drop=True)
    .drop('Neat', axis=1)
)

In [16]:
# Show the filtered cross-inhibition table (known antigens only, 'Neat' column removed).
ci_raw_data_filtered

Unnamed: 0,Labelled,NP1501*,NP1502*,NP1503*,NP1508*,NP1510*,NP1514*,NP1516*,NP1517*,NP1518*,...,X202*,X211*,X213*,X215*,X217*,X220*,X221*,X223*,X233*,X271*
0,blank,1.089,1.067,1.3664,1.412,1.67,1.07,1.1704,1.11,1.1616,...,1.1844,1.422,1.3617,1.105,1.3671,1.484,1.3536,1.4688,1.234,1.412
1,NP1501,0.449,0.715,1.0664,1.0248,1.136,0.26,0.4172,0.268,0.8512,...,0.5316,1.2267,1.2519,0.64,1.0224,1.189,0.8856,1.4616,0.769,1.228
2,NP1502,0.893,0.425,0.336,0.196,0.349,0.3625,0.665,0.45,1.0,...,0.6492,1.2087,1.2753,0.55,1.0593,1.203,0.2979,1.4184,0.809,0.407
3,NP1503,0.768,0.309,0.068,0.052,0.095,0.305,0.7126,0.408,0.9888,...,0.642,1.197,1.2582,0.6075,1.0791,1.215,0.1035,1.4704,0.93,0.109
4,NP1507,0.422,0.856,0.7216,0.6088,0.785,0.1825,0.4256,0.154,1.0352,...,0.6432,1.2213,1.2672,0.5125,1.0215,1.244,0.6624,1.4168,1.018,0.843
5,NP1508,0.732,0.388,0.1456,0.0848,0.19,0.345,0.8456,0.411,1.1344,...,0.6492,1.2474,1.2699,0.5575,1.0089,1.151,0.207,1.376,1.052,0.233
6,NP1510,0.781,0.382,0.2152,0.1056,0.233,0.495,0.7826,0.527,1.0,...,0.6744,1.1547,1.1934,0.575,0.9513,1.117,0.2331,1.42,0.942,0.22
7,NP1512,0.79,0.789,0.876,0.7504,0.979,0.735,1.015,0.737,0.816,...,0.6012,1.1826,1.0251,0.7425,0.8532,1.123,0.6534,1.2544,1.018,0.627
8,NP1514,0.448,0.822,1.0968,1.0088,1.189,0.2475,0.4858,0.253,1.0208,...,0.6816,1.2384,1.224,0.8075,1.0413,1.332,0.8712,1.4616,0.772,0.964
9,NP1516,0.385,1.034,0.9832,0.9304,1.053,0.1775,0.3052,0.152,0.9504,...,0.5916,1.2078,1.1844,0.5925,1.0116,1.261,0.792,1.4944,0.878,0.974


### Cross inhibition sorted by homology

In [None]:
# Position of the label column; the 'group' column is inserted right after it below.
labelled_column_location = ci_raw_data_filtered.columns.get_loc(labelled_column_name)

In [None]:
# Build a one-row-per-antigen lookup keyed by antigen name.
# drop_duplicates keeps the first occurrence, so a repeated name resolves to
# the same row a first-match scan of the table would pick.
antigen_lookup = (
    antigens_by_epitope_groups
    .drop_duplicates(subset='blank')
    .set_index('blank')
)
row_labels = ci_raw_data_filtered[labelled_column_name]

# Fail loudly instead of silently inserting NaN for labels the table does not know.
unknown_labels = row_labels[~row_labels.isin(antigen_lookup.index)]
if not unknown_labels.empty:
    raise KeyError(
        f"labels missing from antigens_by_epitope_groups: {unknown_labels.unique().tolist()}")

# Get groups and insert them right after the label column
groups = row_labels.map(antigen_lookup['group'])
ci_raw_data_filtered.insert(labelled_column_location+1, 'group', groups)

# Get sorting order (the antigen's 'index' value in the epitope table) and
# append it as the last column
sorting_order = row_labels.map(antigen_lookup['index'])
ci_raw_data_filtered.insert(
    len(ci_raw_data_filtered.columns), 'sorting_order', sorting_order)


In [None]:
# Index the rows by (group, label), replacing the positional index, then
# order them by sorting_order (compared as integers).
ci_raw_data_filtered = (
    ci_raw_data_filtered
    .set_index(['group', labelled_column_name])
    .sort_values(by='sorting_order', key=lambda x: x.astype(int))
)


In [None]:
def ci_column_compare(x):
    """Sort key for a column name: the 'index' of the first antigen matching it.

    The last character of ``x`` is dropped (column headers carry a trailing
    marker such as ``*``) and the remainder is searched for as a literal
    substring of the names in ``antigens_by_epitope_groups['blank']``.

    Returns the 'index' value of the first matching row, or ``float('inf')``
    when nothing matches, so that unmatched columns sort last.
    """
    needle = x[:-1]
    if not needle:
        # An empty needle is contained in every name; treat it as "no match".
        return float('inf')
    # regex=False: names are plain text, so characters such as '.' or '('
    # must not be interpreted as regular-expression syntax.
    index = antigens_by_epitope_groups.loc[
                antigens_by_epitope_groups['blank'].str.contains(needle, regex=False), 'index']
    return index.values[0] if len(index) > 0 else float('inf')

# Reorder the columns by their ci_column_compare key; sorted() is stable, so
# columns with equal keys keep their current relative order.
cols = sorted(ci_raw_data_filtered.columns, key=ci_column_compare)

ci_raw_data_filtered = ci_raw_data_filtered.loc[:, cols]

In [None]:
# Show the table after rows and columns have been reordered.
ci_raw_data_filtered

In [None]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
# pip install kneed
from kneed import KneeLocator
%matplotlib inline

# Default size (width, height) for every matplotlib figure created below.
plt.rcParams['figure.figsize'] = [15, 10]

## Test K-Modes algorithm

In [None]:
# !pip install kmodes
from kmodes.kmodes import KModes

In [None]:
# Reading file
# Raw string: the path value is unchanged, but backslashes such as "\D" are no
# longer parsed as (invalid) escape sequences.
# NOTE(review): absolute, machine-specific path; consider one relative to the notebook.
data_color = pd.read_csv(r"C:\Users\Oleksandr\Desktop\Xema\Xema\Xema.WebApp\drawn_color.csv")
# Use the 'label' column as the row index.
data_color = data_color.set_index('label')

In [None]:
# Elbow curve to find optimal K
# K-modes with random initialisation is stochastic; a fixed seed keeps the
# cost curve (and the elbow derived from it) reproducible across runs.
cost = []
K = range(1, data_color.shape[0])
for num_clusters in K:
    kmode = KModes(n_clusters=num_clusters, init="random", n_init=5, verbose=1,
                   random_state=42)
    # Only the fitted cost is needed here, so the label prediction is skipped.
    kmode.fit(data_color)
    cost.append(kmode.cost_)

plt.plot(K, cost, 'o-', color="blue",
         markerfacecolor='red', markeredgecolor='red')
plt.xlabel('No. of clusters')
plt.ylabel('Cost')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Get exact cluster amount
# The elbow is read off the (K, cost) curve from the previous cell, which is
# declared to the locator as convex and decreasing.
kl = KneeLocator(K, cost, curve="convex", direction="decreasing")
# NOTE(review): presumably `elbow` is None when no knee is detected - confirm and guard before use.
exact_cluster_amount = kl.elbow
exact_cluster_amount

In [None]:
# Building the model with N clusters
# Fixed seed so the random initialisation, and therefore the cluster labels,
# are reproducible across runs.
kmode = KModes(n_clusters=exact_cluster_amount, init="random", n_init=5, verbose=1,
               random_state=42)
# One cluster label per row of data_color, in row order.
clusters = kmode.fit_predict(data_color)
clusters

In [None]:

# Work on a copy: inserting into an alias of data_color would add the
# 'cluster' column to the very frame the model is fitted on, and a re-run of
# this cell would add it a second time.
data_color_clusterize = data_color.copy()
# Rows are still in fit order here, so the positional labels line up with them.
data_color_clusterize.insert(0, "cluster", clusters, True)
data_color_clusterize = data_color_clusterize.sort_values(by=['cluster'])
data_color_clusterize.to_excel('result_color.xlsx')

## Test K-Means algorithm

In [None]:
# !pip install scikit-learn  (the PyPI package is scikit-learn; it is imported as sklearn)
from sklearn.cluster import KMeans

In [None]:
# Reading file
# Raw string: the path value is unchanged, but backslashes such as "\D" are no
# longer parsed as (invalid) escape sequences.
# NOTE(review): absolute, machine-specific path; consider one relative to the notebook.
data_index = pd.read_csv(r"C:\Users\Oleksandr\Desktop\Xema\Xema\Xema.WebApp\drawn_index.csv")
# Use the 'label' column as the row index.
data_index = data_index.set_index('label')

In [None]:
# Elbow curve to find optimal K
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)

# Fit one model per candidate cluster count and record its inertia as the cost.
K = range(1, data_index.shape[0])
cost = []
for k in K:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(data_index)
    cost.append(kmeans.inertia_)

# Cost against number of clusters: blue line with red markers.
fig, ax = plt.subplots()
ax.plot(K, cost, 'o-', color="blue",
        markerfacecolor='red', markeredgecolor='red')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Cost')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Get exact cluster amount
# The elbow is read off the (K, cost) curve from the previous cell, which is
# declared to the locator as convex and decreasing.
kl = KneeLocator(K, cost, curve="convex", direction="decreasing")
# NOTE(review): presumably `elbow` is None when no knee is detected - confirm and guard before use.
exact_cluster_amount = kl.elbow
exact_cluster_amount

In [None]:
# Same settings as the elbow search, so the final model matches the curve.
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)

# Fit the final model with the cluster count chosen at the elbow and show
# the per-row cluster labels.
kmeans = KMeans(n_clusters=exact_cluster_amount, **kmeans_kwargs).fit(data_index)
kmeans.labels_

In [None]:
# kmeans.labels_ is positional and follows the row order of data_index (the
# frame the model was fitted on), so the labels must be attached BEFORE any
# reordering; attaching them after sort_index would pair them with the wrong rows.
data_index_clusterize = data_index.copy()
data_index_clusterize.insert(0, "cluster", kmeans.labels_, True)
# Order by descending label, then by cluster, and export.
data_index_clusterize = (
    data_index_clusterize
    .sort_index(ascending=False)
    .sort_values(by=['cluster'])
)
data_index_clusterize.to_excel('result_index.xlsx')