In [1]:
import pandas as pd
import numpy as np

In [148]:
import matplotlib.pyplot as plt
import seaborn as sns

In [140]:
from sklearn.manifold import MDS

In [45]:
from sklearn.metrics.pairwise import cosine_similarity

In [750]:
# Load both embedding tables (space-separated, no header row, first column used
# as the index) and discard the "<unk>" row from each.
co_embedding = pd.read_csv(
    "../../../../data/social_niche_embedding_100.txt",
    sep=" ", header=None, index_col=0, low_memory=False,
).drop("<unk>")
phy_embedding = pd.read_csv(
    "../../../../data/Embedding_list/PCA_100.txt",
    sep=" ", header=None, index_col=0, low_memory=False,
).drop("<unk>")

# Restrict both tables to the ids they share; np.intersect1d returns them
# sorted and unique, so the two frames end up row-aligned.
inter_id = np.intersect1d(co_embedding.index, phy_embedding.index)
co_embedding, phy_embedding = co_embedding.loc[inter_id], phy_embedding.loc[inter_id]

In [876]:
# Sample metadata (tab-separated, first column used as the index) and the list
# of study identifiers that make up the IBD analysis below.
metadata = pd.read_csv("../../Data/disease_data/IBD/metadata.tsv", index_col=0, sep="\t")
IBD_study = [
    "PRJNA324147",
    "PRJNA368966",
    "PRJNA422193",
    "PRJNA431126",
    "PRJNA450340",
    "qiita_1629",
    "qiita_2538",
    "RISK_PRISM_f",
]

In [883]:
# Per-study SHAP tables, restricted to that study's samples (rows) and to the
# features that also have an embedding (columns).
shap_test = {}
fids = []  # feature ids seen in any study (duplicates allowed at this stage)
sids = []  # sample ids, accumulated in study order
for n in IBD_study:
    # Samples whose metadata "study" value is this study.
    sid = metadata.index[(metadata["study"] == n).to_numpy()].values
    df = pd.read_csv(f"../../Data/biomark/shap_table_{n}.csv", index_col=0)
    inter_id = np.intersect1d(df.columns.values, co_embedding.index.values)
    # Row and column selection in one step; raises KeyError if a sample id is
    # absent from the SHAP table.
    df = df.loc[sid, inter_id]
    fids.extend(inter_id)
    sids.extend(df.index.values)
    shap_test[n] = df

In [884]:
# Collapse the per-study feature ids into one sorted array of unique ids.
fids = np.unique(np.asarray(fids))

In [885]:
# Sample x feature matrix that the next cell fills with SHAP values. Cells that
# no study fills keep the value 0.
# Allocate with a float zero: the values written into this frame are read from
# the SHAP CSV tables, and an integer-typed frame would have to be silently
# upcast on assignment (deprecated in recent pandas).
shape_df = pd.DataFrame(0.0, index=sids, columns=fids)

In [887]:
# Copy each study's SHAP block into its rows/columns of the combined matrix.
for n in IBD_study:
    study_shap = shap_test[n]
    shape_df.loc[study_shap.index.values, study_shap.columns.values] = study_shap.to_numpy()

In [888]:
# Mean SHAP value of every feature across all samples, sorted ascending so the
# most negative means come first and the most positive last.
fid = shape_df.columns
fid_importacne = shape_df.mean(axis=0).to_numpy()
df_shap_mean = pd.DataFrame(
    {"fid": fid, "fid_importacne": fid_importacne}
).sort_values("fid_importacne")

In [889]:
# Preview the five features with the lowest mean SHAP value.
df_shap_mean.head(5)

Unnamed: 0,fid,fid_importacne
1413,EF451616.1.1203,-0.015341
1915,EU766019.1.1351,-0.014384
4022,JQ456273.1.1334,-0.013953
1922,EU766108.1.1338,-0.01286
1896,EU765003.1.1353,-0.01088


In [891]:
# df_shap_mean is sorted ascending by mean SHAP value: take the 50 lowest as
# the control-side markers and the 50 highest as the disease-side markers.
health_fid_biomark = df_shap_mean.head(50)["fid"].values
disease_fid_biomark = df_shap_mean.tail(50)["fid"].values

In [892]:
# Keep IBD-specific handles on the two marker sets; the generic names are
# reassigned by the CRC section further down.
ibd_biomark_enrich_control, ibd_biomark_enrich_ibd = health_fid_biomark, disease_fid_biomark

In [938]:
# 2-D MDS of the IBD marker features, computed once per embedding from
# pairwise cosine distances (1 - cosine similarity).
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)

# Row order, feature ids and group labels all come from the IBD-specific
# selections. The generic health_fid_biomark / disease_fid_biomark names are
# reassigned by the CRC section, so using them here would attach CRC feature
# ids to IBD coordinates whenever this cell runs after the CRC cells.
id_ = list(ibd_biomark_enrich_control) + list(ibd_biomark_enrich_ibd)
ibd_fid_set = set(ibd_biomark_enrich_ibd)
group = ["IBD" if i in ibd_fid_set else "Ctrl" for i in id_]

# Phylogenetic embedding (phy_embedding).
data = phy_embedding.loc[id_]
data = 1 - cosine_similarity(data)
mds_coordinates = mds.fit_transform(data)
emb_type = ["Phylo"] * len(group)
# The coordinate columns are MDS axes despite the "t_SNE_*" names; the names
# are kept unchanged so the CSV written from these frames keeps its schema.
plot_data_1 = pd.DataFrame({"t_SNE_1": mds_coordinates[:,0], "t_SNE_2": mds_coordinates[:,1], 
                            "group": group, "emb_type": emb_type, "fid": id_})

# Social niche embedding (co_embedding), labelled "SNE".
data = co_embedding.loc[id_]
data = 1 - cosine_similarity(data)
mds_coordinates = mds.fit_transform(data)
emb_type = ["SNE"] * len(group)
plot_data_2 = pd.DataFrame({"t_SNE_1": mds_coordinates[:,0], "t_SNE_2": mds_coordinates[:,1], 
                          "group": group, "emb_type": emb_type, "fid": id_})



In [939]:
# Stack the two per-embedding tables (each keeps its own row index) and save
# the result for plotting.
plot_data = pd.concat((plot_data_1, plot_data_2), axis=0)
plot_data.to_csv(path_or_buf="../../Data/biomark/MDS_IBD_shap.csv")

### CRC

In [920]:
# NOTE(review): this CRC section reads the metadata file under
# disease_data/IBD/, the same path the IBD section uses. Unless that file is a
# combined table covering the CRC studies too, this is a copy-paste slip;
# confirm the intended path.
metadata = pd.read_csv("../../Data/disease_data/IBD/metadata.tsv", sep="\t", index_col=0)
CRC_study = ["PRJDB11845", "PRJEB36789", "PRJEB6070", "PRJNA290926", "PRJNA318004", "PRJNA430990", "PRJNA824020"]

# Fail loudly if a CRC study has no rows in the metadata. Otherwise the
# per-study sample selection that follows comes back empty and that study
# contributes nothing, without any error.
missing_studies = sorted(set(CRC_study) - set(metadata["study"]))
if missing_studies:
    raise ValueError(f"Studies not found in the metadata 'study' column: {missing_studies}")

In [921]:
# Per-study SHAP tables for the CRC cohorts, restricted to that study's samples
# (rows) and to the features that also have an embedding (columns).
shap_test = {}
fids = []  # feature ids seen in any study (duplicates allowed at this stage)
sids = []  # sample ids, accumulated in study order
for n in CRC_study:
    # Samples whose metadata "study" value is this study.
    sid = metadata.index[(metadata["study"] == n).to_numpy()].values
    df = pd.read_csv(f"../../Data/biomark/shap_table_test_{n}.csv", index_col=0)
    inter_id = np.intersect1d(df.columns.values, co_embedding.index.values)
    # Row and column selection in one step; raises KeyError if a sample id is
    # absent from the SHAP table.
    df = df.loc[sid, inter_id]
    fids.extend(inter_id)
    sids.extend(df.index.values)
    shap_test[n] = df

In [922]:
# Build the combined sample x feature SHAP matrix for the CRC studies.
fids = np.unique(fids)
# Allocate with a float zero: the values written below are read from the SHAP
# CSV tables, and an integer-typed frame would have to be silently upcast on
# assignment (deprecated in recent pandas). Cells that no study fills keep 0.
shape_df = pd.DataFrame(0.0, index=sids, columns=fids)
for n in CRC_study:
    # Copy this study's SHAP block into its rows/columns of the matrix.
    shape_df.loc[shap_test[n].index.values, shap_test[n].columns.values] = shap_test[n].values

In [923]:
# Mean SHAP value of every feature across all CRC-section samples, sorted
# ascending (most negative mean first, most positive last).
fid = shape_df.columns
fid_importacne = shape_df.mean(axis=0).to_numpy()
df_shap_mean = pd.DataFrame(
    {"fid": fid, "fid_importacne": fid_importacne}
).sort_values("fid_importacne")

In [924]:
# df_shap_mean is sorted ascending by mean SHAP value: the 50 lowest become the
# control-side markers, the 50 highest the disease-side markers.
health_fid_biomark = df_shap_mean.head(50)["fid"].values
disease_fid_biomark = df_shap_mean.tail(50)["fid"].values

In [925]:
# CRC-specific handles on the two marker sets selected in the previous cell.
crc_biomark_enrich_control, crc_biomark_enrich_crc = health_fid_biomark, disease_fid_biomark

In [942]:
# 2-D MDS of the CRC marker features, computed once per embedding from
# pairwise cosine distances (1 - cosine similarity).
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=42)

# Row order, feature ids and group labels all come from the CRC-specific
# selections rather than the generic health_fid_biomark / disease_fid_biomark
# names, which are also assigned by the IBD section. That keeps ids and
# coordinates matched whatever order the cells are run in.
id_ = list(crc_biomark_enrich_control) + list(crc_biomark_enrich_crc)
crc_fid_set = set(crc_biomark_enrich_crc)
group = ["CRC" if i in crc_fid_set else "Ctrl" for i in id_]

# Phylogenetic embedding (phy_embedding).
data = phy_embedding.loc[id_]
data = 1 - cosine_similarity(data)
mds_coordinates = mds.fit_transform(data)
emb_type = ["Phylo"] * len(group)
# The coordinate columns are MDS axes despite the "t_SNE_*" names; the names
# are kept unchanged so the CSV written from these frames keeps its schema.
plot_data_1 = pd.DataFrame({"t_SNE_1": mds_coordinates[:,0], "t_SNE_2": mds_coordinates[:,1], 
                            "group": group, "emb_type": emb_type, "fid": id_})

# Social niche embedding (co_embedding), labelled "SNE".
data = co_embedding.loc[id_]
data = 1 - cosine_similarity(data)
mds_coordinates = mds.fit_transform(data)
emb_type = ["SNE"] * len(group)
plot_data_2 = pd.DataFrame({"t_SNE_1": mds_coordinates[:,0], "t_SNE_2": mds_coordinates[:,1], 
                          "group": group, "emb_type": emb_type, "fid": id_})



In [943]:
# Stack the two per-embedding tables (each keeps its own row index) and save
# the result for plotting.
plot_data = pd.concat((plot_data_1, plot_data_2), axis=0)
plot_data.to_csv(path_or_buf="../../Data/biomark/MDS_CRC_shap.csv")