In [1]:
""" 
Run nSimplices + MDS vs MDS on subset (STOOL and VAGINA only) of HMP dataset 
"""

' \nRun nSimplices + MDS vs MDS on subset (STOOL and VAGINA only) of HMP dataset \n'

In [2]:
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import pandas as pd
import random as alea
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import MDS


# Directory holding the input HMP csv files and the intermediate subset files.
data_dir = "../data/"
# Directory receiving the axes files and figures written by the later cells.
output_dir = "../outputs"
# Body sites kept in the subset and the plot color of each one
# (target_colors[k] is the color used for target_sites[k]).
target_sites = ['STOOL', 'VAGINA']
target_colors = ["cornflowerblue", "orange"]

# np.savetxt and plt.savefig do not create missing directories, so make sure
# the output directory exists before any later cell writes into it.
os.makedirs(output_dir, exist_ok=True)

In [3]:
"""
Prepare dataset
"""

# Normalized abundance tables to subset: NB first, then QE.
files = ["hmp_v13lqphylotypeQuantNB_rs_c.csv", "hmp_v13lqphylotypeQuantE_rs_c.csv"]
color_df = pd.read_csv(os.path.join(data_dir, "hmp_v13lqphylotypePheno_rs_c.csv"), header=0)

# The phenotype table is the same for every input file, so the rows to keep
# are worked out once instead of being re-scanned inside the per-file loop.
# A sample is kept when at least one of its target-site columns is truthy.
target_rows = [
    index
    for index, row in color_df.iterrows()
    if any(row[site] for site in target_sites)
]

for file in files:
    data_path = os.path.join(data_dir, file)
    df_hmp_ori = np.loadtxt(data_path, delimiter=",")

    # The phenotype index label is used as a row position in the data file.
    # NOTE(review): assumes row k of the data file is the sample in row k of
    # the phenotype table (default integer index) -- confirm.
    df_hmp = np.array([df_hmp_ori[index] for index in target_rows])

    # Insert "target_" in front of the trailing 8 characters ("rs_c.csv"):
    # "<stem>_rs_c.csv" -> "<stem>_target_rs_c.csv".
    target_name = file[:-8] + "target_" + file[-8:]
    np.savetxt(os.path.join(data_dir, target_name), df_hmp, fmt='%1.17f', delimiter=',')


In [4]:
""" 
Prepare colors
"""

# One color per retained sample, in phenotype-table row order, plus the index
# labels of the samples that belong to none of the target sites.
colors = []
drop_indices = []

for index, row in color_df.iterrows():
    # Color of the first target site flagged for this sample, or None.
    # Only one color is recorded per sample: appending one color per matching
    # site would make `colors` longer than the number of retained rows and
    # shift every following color relative to its sample.
    matched_color = next(
        (color for site, color in zip(target_sites, target_colors) if row[site]),
        None,
    )
    if matched_color is None:
        drop_indices.append(index)
    else:
        colors.append(matched_color)

colors = np.array(colors)
print(colors.shape)

np.savetxt(os.path.join(data_dir, "hmp_target_colors.txt"), colors, fmt="%s")

# DataFrame.drop returns a new frame, so color_df itself is left untouched.
new_color_df = color_df.drop(drop_indices)
new_color_df.to_csv(os.path.join(data_dir, "hmp_v13lqphylotypePheno_target_rs_c.csv"), header=True, index=False)

(330,)


In [5]:
""" 
Run nSimplices on HMP dataset
"""
# Per-sample plot colors, read back from the text file in data_dir.
colors = np.loadtxt(os.path.join(data_dir, "hmp_target_colors.txt"), dtype="str")

# Run the project's nsimplices.py inside this notebook's namespace; later cells
# call nsimplices() and cMDS(), which are expected to be defined by that script.
# The `with` block closes the file handle once the source has been read.
# NOTE(review): exec runs whatever the file contains -- only acceptable because
# it is a trusted file of this repository; a regular import would be safer.
with open("../nsimplices.py") as nsimplices_source:
    exec(nsimplices_source.read())

# Seeds Python's `random` module (imported as `alea`); NumPy's generator is not
# affected by this call.
alea.seed(42)


In [6]:
""" 
Run
(1) NB normalization + nSimplices + cMDS 
(2) QE normalization + nSimplices + cMDS 

To derive the axes data
""" 

output_files = ["hmp_target_NB_nSimplices_cMDS_axes.txt", "hmp_target_QE_nSimplices_cMDS_axes.txt"]
data_files = ["hmp_v13lqphylotypeQuantNB_target_rs_c.csv", "hmp_v13lqphylotypeQuantE_target_rs_c.csv"]

# Arguments handed unchanged to nsimplices() for both normalizations.
# NOTE(review): dim_start/dim_end presumably bound the subspace dimensions that
# nsimplices tries -- confirm against nsimplices.py.
feature_num = 11
dim_start = 2
dim_end = 50

# Subspace dimension reported by nsimplices for each data file, in file order.
subspace_dims = []
for data_file, output_file in zip(data_files, output_files):
    print("======== NB/QE normalization + nSimplices + cMDS ========")

    # Pairwise Euclidean distances between samples, as a square matrix.
    df_hmp = np.loadtxt(os.path.join(data_dir, data_file), delimiter=",")
    hmp_dis_sq = squareform(pdist(df_hmp))
    print("hmp_dis_sq shape is:", hmp_dis_sq.shape)

    outlier_indices, subspace_dim, corr_pairwise_dis, corr_coord = nsimplices(
        hmp_dis_sq, feature_num, dim_start, dim_end
    )
    print("subspace dimension is:", subspace_dim)
    subspace_dims.append(subspace_dim)

    # cMDS on the corrected distances yields the coordinates with axes in
    # decreasing order of importance; they are stored for the plotting cell.
    _, _, Xe = cMDS(corr_pairwise_dis)
    np.savetxt(os.path.join(output_dir, output_file), Xe, fmt='%f')

hmp_dis_sq shape is: (330, 330)
dim in find_subspace_dim is: 2
dim in find_subspace_dim is: 3


KeyboardInterrupt: 

In [6]:
""" 
Run
(1) NB normalization + cMDS 
(2) QE normalization + cMDS 

To derive the axes data
"""  

axes_files = ["hmp_target_NB_MDS_cMDS_axes.txt", "hmp_target_QE_MDS_cMDS_axes.txt"] # put NB before QE
data_files = ["hmp_v13lqphylotypeQuantNB_target_rs_c.csv", "hmp_v13lqphylotypeQuantE_target_rs_c.csv"]

# Number of components of the MDS embedding. It is fixed to 3 here; the
# subspace dimension found by nSimplices for the same normalization (the
# matching entry of subspace_dims) could be used instead to keep both
# pipelines consistent.
enforce_dim = 3

for axes_file, data_file in zip(axes_files, data_files):
    print("======== QE/NB normalization + MDS + cMDS ========")
    data_path = os.path.join(data_dir, data_file)
    axes_output_path = os.path.join(output_dir, axes_file)

    # Pairwise Euclidean distances between samples, as a square matrix.
    df_hmp = np.loadtxt(data_path, delimiter=",")
    hmp_dis_sq = squareform(pdist(df_hmp))

    # random_state pins the random starting configuration of the MDS solver.
    # Without it the embedding changes on every run: alea.seed() only seeds
    # Python's `random` module, which scikit-learn does not use.
    embedding = MDS(
        n_components=enforce_dim,
        max_iter=100000000000,
        dissimilarity='precomputed',
        random_state=42,
    )
    corr_coord = embedding.fit_transform(hmp_dis_sq)

    # Re-express the embedding through cMDS so the saved axes are produced the
    # same way as for the nSimplices pipeline.
    corr_dis_sq = squareform(pdist(corr_coord))
    _, _, Xe = cMDS(corr_dis_sq)

    np.savetxt(axes_output_path, Xe, fmt='%f')









In [7]:
""" 
Plot pairwise result
"""

figure_files = ["hmp_target_NB_nSimplices_cMDS.png", "hmp_target_QE_nSimplices_cMDS.png", "hmp_target_NB_MDS_cMDS.png", "hmp_target_QE_MDS_cMDS.png"]
axes_files = ["hmp_target_NB_nSimplices_cMDS_axes.txt", "hmp_target_QE_nSimplices_cMDS_axes.txt", "hmp_target_NB_MDS_cMDS_axes.txt", "hmp_target_QE_MDS_cMDS_axes.txt"]
titles = ["NB+nSimplices", "QuantE+nSimplices", "NB+MDS", "QuantE+MDS"]
num_axes = 3 # show pairwise 2D plot to decompose the 3D plot

# Row positions of the samples of each site, identified by their color.
# `colors` is the same for every figure, so the lookup is done once here
# rather than inside the innermost plotting loop.
stool_indices = [k for k, e in enumerate(colors) if e == 'cornflowerblue']
vagina_indices = [k for k, e in enumerate(colors) if e == 'orange']

for figure_file, axes_file, title in zip(figure_files, axes_files, titles):
    print("======== plot pairwise 2D plot (subset) ========")
    Xe = np.loadtxt(os.path.join(output_dir, axes_file))
    print(Xe.shape)

    # One figure per unordered pair of the first num_axes axes.
    for first_dim in range(num_axes):
        for second_dim in range(first_dim+1, num_axes):
            fig = plt.figure()

            # x axis: second_dim, y axis: first_dim
            plt.scatter(Xe[stool_indices, second_dim],
                        Xe[stool_indices, first_dim], s=5, c='cornflowerblue', label="STOOL")
            plt.scatter(Xe[vagina_indices, second_dim],
                        Xe[vagina_indices, first_dim], s=5, c='orange', label="VAGINA")

            plt.legend()
            plt.title(title, size=10)
            plt.savefig(os.path.join(output_dir, figure_file[:-4]+"_"+str(first_dim)+"_"+str(second_dim)+".png"))

            # The figure is only needed for the saved file; closing it keeps
            # pyplot from accumulating one open figure per plot.
            plt.close(fig)
            

(330, 162)
(330, 161)
(330, 158)
(330, 159)


In [8]:
""" 
Run
(1) NB normalization + nSimplices (without correction) + cMDS 
(2) QE normalization + nSimplices (without correction) + cMDS 

To derive the axes data
""" 

output_files = ["hmp_target_NB_nSimplices_cMDS_no_correct_axes.txt", "hmp_target_QE_nSimplices_cMDS_no_correct_axes.txt"]
data_files = ["hmp_v13lqphylotypeQuantNB_target_rs_c.csv", "hmp_v13lqphylotypeQuantE_target_rs_c.csv"]
subspace_dims = []

# Arguments handed unchanged to nsimplices(). feature_num is defined here
# (same value as in the nSimplices + cMDS cell) so that this cell does not
# depend on a variable left behind by another cell and runs on a fresh kernel.
feature_num = 11
# dim_start == dim_end: a single dimension (3) is passed as both bounds.
dim_start = 3
dim_end = 3

for output_file, data_file in zip(output_files, data_files):
    axes_output_path = os.path.join(output_dir, output_file)
    print("======== NB/QE normalization + nSimplices + cMDS ========")
    data_path = os.path.join(data_dir, data_file)

    # Pairwise Euclidean distances between samples, as a square matrix.
    df_hmp = np.loadtxt(data_path, delimiter=",")
    hmp_dis_sq = squareform(pdist(df_hmp))

    print("hmp_dis_sq shape is:", hmp_dis_sq.shape)
    # correct=False is the "without correction" variant named in the cell header.
    outlier_indices, subspace_dim, corr_pairwise_dis, corr_coord = nsimplices(
        hmp_dis_sq, feature_num, dim_start, dim_end, correct=False
    )
    print("corr_pairwise_dis shape is:", corr_pairwise_dis.shape)
    subspace_dims.append(subspace_dim)

    # run cMDS to get the coordinates in importance decreasing order
    _, _, Xe = cMDS(corr_pairwise_dis)
    np.savetxt(axes_output_path, Xe, fmt='%f')

hmp_dis_sq shape is: (330, 330)
dim in find_subspace_dim is: 3
med_height is: [1.94852377]
subspace_dim one is: 3
thres is: 2.6427036800006505 mean is: 2.013832238296311 std is: 0.20962381390144658
outlier indices are: [  0  12  44  67  75 148 152 199 211 267 275 279 287 288 304 318]
idx is: 0 height is: 2.9521082525833435 thres is: 2.6427036800006505
idx is: 12 height is: 2.642923163201909 thres is: 2.6427036800006505
idx is: 44 height is: 3.0600840203508657 thres is: 2.6427036800006505
idx is: 67 height is: 2.660054020345399 thres is: 2.6427036800006505
idx is: 75 height is: 2.682157043165481 thres is: 2.6427036800006505
idx is: 148 height is: 2.6598561201879085 thres is: 2.6427036800006505
idx is: 152 height is: 2.893887371159374 thres is: 2.6427036800006505
idx is: 199 height is: 3.1025948925354707 thres is: 2.6427036800006505
idx is: 211 height is: 2.7747729886677353 thres is: 2.6427036800006505
idx is: 267 height is: 2.8762337647795304 thres is: 2.6427036800006505
idx is: 275 hei



outliet_indices is: []
subspace dimension is: (330, 330)
hmp_dis_sq shape is: (330, 330)
dim in find_subspace_dim is: 3
med_height is: [1.31788907]
subspace_dim one is: 3
thres is: 3.0176460880970892 mean is: 1.2086675930994715 std is: 0.6029928316658726
outlier indices are: []




outliet_indices is: []
subspace dimension is: (330, 330)


In [9]:
""" 
Plot pairwise result (nSimplices without correction)
"""

figure_files = ["hmp_target_NB_nSimplices_cMDS_no_correct.png", "hmp_target_QE_nSimplices_cMDS_no_correct.png"]
axes_files = ["hmp_target_NB_nSimplices_cMDS_no_correct_axes.txt", "hmp_target_QE_nSimplices_cMDS_no_correct_axes.txt"]
titles = ["NB+nSimplices (no correction)", "QuantE+nSimplices (no correction)"]
num_axes = 3 # show pairwise 2D plot to decompose the 3D plot

# Row positions of the samples of each site, identified by their color.
# `colors` is the same for every figure, so the lookup is done once here
# rather than inside the innermost plotting loop.
stool_indices = [k for k, e in enumerate(colors) if e == 'cornflowerblue']
vagina_indices = [k for k, e in enumerate(colors) if e == 'orange']

for figure_file, axes_file, title in zip(figure_files, axes_files, titles):
    print("======== plot pairwise 2D plot (subset) ========")
    Xe = np.loadtxt(os.path.join(output_dir, axes_file))
    print(Xe.shape)

    # One figure per unordered pair of the first num_axes axes.
    for first_dim in range(num_axes):
        for second_dim in range(first_dim+1, num_axes):
            fig = plt.figure()

            # x axis: second_dim, y axis: first_dim
            plt.scatter(Xe[stool_indices, second_dim],
                        Xe[stool_indices, first_dim], s=5, c='cornflowerblue', label="STOOL")
            plt.scatter(Xe[vagina_indices, second_dim],
                        Xe[vagina_indices, first_dim], s=5, c='orange', label="VAGINA")

            plt.legend()
            plt.title(title, size=10)
            plt.savefig(os.path.join(output_dir, figure_file[:-4]+"_"+str(first_dim)+"_"+str(second_dim)+".png"))

            # The figure is only needed for the saved file; closing it keeps
            # pyplot from accumulating one open figure per plot.
            plt.close(fig)
            

(330, 162)
(330, 161)
