In [1]:
""" 
Run nSimplices + MDS vs MDS on subset (STOOL and VAGINA only) of HMP dataset 
"""

' \nRun nSimplices + MDS vs MDS on subset (STOOL and VAGINA only) of HMP dataset \n'

In [2]:
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import pandas as pd
import random as alea
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import MDS


# Input and output locations, relative to the directory the notebook runs from.
data_dir, output_dir = "../data/", "../outputs"

# Body sites kept in the subset, each declared next to the color used to plot it,
# so that target_sites[k] and target_colors[k] always describe the same site.
_site_palette = [("STOOL", "cornflowerblue"), ("VAGINA", "orange")]
target_sites = [site_name for site_name, _ in _site_palette]
target_colors = [site_color for _, site_color in _site_palette]

# Tag embedded in every derived file name (presumably the initials of target_sites).
target_id = "SV"

In [3]:
"""
Prepare dataset
"""

# The NB- and QE-normalized abundance tables are subset in exactly the same way.
files = ["hmp_v13lqphylotypeQuantNB_rs_c.csv", "hmp_v13lqphylotypeQuantE_rs_c.csv"]
color_df = pd.read_csv(os.path.join(data_dir, "hmp_v13lqphylotypePheno_rs_c.csv"), header=0)

# Positional mask over the phenotype rows: True when the sample is flagged for at
# least one target site. It does not depend on the abundance file, so it is built
# once here instead of being recomputed row by row for every file.
target_row_mask = color_df[target_sites].astype(bool).any(axis=1).to_numpy()

for file in files:
    data_path = os.path.join(data_dir, file)
    df_hmp_ori = np.loadtxt(data_path, delimiter=",")

    # Rows of the abundance table are matched to phenotype rows purely by position,
    # so a length mismatch would silently select the wrong samples.
    if len(df_hmp_ori) != len(target_row_mask):
        raise ValueError(
            "%s has %d rows but the phenotype table has %d; rows cannot be aligned"
            % (data_path, len(df_hmp_ori), len(target_row_mask)))

    df_hmp = df_hmp_ori[target_row_mask]

    # file[-8:] is the trailing "rs_c.csv"; the "target_<id>_" tag is inserted just before it.
    np.savetxt(os.path.join(data_dir, file[:-8]+"target_"+target_id+"_"+file[-8:]), df_hmp, fmt='%1.17f', delimiter=',')


In [4]:
""" 
Prepare colors
"""

# colors      : one plotting color per retained sample, in phenotype row order
# drop_indices: index labels of the samples that belong to no target site
colors = []
drop_indices = []

for index, row in color_df.iterrows():
    # Take the color of the FIRST target site the sample is flagged for. Appending
    # one color per matching site would give a sample flagged for several sites
    # several entries, leaving `colors` longer than the retained rows and shifting
    # every later color off its sample.
    site_color = next(
        (color for site, color in zip(target_sites, target_colors) if row[site]),
        None)
    if site_color is None:
        drop_indices.append(index)
    else:
        colors.append(site_color)

colors = np.array(colors)
print(colors.shape)

np.savetxt(os.path.join(data_dir, "hmp_target_"+target_id+"_"+"colors.txt"), colors, fmt="%s")

# DataFrame.drop returns a new frame, so color_df itself is left untouched.
new_color_df = color_df.drop(drop_indices)
new_color_df.to_csv(os.path.join(data_dir, "hmp_v13lqphylotypePheno_target_"+target_id+"_"+"rs_c.csv"), header=True, index=False)

(330,)


In [5]:
""" 
Run nSimplices on HMP dataset
"""
# Reload the per-sample colors written by the color-preparation step.
colors = np.loadtxt(os.path.join(data_dir, "hmp_target_"+target_id+"_"+"colors.txt"), dtype="str")

# Execute nsimplices.py inside the notebook namespace. Later cells call
# nsimplices(...) and cMDS(...), which are not defined in this notebook, so they
# are presumably provided by this script (verify against nsimplices.py).
# The `with` block closes the file handle that a bare exec(open(...).read()) leaks.
with open("../nsimplices.py") as nsimplices_source:
    exec(nsimplices_source.read())

# Seeds Python's `random` module only; NumPy's global generator is not seeded here.
alea.seed(42)


In [6]:
""" 
Run
(1) NB normalization + nSimplices + cMDS 
(2) QE normalization + nSimplices + cMDS 

To derive the axes data
""" 

# (label used in the output name, label used in the input name), NB first, QE second.
_normalizations = [("NB", "QuantNB"), ("QE", "QuantE")]
output_files = ["hmp_target_" + target_id + "_" + out_tag + "_nSimplices_cMDS_axes.txt"
                for out_tag, _ in _normalizations]
data_files = ["hmp_v13lqphylotype" + in_tag + "_target_" + target_id + "_rs_c.csv"
              for _, in_tag in _normalizations]

# Arguments passed to nsimplices(); identical for both normalizations.
feature_num = 11
dim_start = 2
dim_end = 50

# Collects the subspace dimension reported for each normalization, in loop order.
subspace_dims = []

for output_file, data_file in zip(output_files, data_files):
    print("======== NB/QE normalization + nSimplices + cMDS ========")
    data_path = os.path.join(data_dir, data_file)
    axes_output_path = os.path.join(output_dir, output_file)

    # Square matrix of pairwise Euclidean distances between samples.
    df_hmp = np.loadtxt(data_path, delimiter=",")
    hmp_dis_sq = squareform(pdist(df_hmp))
    print("hmp_dis_sq shape is:", hmp_dis_sq.shape)

    outlier_indices, subspace_dim, corr_pairwise_dis, corr_coord = nsimplices(
        hmp_dis_sq, feature_num, dim_start, dim_end, std_multi=2)
    print("subspace dimension is:", subspace_dim)
    subspace_dims.append(subspace_dim)

    # Per the original author's note: cMDS on the corrected distances yields the
    # coordinates ordered by decreasing importance; only those axes are saved.
    _, _, Xe = cMDS(corr_pairwise_dis)
    np.savetxt(axes_output_path, Xe, fmt='%f')

hmp_dis_sq shape is: (330, 330)
dim in find_subspace_dim is: 2
dim in find_subspace_dim is: 3
dim in find_subspace_dim is: 4
dim in find_subspace_dim is: 5
dim in find_subspace_dim is: 6
dim in find_subspace_dim is: 7
dim in find_subspace_dim is: 8
dim in find_subspace_dim is: 9
dim in find_subspace_dim is: 10
dim in find_subspace_dim is: 11
dim in find_subspace_dim is: 12
dim in find_subspace_dim is: 13
dim in find_subspace_dim is: 14
dim in find_subspace_dim is: 15
dim in find_subspace_dim is: 16
dim in find_subspace_dim is: 17
dim in find_subspace_dim is: 18
dim in find_subspace_dim is: 19
dim in find_subspace_dim is: 20
dim in find_subspace_dim is: 21
dim in find_subspace_dim is: 22
dim in find_subspace_dim is: 23
dim in find_subspace_dim is: 24
dim in find_subspace_dim is: 25
dim in find_subspace_dim is: 26
dim in find_subspace_dim is: 27
dim in find_subspace_dim is: 28
dim in find_subspace_dim is: 29
dim in find_subspace_dim is: 30
dim in find_subspace_dim is: 31
dim in find_subs



outliet_indices is: [  0   3   5   9  12  15  20  44  60  67  70  75  76  78  83 134 138 148
 152 197 199 209 211 225 267 272 273 275 276 277 279 283 287 288 290 300
 301 304 306 308 312 318]
original coord is: [ 0.56492061  0.73105691  1.13638058  0.6745109   1.62167712  1.08195659
 -0.54956151  0.17999703 -0.4669762   1.13939667  0.13203398]
proj_coord is: [ 0.29740352 -0.13202433  0.02146158 -0.01128927  0.09545666  0.06270293
 -0.39337185 -0.14607913 -0.6391485  -0.05100268 -0.13097378]
proj_coord is: [ 0.17555923 -0.25220203  0.25352022  0.06950448  0.12855221 -0.00281886
 -0.49833161 -0.07106101 -0.56464339 -0.00140554 -0.41401745]
proj_coord is: [-0.35753111 -0.31841172  0.1545724   0.14531955  0.37044439  0.18803138
 -0.55614121 -0.04023539 -0.7680717   0.13117249 -0.23250998]
original coord is: [-1.40000789 -0.17429514  0.50518812 -0.44536711 -0.18559374  0.76188849
 -0.95003608 -0.50235941 -1.24066436 -0.81835    -0.43746616]
proj_coord is: [ 0.4074389  -0.18087159  0.0294020



outliet_indices is: [  0 134 138 199 267 287]
original coord is: [-1.04511197  0.59249282  0.3819264   0.88794746  0.23055774 -0.64566893
  1.57775639  0.35728345 -0.08277934 -1.27694468  0.26239566]
proj_coord is: [ 0.16247326 -0.02328454  0.08955009  0.08101749 -0.00956511 -0.10466468
  0.31247636 -0.07078009  0.39797402  0.0922269   0.13661905]
proj_coord is: [-0.94739136 -0.28833024  0.28008723  0.31832747 -0.58321708 -0.34096853
  1.35785286  0.06116086 -0.05453583 -0.00609706 -0.01356574]
proj_coord is: [-0.94200928 -0.2709459   0.21180837  0.28836864 -0.59946787 -0.37407846
  1.37192428  0.04554164 -0.0427038  -0.03008328 -0.0395369 ]
original coord is: [-1.04380886  0.51698779  0.462217   -1.10083472 -0.52701039 -0.37784473
 -0.37024172  0.4476734  -2.4039425  -0.79433185  0.31566024]
proj_coord is: [-0.62889483  0.09012884 -0.3466268  -0.31359917  0.03702424  0.40513176
 -1.20952068  0.27397267 -1.5404615  -0.35698812 -0.5288194 ]
proj_coord is: [-1.34960007 -0.08198209 -0.222

In [7]:
# outlier_indices, subspace_dim , corr_pairwise_dis, corr_coord = nsimplices(hmp_dis_sq, feature_num, 3, 3, std_multi=2)
# print(outlier_indices)

In [10]:
""" 
Run
(1) NB normalization + cMDS 
(2) QE normalization + cMDS 

To derive the axes data
"""  

axes_files = ["hmp_target_"+target_id+"_"+"NB_MDS_cMDS_axes.txt", "hmp_target_"+target_id+"_"+"QE_MDS_cMDS_axes.txt"] # put NB before QE
data_files = ["hmp_v13lqphylotypeQuantNB_target_"+target_id+"_"+"rs_c.csv", "hmp_v13lqphylotypeQuantE_target_"+target_id+"_"+"rs_c.csv"]

# Number of MDS components; same value as the feature_num given to nsimplices().
feature_num = 11
# Fixed seed for the MDS initialisation. alea.seed() only seeds Python's `random`
# module, which scikit-learn does not use, so without an explicit random_state the
# baseline embedding differs from run to run.
mds_seed = 42

for axes_file, data_file in zip(axes_files, data_files):
    print("======== QE/NB normalization + MDS + cMDS ========")
    data_path = os.path.join(data_dir, data_file)
    axes_output_path = os.path.join(output_dir, axes_file)

    # Square matrix of pairwise Euclidean distances between samples.
    df_hmp = np.loadtxt(data_path, delimiter=",")
    hmp_dis_sq = squareform(pdist(df_hmp))

    # Baseline without outlier correction: embed the raw distances with MDS.
    # max_iter is set so high that it is effectively no cap on the iteration count.
    embedding = MDS(n_components=feature_num, max_iter=100000000000,
                    dissimilarity='precomputed', random_state=mds_seed)
    corr_coord = embedding.fit_transform(hmp_dis_sq)

    # Feed the distances of the embedded points through cMDS, mirroring the final
    # step of the nSimplices pipeline, and save the resulting axes.
    corr_dis_sq = squareform(pdist(corr_coord))
    _, _, Xe = cMDS(corr_dis_sq)

    np.savetxt(axes_output_path, Xe, fmt='%f')









In [11]:
""" 
Plot pairwise result
"""

# figure_files[k], axes_files[k] and titles[k] describe the same pipeline.
figure_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS.png",
    "hmp_target_"+target_id+"_"+"QE_nSimplices_cMDS.png",
    "hmp_target_"+target_id+"_"+"NB_MDS_cMDS.png",
    "hmp_target_"+target_id+"_"+"QE_MDS_cMDS.png"]
axes_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS_axes.txt",
    "hmp_target_"+target_id+"_"+"QE_nSimplices_cMDS_axes.txt",
    "hmp_target_"+target_id+"_"+"NB_MDS_cMDS_axes.txt",
    "hmp_target_"+target_id+"_"+"QE_MDS_cMDS_axes.txt"]
titles = ["NB+nSimplices", "QuantE+nSimplices", "NB+MDS", "QuantE+MDS"]
num_axes = 3 # show pairwise 2D plot to decompose the 3D plot

# Row indices of the samples of each target site, in target_colors order.
# They depend only on `colors`, so they are computed once for all figures.
target_indices = [
    [row for row, sample_color in enumerate(colors) if sample_color == color]
    for color in target_colors
]

for figure_file, axes_file, title in zip(figure_files, axes_files, titles):
    print("======== plot pairwise 2D plot (subset) ========")
    Xe = np.loadtxt(os.path.join(output_dir, axes_file))
    print(Xe.shape)

    # One figure per unordered pair of the first num_axes axes.
    for first_dim in range(num_axes):
        for second_dim in range(first_dim+1, num_axes):
            fig = plt.figure()

            # x is the higher-numbered axis of the pair, y the lower-numbered one.
            for site_rows, color, site in zip(target_indices, target_colors, target_sites):
                plt.scatter(Xe[site_rows, second_dim],
                    Xe[site_rows, first_dim], s=5, c=color, label=site)

            plt.legend()
            plt.title(title, size=10)
            # figure_file[:-4] strips ".png" so the axis pair can be appended to the name.
            plt.savefig(os.path.join(output_dir, figure_file[:-4]+"_"+str(first_dim)+"_"+str(second_dim)+".png"))
            # The figure is only needed on disk; close it so that figures do not
            # pile up in memory across the loop.
            plt.close(fig)
            

(330, 164)
(330, 164)
(330, 161)


  plt.figure()


(330, 164)
