In [1]:
""" 
Run nSimplices + MDS vs MDS on subset (STOOL and VAGINA only) of HMP dataset 
"""

' \nRun nSimplices + MDS vs MDS on subset (STOOL and VAGINA only) of HMP dataset \n'

In [2]:
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import os
import pandas as pd
import random as alea
from scipy.spatial.distance import pdist, squareform
from sklearn.manifold import MDS
import plotly.express as px


# Directory holding the HMP abundance / phenotype CSVs. The filtered subsets
# produced by this notebook are written back into the same directory.
data_dir = "../data/"
# Directory that receives the axes text files and the generated figures.
output_dir = "../outputs"
# Body sites kept for this run; each entry is used as a column name of the
# phenotype table.
target_sites = ['NOSE', 'VAGINA', 'THROAT']
# Short tag embedded in every generated file name for this site selection.
target_id = "NVT"
# Plot colour of each site; parallel to target_sites (same order, same length).
target_colors = ["darkgreen", "orange", "deeppink"]

# target_sites = ['STOOL', 'VAGINA']
# target_id = "SV"
# target_colors = ["cornflowerblue", "orange"]

# if color == "deeppink":
#         return "THROAT"
#     if color == "black":
#         return "EARS"
#     if color == "cornflowerblue":
#         return "STOOL"
#     if color == "darkgreen":
#         return "NOSE"
#     if color == "red":
#         return "ELBOWS"
#     if color == "gray":
#         return "MOUTH"
#     if color == "orange":
#         return "VAGINA"

# target_sites = ['THROAT']
# target_id = "T"
# target_colors = ["deeppink"]


In [3]:
"""
Prepare dataset
"""

# iterate over QE and NB
files = ["hmp_v13lqphylotypeQuantNB_rs_c.csv", "hmp_v13lqphylotypeQuantE_rs_c.csv"]
color_df = pd.read_csv(os.path.join(data_dir, "hmp_v13lqphylotypePheno_rs_c.csv"), header=0)

for file in files:
    data_path = os.path.join(data_dir, file)
    df_hmp_ori = np.loadtxt(data_path, delimiter=",")
    df_hmp = []


    for index, row in color_df.iterrows():
        site_exist = False
        for site in target_sites:
            if row[site]:
                site_exist = True
        if site_exist:
            df_hmp.append(df_hmp_ori[index])
    df_hmp = np.array(df_hmp)
    np.savetxt(os.path.join(data_dir, file[:-8]+"target_"+target_id+"_"+file[-8:]), df_hmp, fmt='%1.17f', delimiter=',')


In [4]:
""" 
Prepare colors
"""

colors = []
new_color_df = color_df.copy(deep = True)
drop_indices = []

for index, row in color_df.iterrows():
    site_exist = False
    for i in range(len(target_sites)):
        site = target_sites[i]
        if row[site]:
            colors.append(target_colors[i])
            site_exist = True
    if not site_exist:
        drop_indices.append(index)

colors = np.array(colors)
print(colors.shape)

np.savetxt(os.path.join(data_dir, "hmp_target_"+target_id+"_"+"colors.txt"), colors, fmt="%s")

new_color_df = new_color_df.drop(drop_indices)
new_color_df.to_csv(os.path.join(data_dir, "hmp_v13lqphylotypePheno_target_"+target_id+"_"+"rs_c.csv"), header=True, index=False)

(454,)


In [5]:
""" 
Run nSimplices on HMP dataset
"""
colors = np.loadtxt(os.path.join(data_dir, "hmp_target_"+target_id+"_"+"colors.txt"), dtype="str")
exec(open("../nsimplices.py").read())
alea.seed(42)


In [14]:
""" 
Run
(1) NB normalization + nSimplices + cMDS 
(2) QE normalization + nSimplices + cMDS 

To derive the axes data
""" 

output_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS_axes.txt", "hmp_target_"+target_id+"_"+"QE_nSimplices_cMDS_axes.txt"]
data_files = ["hmp_v13lqphylotypeQuantNB_target_"+target_id+"_"+"rs_c.csv", "hmp_v13lqphylotypeQuantE_target_"+target_id+"_"+"rs_c.csv"]
subspace_dims = []
outlier_indices_list = []
for i in range(len(output_files)):
    output_file = output_files[i]
    data_file = data_files[i]
    axes_output_path = os.path.join(output_dir, output_file)
    print("======== NB/QE normalization + nSimplices + cMDS ========")
    data_path = os.path.join(data_dir, data_file)
    df_hmp = np.loadtxt(data_path, delimiter=",")
    hmp_dis_sq=squareform(pdist(df_hmp))

    feature_num = df_hmp.shape[1]
    dim_start = 2
    dim_end = 50
    # dim_start = 3
    # dim_end = 3

    print("hmp_dis_sq shape is:", hmp_dis_sq.shape)
    outlier_indices, subspace_dim , corr_pairwise_dis, corr_coord = nsimplices(hmp_dis_sq, feature_num, dim_start, dim_end, std_multi=2)
    print("subspace dimension is:", subspace_dim)
    subspace_dims.append(subspace_dim)
    outlier_indices_list.append(outlier_indices)

    # run cMDS to get the corrected coordinates in importance decreasing order
    _, _, Xe = cMDS(corr_pairwise_dis)
    np.savetxt(axes_output_path, Xe, fmt='%f')


hmp_dis_sq shape is: (454, 454)
dim in find_subspace_dim is: 2
dim in find_subspace_dim is: 3
dim in find_subspace_dim is: 4
dim in find_subspace_dim is: 5


In [7]:
""" 
Inferred dimension
QE: STOOL - 3, VAGINA - 3, THROAT - 3, EARS - 3, NOSE - 3, ELBOWS - 3, MOUTH - 3
NB: STOOL - 41, VAGINA - 5, THROAT - , EARS - 3, NOSE - 3, ELBOWS - 3, MOUTH - 3

"""

' \nInferred dimension\nQE: STOOL - 3, VAGINA - 3, THROAT - 3, EARS - 3, NOSE - 3, ELBOWS - 3, MOUTH - 3\nNB: STOOL - 41, VAGINA - 5, THROAT - , EARS - 3, NOSE - 3, ELBOWS - 3, MOUTH - 3\n\n'

In [8]:
# outlier_indices_list is filled in [NB, QE] order, so entry 1 is the QE run.
sample_count = len(colors)
qe_outlier_count = len(outlier_indices_list[1])
print("number of samples is:", sample_count)
print("number of outliers is:", qe_outlier_count)

number of samples is: 454
number of outliers is: 25


site - number of samples - number of outliers

VAGINA: 186 - 26

NOSE: 136 - 18

THROAT: 134 - 14

VAGINA, NOSE, THROAT: 454 - 25


In [9]:
""" 
Run
(1) NB normalization + cMDS 
(2) QE normalization + cMDS 

To derive the axes data
"""  

axes_files = ["hmp_target_"+target_id+"_"+"NB_MDS_cMDS_axes.txt", "hmp_target_"+target_id+"_"+"QE_MDS_cMDS_axes.txt"] # put NB before QE
data_files = ["hmp_v13lqphylotypeQuantNB_target_"+target_id+"_"+"rs_c.csv", "hmp_v13lqphylotypeQuantE_target_"+target_id+"_"+"rs_c.csv"]

for i in range(len(axes_files)):
    axes_file = axes_files[i]
    data_file = data_files[i]

    print("======== QE/NB normalization + MDS + cMDS ========")
    data_path = os.path.join(data_dir, data_file)
    axes_output_path = os.path.join(output_dir, axes_file)

    df_hmp = np.loadtxt(data_path, delimiter=",")
    hmp_dis_sq=squareform(pdist(df_hmp))

    # Plot cMDS embedding using the pairs of axis from the four most significant axes 
    # enforce_dim = subspace_dims[i] # enforcing the dimension to be consistent with nSimplices QE+nsimplices+cMDS
    feature_num = df_hmp.shape[1]
    print("feature_num is:", feature_num)
    embedding = MDS(n_components=feature_num, max_iter=100000000000, dissimilarity='precomputed') 
    corr_coord = embedding.fit_transform(hmp_dis_sq)
    corr_dis_sq=squareform(pdist(corr_coord))
    _, _, Xe = cMDS(corr_dis_sq)

    np.savetxt(axes_output_path, Xe, fmt='%f')

feature_num is: 425




feature_num is: 425




In [10]:
""" 
Plot pairwise result
"""

figure_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS.png", \
    "hmp_target_"+target_id+"_"+"QE_nSimplices_cMDS.png", \
    "hmp_target_"+target_id+"_"+"NB_MDS_cMDS.png", \
    "hmp_target_"+target_id+"_"+"QE_MDS_cMDS.png"]
axes_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS_axes.txt", \
    "hmp_target_"+target_id+"_"+"QE_nSimplices_cMDS_axes.txt", \
    "hmp_target_"+target_id+"_"+"NB_MDS_cMDS_axes.txt", \
    "hmp_target_"+target_id+"_"+"QE_MDS_cMDS_axes.txt"]
titles = ["NB+nSimplices", "QuantE+nSimplices", "NB+MDS", "QuantE+MDS"]
num_axes = 3 # show pairwise 2D plot to decompose the 3D plot

for i in range(len(figure_files)):
    figure_file = figure_files[i]
    axes_file = axes_files[i]
    title = titles[i]
    outlier_indices = None
    if 'nSimplices' in figure_file:
        outlier_indices = outlier_indices_list[i]

    print("======== plot pairwise 2D plot (subset) ========")
    Xe = np.loadtxt(os.path.join(output_dir, axes_file))
    print(Xe.shape)
    for first_dim in range(num_axes):
        for second_dim in range(first_dim+1, num_axes):
            plt.figure()
            
            target_indices = []
            for color in target_colors:
                cur_indices = [i for i, e in enumerate(colors) if e == color]
                target_indices.append(cur_indices)
            # stool_indices = [i for i, e in enumerate(colors) if e == 'cornflowerblue']
            # ears_indices = [i for i, e in enumerate(colors) if e == 'orange']

            for i in range(len(target_colors)):
                color = target_colors[i]
                site = target_sites[i]
                site_indices = target_indices[i]
                
                if 'nSimplices' not in figure_file:
                    plt.scatter(Xe[site_indices, second_dim], \
                    Xe[site_indices, first_dim], s=5, c=color, label = site)
                else:
                    site_outlier_indices = list(set(site_indices) & set(outlier_indices))
                    site_normal_indices = list(set(site_indices) - set(outlier_indices))
                
                    plt.scatter(Xe[site_normal_indices, second_dim], \
                        Xe[site_normal_indices, first_dim], s=5, c=color, label = site)
                    plt.scatter(Xe[site_outlier_indices, second_dim], \
                        Xe[site_outlier_indices, first_dim], s=5, c=color, label = site+" outlier", marker="x")
                    
            plt.legend()
            plt.title(title, size=10)   
            plt.savefig(os.path.join(output_dir, figure_file[:-4]+"_"+str(first_dim)+"_"+str(second_dim)+".png"))
            

(454, 428)
(454, 440)
(454, 440)
(454, 439)


In [11]:
def color_to_site(color):
    """
    Returns the site for the color

    Both spellings "gray" and "grey" map to "MOUTH": the color maps used for
    plotting in this notebook spell it "grey". Returns None when the color is
    not assigned to any site.
    """
    site_by_color = {
        "deeppink": "THROAT",
        "black": "EARS",
        "cornflowerblue": "STOOL",
        "darkgreen": "NOSE",
        "red": "ELBOWS",
        "gray": "MOUTH",
        "grey": "MOUTH",
        "orange": "VAGINA",
    }
    return site_by_color.get(color)

def site_to_color(site):
    """
    Look up the plotting color assigned to a body site.

    Raises KeyError when the site has no assigned color.
    """
    site_names = ('STOOL', 'VAGINA', 'THROAT', "EARS", "NOSE", "ELBOWS", "MOUTH")
    site_colors = ('cornflowerblue', 'orange', 'deeppink', 'black', "darkgreen", 'red', 'grey')
    palette = dict(zip(site_names, site_colors))
    return palette[site]

In [12]:
""" 
generate 3D plot of the first three axes 
"""

axes_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS_axes.txt", \
    "hmp_target_"+target_id+"_"+"QE_nSimplices_cMDS_axes.txt", \
    "hmp_target_"+target_id+"_"+"NB_MDS_cMDS_axes.txt", \
    "hmp_target_"+target_id+"_"+"QE_MDS_cMDS_axes.txt"]
    

figure_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS.html", \
    "hmp_target_"+target_id+"_"+"QE_nSimplices_cMDS.html",\
    "hmp_target_"+target_id+"_"+"NB_MDS_cMDS.html", \
    "hmp_target_"+target_id+"_"+"QE_MDS_cMDS.html"]


for i in range(len(axes_files)):
    axes_file = axes_files[i]
    figure_file = figure_files[i]
    outlier_indices = outlier_indices_list[i%2]
    normal_indices = list(set(list(range(Xe.shape[0]))) - set(outlier_indices))
    Xe = np.loadtxt(os.path.join(output_dir, axes_file))
    color_discrete_map = {'STOOL': 'cornflowerblue', 'VAGINA': 'orange', 'THROAT': 'deeppink',\
        "EARS": 'black', "NOSE": "darkgreen", "ELBOWS": 'red', "MOUTH": 'grey' }


    Xe_df = pd.DataFrame(Xe[:,:3], columns = ["axis_0", "axis_1", "axis_2"])
    Xe_df['color'] = colors
    Xe_df['label'] = \
        Xe_df.apply(lambda row: color_to_site(row['color']), axis=1)
    outlier_indicator = np.array([0]*len(colors))
    outlier_indicator[outlier_indices] = 1
    Xe_df['outlier'] = outlier_indicator.tolist()

        # specify trace names and symbols in a dict
    symbols = {'1': 'cross',
            '0':'circle'}

    

    fig = px.scatter_3d(Xe_df, x='axis_0', y='axis_1', z='axis_2',
            color='label', color_discrete_map=color_discrete_map, symbol='outlier')

    for i, d in enumerate(fig.data):
        # fig.data[i].marker.symbol = symbols[fig.data[i].name] 
        print(symbols)
        fig.data[i].marker.symbol = symbols[fig.data[i].name.split(',')[1].strip()] 

    fig.update_layout(scene = dict(
                    xaxis_title='axis 0',
                    yaxis_title='axis 1',
                    zaxis_title='axis 2'))
    fig.write_html(os.path.join(output_dir, figure_file))




{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}
{'1': 'cross', '0': 'circle'}


In [13]:
# """
# Combine three 3D dynamic plots together
# https://stackoverflow.com/questions/59868987/plotly-saving-multiple-plots-into-a-single-html/59869358#59869358
# """
# import plotly.graph_objects as go
# import plotly.express as px
# import plotly.offline as offline
# import pandas as pd

# from plotly.subplots import make_subplots


# """ 
# generate 3D plot of the first three axes 
# """

# target_id = 'M'

# axes_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS_axes.txt", \
#     "hmp_target_"+target_id+"_"+"NB_MDS_cMDS_axes.txt"]
    

# figure_files = ["hmp_target_"+target_id+"_"+"NB_nSimplices_cMDS.html", \
#     "hmp_target_"+target_id+"_"+"NB_MDS_cMDS.html"]

# fig = make_subplots(\
#     rows=2, cols=1, shared_xaxes=True, \
#     vertical_spacing=0.02)



# for i in range(len(axes_files)):
#     axes_file = axes_files[i]
#     figure_file = figure_files[i]
#     outlier_indices = outlier_indices_list[i%2]
#     normal_indices = list(set(list(range(Xe.shape[0]))) - set(outlier_indices))
#     Xe = np.loadtxt(os.path.join(output_dir, axes_file))
#     color_discrete_map = {'STOOL': 'cornflowerblue', 'VAGINA': 'orange', 'THROAT': 'deeppink',\
#         "EARS": 'black', "NOSE": "darkgreen", "ELBOWS": 'red', "MOUTH": 'grey' }


#     Xe_df = pd.DataFrame(Xe[:,:3], columns = ["axis_0", "axis_1", "axis_2"])
#     Xe_df['color'] = colors
#     Xe_df['label'] = \
#         Xe_df.apply(lambda row: color_to_site(row['color']), axis=1)
#     outlier_indicator = np.array(['nor']*len(colors))
#     outlier_indicator[outlier_indices] = "out"
#     Xe_df['outlier'] = outlier_indicator.tolist()

#         # specify trace names and symbols in a dict
#     symbols = {'out': 'cross',
#             'nor':'circle'}

    

#     # fig = px.scatter_3d(Xe_df, x='axis_0', y='axis_1', z='axis_2',
#     #         color='label', color_discrete_map=color_discrete_map, symbol='outlier')


#     fig.add_trace(go.Scatter3d(x = Xe_df["axis_0"], y = Xe_df["axis_1"], z = Xe_df["axis_2"], \
#         marker=dict(
#             color = colors
#         )), \
#         row=1, col=1)

#     # for i, d in enumerate(fig.data):
#     #     # fig.data[i].marker.symbol = symbols[fig.data[i].name] 
#     #     print(symbols)
#     #     fig.data[i].marker.symbol = symbols[fig.data[i].name.split(',')[1].strip()] 

#     # fig.update_layout(scene = dict(
#     #                 xaxis_title='axis 0',
#     #                 yaxis_title='axis 1',
#     #                 zaxis_title='axis 2'))
#     fig.write_html(os.path.join(output_dir, figure_file))

# offline.plot(fig, filename='name.html')



