# Uniform Manifold Approximation and Projection (UMAP)

In [None]:
# ! pip install umap-learn

In [None]:
# ## plotting functionality
# ! pip install umap-learn[plot] 

In [None]:
# import sklearn.datasets
import pandas as pd
import numpy as np

# import scipy.sparse as sp
from scipy import sparse

import umap
import umap.plot

# from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
# from pathlib import Path

import glob

In [None]:
# hvppi_preds = pd.read_csv('cleaned_edgelist.csv').drop(columns=['Interaction'])
# hvppi_preds['Segment'] = hvppi_preds['Segment'].fillna('NA')
# hvppi_preds

In [None]:
# ## Filter Scores
# pos_df = hvppi_preds[hvppi_preds['Score'] >= 0.143]
# pos_df = pos_df.sort_values(by=['Score'], ascending=False).reset_index(drop=True)
# # pos_df

# # pos_deg_counts = pos_df['Pro1ID'].value_counts().rename_axis('Pro1ID').reset_index(name='value_counts')
# # pos_deg_counts

# neg_df = hvppi_preds[hvppi_preds['Score'] < 0.143].reset_index(drop=True)
# neg_df['Pro1ID'].value_counts()
# # neg_df

In [None]:
# hadamard_edges = hvppi_preds[['Pro1ID', 'Pro2ID', 'Score']].sort_values(by=['Score'], ascending=False).reset_index(drop=True)
# hadamard_edges.columns = ['Protein1_ID', 'Protein2_ID', 'Score']
# hadamard_edges = hadamard_edges[hadamard_edges['Score']!=0.000]
# # hadamard_edges

# ## Map to index
# h_edges_idx = pd.merge(hadamard_edges, protein_list, on=['Protein1_ID'])
# h_edges_idx = pd.merge(h_edges_idx, protein_list, left_on=['Protein2_ID'], right_on=['Protein1_ID'])
# h_edges_idx = h_edges_idx[['Unnamed: 0_x', 'Unnamed: 0_y', 'Score']]
# h_edges_idx.columns = ['Protein1_ID', 'Protein2_ID', 'Score']
# h_edges_idx

### 2 classes: IAV (Index 0 to 40) & Human (Index 41 to 15684)

In [None]:
# Load the protein table from 'protein_class.csv' and display it.
# NOTE(review): the heading above states that indices 0-40 are IAV proteins and
# 41-15684 are human proteins — confirm against the CSV contents.
protein_list = pd.read_csv('protein_class.csv') 
protein_list

In [None]:
# target = protein_list['class'].values
# target

## Graph Embeddings

### Node Embeddings

In [None]:
# Embedding method / variant; used by the next cell as a sub-directory of 'Embeddings/Graph/'.
emb_name = 'node2vec_plus/SparseOTF'

In [None]:
# Directory that holds the embedding files of the selected method (keeps the trailing slash).
edit_data_path = f'Embeddings/Graph/{emb_name}/'

In [None]:
# Hyper-parameter tag; it is the base name of the embedding file loaded below.
# presumably node2vec's p=2, q=2 setting — TODO confirm
hyper_param = 'p2_q2'

In [None]:
## Alternative file formats (kept for reference):
# emb = pd.read_csv(edit_data_path + hyper_param + '.txt', sep=' ', skiprows=1, header = None)
# emb = pd.read_csv(edit_data_path + hyper_param + '.csv', skiprows=1, header = None)

## node2vec+: space-separated '.emb' file; the first line is skipped and
## the columns get integer names because no header row is read.
emb = pd.read_csv(
    edit_data_path + hyper_param + '.emb',
    sep=' ',
    skiprows=1,
    header=None,
)

## Order the rows by column 0, then promote that column to the index.
emb_f = emb.sort_values(by=[0])
emb_f = emb_f.set_index([0])
emb_f

In [None]:
# Node-feature matrix as a NumPy array, rows ordered by the index of emb_f.
# The edge-embedding cell further down indexes this array by position with the
# Protein1_ID / Protein2_ID values, so it assumes node ids equal row positions
# — TODO confirm the ids are contiguous and start at 0.
g_features = emb_f.sort_index().values
print(g_features.shape)

In [None]:
# Display the node-feature matrix.
g_features

### Edge Embeddings

In [None]:
'''
    Group human proteins into classes - according to HVPPI interaction potential thresholds
    
    (A) Score < 0.143
    (B) 0.143 <= Score < 0.212 (0.90)
    (C) 0.212 <= Score < 0.375 (0.95)
    (D) Score >= 0.375 (0.99)
    
    Pro1ID --- Human
    Pro2ID --- IAV
'''

In [None]:
## Add class label
# edge_emb['Class'] = ''

# edge_emb.loc[edge_emb['Score'] < 0.143, 'Class'] = 'A'
# edge_emb.loc[(edge_emb['Score'] >=0.143) & (edge_emb['Score'] < 0.212), 'Class'] = 'B'
# edge_emb.loc[(edge_emb['Score'] >=0.212) & (edge_emb['Score'] < 0.375), 'Class'] = 'C'
# edge_emb.loc[edge_emb['Score'] >= 0.375, 'Class'] = 'D'
# edge_emb

In [None]:
# Load the edge list; later cells read its 'Protein1_ID', 'Protein2_ID' and 'Class' columns.
edge_emb = pd.read_csv('./hvppi_edgelist_idx.csv')
edge_emb

In [None]:
## Get edge embeddings from node embeddings
# Every edge (u, v) is represented by the Hadamard (element-wise) product of
# the feature rows of its two endpoints. The endpoint ids are used as row
# positions into g_features, so one fancy-indexed multiplication replaces the
# former per-row Python loop.

edges_count = edge_emb.shape[0]

src_rows = edge_emb['Protein1_ID'].to_numpy()
dst_rows = edge_emb['Protein2_ID'].to_numpy()

# All embeddings, shape (edges_count, g_features.shape[1]); cast to float64 to
# match the array the loop-based version filled via np.empty().
X = np.multiply(g_features[src_rows], g_features[dst_rows]).astype(np.float64, copy=False)

print(X)
print(X.shape)

In [None]:
## Target
# One label per edge, taken from the 'Class' column of the edge list.
# presumably the A-D score classes described in the note cell above — verify against the CSV
target = edge_emb['Class'].values
target

In [None]:
## Plot
# Fit UMAP with its default parameters on the edge-embedding matrix X.
mapper = umap.UMAP().fit(X)

In [None]:
# Plot the fitted UMAP embedding, passing the edge classes as point labels.
umap.plot.points(mapper, labels=target)

## Protein Embeddings 

In [None]:
# Protein descriptor to visualise; used by the next cell as the file stem under 'Embeddings/Protein/'.
emb_name = 'DPC'

In [None]:
# Path of the protein-feature file without its extension ('.npz' is appended when loading).
edit_data_path = f'Embeddings/Protein/{emb_name}'

In [None]:
## Load npz
# Read the SciPy sparse matrix stored at edit_data_path + '.npz' and convert it
# to a dense NumPy array.
read_emb = sparse.load_npz(edit_data_path + '.npz')
p_features = read_emb.toarray()
p_features

In [None]:
# Shape of the dense protein-feature matrix.
print(p_features.shape)

In [None]:
# Fit UMAP with default parameters on the protein features and plot the result.
# NOTE(review): `target` was built from the edge list (edge_emb['Class']), so it
# holds one label per edge. The plot needs one label per row of p_features —
# confirm that the rows of the .npz file line up with the edge list.
mapper = umap.UMAP().fit(p_features)
umap.plot.points(mapper, labels=target)

## Subplot

In [None]:
### Directory that holds the saved UMAP plots
# img_dir = './umap_plots/Graph/'
img_dir = './umap_plots/Protein/'

### Collect the paths of all '*.png' entries in that directory into a list
# NOTE(review): glob.glob() does not guarantee any particular order, while the
# subplot cell below assigns titles by position — confirm the file order
# matches plt_titles.
files = glob.glob(img_dir + "*.png")
files
# print(glob.glob(img_dir + "*.png")) 

In [None]:
## Subplot titles; the subplot cell applies the i-th title to the i-th image path in `files`.

## Graph Embedding
# plt_titles = ['deepwalk', 'node2vec', 'node2vec+', 'struc2vec', 'ripple2vec',
#              'LINE', 'SDNE', 'VAE', 'GraRep']

## Protein Embedding
plt_titles = ['AAC', 'DPC', 'C/T/D', 'QSOrder', 'APAAC', 'CT', 'NMBroto']

In [None]:
# Show every saved plot image in one 3x3 grid, one titled panel per image.

# create the figure
fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(15, 15))

# flatten the axis into a 1-d array to make it easier to access each axes
axs = axs.flatten()

# fail early with a clear message instead of an IndexError half-way through
if len(files) > min(len(axs), len(plt_titles)):
    raise ValueError(
        f'{len(files)} images found, but only {len(axs)} axes and '
        f'{len(plt_titles)} titles are available'
    )

# pair each image with the next free axes and its title (matched by position)
for ax, file, title in zip(axs, files, plt_titles):

    # read the image in and add it to the axes
    pic = plt.imread(file)
    ax.imshow(pic)

    # add an axes title
    ax.set(title=title)

    # remove the x and y ticks
    ax.set_xticks([])
    ax.set_yticks([])

# hide every axes that did not receive an image, however many images there are
for ax in axs[len(files):]:
    fig.delaxes(ax)

# # add a figure title
# fig.suptitle('Images from https://www.heroforge.com/', fontsize=18)

In [None]:
# # create a list of directories
# dirs = ['../Pictures/dataset1', '../Pictures/dataset2', '../Pictures/dataset3']

# # extract the image paths into a list
# files = [f for dir_ in dirs for f in list(Path(dir_).glob('*.jpg'))]

# # create the figure
# fig, axs = plt.subplots(nrows=5, ncols=3, figsize=(10, 10))

# # flatten the axis into a 1-d array to make it easier to access each axes
# axs = axs.flatten()

# # iterate through and enumerate the files, use i to index the axes
# for i, file in enumerate(files):
    
#     # read the image in
#     pic = plt.imread(file)

#     # add the image to the axes
#     axs[i].imshow(pic)

#     # add an axes title; .stem is a pathlib method to get the filename
#     axs[i].set(title=file.stem)

# # # add a figure title
# # fig.suptitle('Images from https://www.heroforge.com/', fontsize=18)

# Reference code

In [None]:
# ## Plot
# plt.rcParams["figure.figsize"] = (12, 8)

# fig, axs = plt.subplots(2, 4)
# plt.setp(axs, xticks=[1, 3, 5, 7, 9], xticklabels=['2', '4', '6', '8', '10']) ## xticks starts from 0 (i.e. xticklabels 1)

# locs, labels = plt.xticks()  # Get the current locations and labels.
# print(locs, labels)

# axs[0, 0].plot(acc)
# axs[0, 0].set(ylabel='accuracy')

# axs[0, 1].plot(sen, 'tab:orange')
# axs[0, 1].set(ylabel='sensitivity(recall)')

# axs[0, 2].plot(spec, 'tab:green')
# axs[0, 2].set(ylabel='specificity')

# axs[0, 3].plot(pre, 'tab:red')
# axs[0, 3].set(ylabel='precision')

# axs[1, 0].plot(f1, 'tab:purple')
# axs[1, 0].set(ylabel='F1-Score')

# axs[1, 1].plot(roc, 'tab:brown')
# axs[1, 1].set(ylabel='ROC-AUC')

# axs[1, 2].plot(pr, 'tab:pink')
# axs[1, 2].set(ylabel='PR-AUC')

# ## To-do: axs[1,3] how to hide??
# fig.delaxes(axs[1,3])

# fig.tight_layout()

# plt.savefig('./eval_metrics_plts/' + mtd + '.png')
# print('---Plot Saved---')

In [None]:
# len(pendigits.feature_names) ## dimensions

In [None]:
# pendigits.data.shape

In [None]:
# len(pendigits.target)

In [None]:
# pendigits = sklearn.datasets.load_digits()
# pendigits

In [None]:
# mapper = umap.UMAP().fit(pendigits.data)

In [None]:
# umap.plot.points(mapper)

In [None]:
# umap.plot.points(mapper, labels=pendigits.target)