# Testing the MOGONET paper

The link to the paper is [here](https://www.nature.com/articles/s41467-021-23774-w).

The link to the code repo is [here](https://www.nature.com/articles/s41467-021-23774-w)

Authors: Tongxin Wang, Wei Shao, Zhi Huang, Haixu Tang, Jie Zhang, Zhengming Ding, Kun Huang.

## Table of Content
- [DATA](#data)
- [Main Biomarker](#main-biomarker)
- [Main Mogonet](#main-mogonet)
- [Models](#models)
- [Train_Test](#train-test)
- [Feat Importance](#feat-importance)

### Data

To demonstrate effectiveness of MOGONET, authors applied proposed method on **four different classification tasks** using **four different datasets** (CHECK):

Three types of omics data for each dataset:
 - mRNA expression data (mRNA)
 - DNA methylation data (meth)
 - miRNA expression data (miRNA)

Datasets:
 1) BReast invasive CArnicoma (**BRCA**)
     1) mRNA, 1000 features
     2) meth, 1000 features
     3) miRNA, 611 observations $\times$ 502 features (**NOT 503** as shown in [paper](https://www.nature.com/articles/s41467-021-23774-w/tables/1))
 2) Religious Orders Study/Memory and Aging Project(**ROSMAP**)
 3) Low Grade Glicoma (LGG) --- Missing
 4) KIPAN --- Missing


    

In [4]:
# Load libraries 
import os
import numpy as np
import torch
from train_test import train_test, prepare_trte_data

#### BRCA

In [5]:
# Loading the data for BRCA
BRCA_FOLDER = "BRCA/"
BRCA_view = [1, 2, 3]

# 1st step is to get prepare up the training data list
# all data list of tensors, their index dictionary, and
# their corresponding class label.
brca_train_list, brca_all_list, brca_idx_dict, brca_labels = prepare_trte_data(data_folder=BRCA_FOLDER,
                                                                        view_list=BRCA_view)

#### ROSMAP

In [40]:
# Loading the data for ROSMAP
ROSMAP_FOLDER = "ROSMAP/"
ROSMAP_view = [1, 2, 3]
rosmap_train_list, rosmap_all_list, rosmap_idx_dict, ros_labels = prepare_trte_data(data_folder=BRCA_FOLDER,
                                                                        view_list=BRCA_view)


#### Sizes

In [41]:
# Helper function to print size of dataset (num of tensors)
# size of each tensor
def print_size(train_list, all_list, dataset = "BRCA"):
    if dataset == "BRCA":
        tr_name = dataset + " Train List"
        all_name = dataset + " All List"

    elif dataset == "ROSMAP":
        tr_name = dataset + " Train List"
        all_name = dataset + " All List"

    else:
        return("Wrong dataset input")
    print("#" * 50)
    print(f"The dataset is: {dataset}")
    # Prints size of tensor in the train data list
    print(f"Number of tensors in {tr_name}: {len(train_list)}")
    for i in train_list:
        print(f"The size of each tensor is: {i.shape}")
    print("#" * 50)

    # Prints size of tensor in the all data list
    print(f"Number of tensors in {all_name}: {len(all_list)}")
    for j in all_list:
        print(f"The size of each tensor is: {j.shape}")


print_size(brca_train_list, brca_all_list)
print_size(rosmap_train_list, rosmap_all_list, dataset = "ROSMAP")


##################################################
The dataset is: BRCA
Number of tensors in BRCA Train List: 3
The size of each tensor is: torch.Size([612, 1000])
The size of each tensor is: torch.Size([612, 1000])
The size of each tensor is: torch.Size([612, 503])
##################################################
Number of tensors in BRCA All List: 3
The size of each tensor is: torch.Size([875, 1000])
The size of each tensor is: torch.Size([875, 1000])
The size of each tensor is: torch.Size([875, 503])
##################################################
The dataset is: ROSMAP
Number of tensors in ROSMAP Train List: 3
The size of each tensor is: torch.Size([612, 1000])
The size of each tensor is: torch.Size([612, 1000])
The size of each tensor is: torch.Size([612, 503])
##################################################
Number of tensors in ROSMAP All List: 3
The size of each tensor is: torch.Size([875, 1000])
The size of each tensor is: torch.Size([875, 1000])
The size of each tensor

### Main Biomarker

### Main Mogonet 

In [57]:
def main_runner(data_folder, view_list=None, lr_e_pretrain = 1e-3, lr_e = 5e-4, lr_c = 1e-3, num_epoch_pretrain = 500, num_epoch = 500):
    """
    Main runner of the MOGONET algorithm, takes several hyperparameters, and does train-test split of the data_folder,
    where transforms the data to epochs for GNN. <---- EDIT here
    
    Args:
    """
    
    if data_folder == 'ROSMAP':
        num_class = 2
    elif data_folder == 'BRCA':
        num_class = 5
    else:
        return("Wrong dataset input")
    
    train_test(data_folder, view_list, num_class,
               lr_e_pretrain, lr_e, lr_c, 
               num_epoch_pretrain, num_epoch)             

In [58]:
main_runner("BRCA", [1, 2, 3])

AttributeError: module 'numpy' has no attribute 'asscalar'

### Models

### Train Test

In [6]:
# Loading the data for BRCA
BRCA_FOLDER = "BRCA/"
BRCA_view = [1, 2, 3]

# 1st step is to get prepare up the training data list
# all data list of tensors, their index dictionary, and
# their corresponding class label.
brca_train_list, brca_all_list, brca_idx_dict, brca_labels = prepare_trte_data(data_folder=BRCA_FOLDER,
                                                                        view_list=BRCA_view)

In [19]:
data = brca_train_list[1]
data

tensor([[0.7615, 0.8605, 0.6642,  ..., 0.5645, 0.5360, 0.5845],
        [0.8281, 0.8584, 0.6465,  ..., 0.6303, 0.5936, 0.2596],
        [0.8959, 0.8155, 0.5906,  ..., 0.8908, 0.6695, 0.1482],
        ...,
        [0.6335, 0.5735, 0.3636,  ..., 0.4891, 0.2417, 0.1640],
        [0.8527, 0.8258, 0.5873,  ..., 0.7349, 0.4855, 0.4430],
        [0.8407, 0.8062, 0.6367,  ..., 0.5428, 0.3078, 0.2428]],
       device='cuda:0')

In [20]:
dist = cosine_distance_torch(x1 = data, x2 = data)

In [24]:
data

tensor([[0.7615, 0.8605, 0.6642,  ..., 0.5645, 0.5360, 0.5845],
        [0.8281, 0.8584, 0.6465,  ..., 0.6303, 0.5936, 0.2596],
        [0.8959, 0.8155, 0.5906,  ..., 0.8908, 0.6695, 0.1482],
        ...,
        [0.6335, 0.5735, 0.3636,  ..., 0.4891, 0.2417, 0.1640],
        [0.8527, 0.8258, 0.5873,  ..., 0.7349, 0.4855, 0.4430],
        [0.8407, 0.8062, 0.6367,  ..., 0.5428, 0.3078, 0.2428]],
       device='cuda:0')

In [32]:
data.norm(p="fro")

tensor(440.6357, device='cuda:0')

In [11]:
def cosine_distance_torch(x1, x2=None, eps=1e-8):
    x2 = x1 if x2 is None else x2
    # calculates norm for x1 in frobenius form of dimension 1
    w1 = x1.norm(p=2, dim=1, keepdim=True)
    # calculates norm for x2 in frobenius form of dimension 1
    w2 = w1 if x2 is x1 else x2.norm(p=2, dim=1, keepdim=True)
    return 1 - torch.mm(x1, x2.t()) / (w1 * w2.t()).clamp(min=eps)


In [10]:
# Helper function that generates the adjacent matrix of data
# and store it as tensor, using cosine metric
def gen_adj_mat_tensor(data, parameter, metric="cosine"):
    """
    FILL IN
    
    Parameters:
    ----------
        data: FILL
        parameter: FILL
        metric: "cosine", optional
    """
    assert metric == "cosine", "Only cosine distance implemented"
    # Calculates
    dist = cosine_distance_torch(data, data)
    g = graph_from_dist_tensor(dist, parameter, self_dist=True)
    if metric == "cosine":
        adj = 1-dist
    else:
        raise NotImplementedError
    adj = adj*g 
    adj_T = adj.transpose(0,1)
    I = torch.eye(adj.shape[0])
    if cuda:
        I = I.cuda()
    adj = adj + adj_T*(adj_T > adj).float() - adj*(adj_T > adj).float()
    adj = F.normalize(adj + I, p=1)
    adj = to_sparse(adj)
    
    return adj

In [9]:
# Helper function to generate adjacent matrix of the training
# and testing data
def gen_trte_adj_mat(data_tr_list, data_trte_list, trte_idx, adj_parameter):
    """
    This function calculates the adjacent matrix as tensor for the training data
    and the testing data correspondently to some adj_parameter

    The default metric is 'cosine', as its the only implemented algorithm.

    Parameters:
    ----------
        data_tr_list: list of tensor (training) generated from prepare_trte_data
        data_trte_list: list of tensor (train + test) generated from 
                        prepare_trte_data
        trte_idx: index dictionary of train+test data
        adj_parameter: FILL_UP HERE
    
    Returns:
    -------
        adj_train_list: list of adjacent tensors to training data
        adj_test_list: list of adjacent tensors to the train+test data? 
    """
    adj_metric = "cosine" # cosine distance
    adj_train_list = []
    adj_test_list = []
    for i in range(len(data_tr_list)):
        adj_parameter_adaptive = cal_adj_mat_parameter(adj_parameter, data_tr_list[i], adj_metric)
        adj_train_list.append(gen_adj_mat_tensor(data_tr_list[i], adj_parameter_adaptive, adj_metric))
        adj_test_list.append(gen_test_adj_mat_tensor(data_trte_list[i], trte_idx, adj_parameter_adaptive, adj_metric))
    
    return adj_train_list, adj_test_list

### Feat Importance

## Others

### prepare_trte_data

In [1]:
# Helper function to prepare the data
cuda = True if torch.cuda.is_available() else False


def prepare_trte_data(data_folder, view_list):
    """
    Gets all the *tr.csv and *te.csv in the data_folder, and transforms these to list of tensors, then
    storing it on several returned objects

    Args: 
        data_folder: path to read the data
        view_list: list of files to be viewed [1,2,3] here
    Returns:
        data_train_list: list of tensors of the train data
        data_all_list: list of tensors of combined train and test data
        idx_dict: dict that corresponds to the label (id) of both train,
                  and test data
        labels:  numpy array that stores the actual class of each observation
    """
    num_view = len(view_list)
    # Get the labels and transform it to integer to map it
    labels_tr = np.loadtxt(os.path.join(data_folder, "labels_tr.csv"), delimiter=',')
    labels_te = np.loadtxt(os.path.join(data_folder, "labels_te.csv"), delimiter=',')
    labels_tr = labels_tr.astype(int)
    labels_te = labels_te.astype(int)
    
    # Initialize list to store results
    data_tr_list = []
    data_te_list = []

    # Reads the data in the csv files with _tr / _te
    # And append it correspondently to its list
    for i in view_list:
        data_tr_list.append(np.loadtxt(os.path.join(data_folder, str(i)+"_tr.csv"), delimiter=','))
        data_te_list.append(np.loadtxt(os.path.join(data_folder, str(i)+"_te.csv"), delimiter=','))
    num_tr = data_tr_list[0].shape[0]
    num_te = data_te_list[0].shape[0]
    data_mat_list = []
    for i in range(num_view):
        data_mat_list.append(np.concatenate((data_tr_list[i], data_te_list[i]), axis=0))
    data_tensor_list = []

    for i in range(len(data_mat_list)):
        data_tensor_list.append(torch.FloatTensor(data_mat_list[i]))
        if cuda:
            data_tensor_list[i] = data_tensor_list[i].cuda()
    idx_dict = {}
    idx_dict["tr"] = list(range(num_tr))
    idx_dict["te"] = list(range(num_tr, (num_tr+num_te)))
    data_train_list = []
    data_all_list = []
    for i in range(len(data_tensor_list)):
        data_train_list.append(data_tensor_list[i][idx_dict["tr"]].clone())
        data_all_list.append(torch.cat((data_tensor_list[i][idx_dict["tr"]].clone(),
                                       data_tensor_list[i][idx_dict["te"]].clone()),0))
    labels = np.concatenate((labels_tr, labels_te))
    
    return data_train_list, data_all_list, idx_dict, labels

NameError: name 'torch' is not defined