In [1]:
import torch
from CLIMP import CLImputeUtils
import numpy as np
import pandas as pd
# Device handed to the tensor-placement and imputation calls below; fixed to CPU.
device = torch.device('cpu')

In [None]:
########### 1. Use CLImpute to impute ##########

In [2]:
### Step 1. Read the dataset
# `device` is defined once in the imports cell. It is intentionally not
# re-assigned here, so a change made there is not silently reset by this cell.
dataset_name = 'Zeisel'  # label echoed in the dropout log line
drop_rate = 0.4          # passed to impute_dropout in the next step
# load_data returns three values, unpacked here as the expression data plus
# the cell and gene identifiers.
groundTruth_data, cells, genes = CLImputeUtils.load_data('CLIMP/data/Zeisel/Zeisel_top2000.csv')

In [3]:
### Step 2. Simulate dropout events
# Seed the global NumPy and torch RNGs before corrupting the data so that a
# re-run of this cell can reproduce the same dropout pattern.
# NOTE(review): assumes impute_dropout draws from one of these global RNGs;
# confirm against the CLImputeUtils implementation.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

drop_data = CLImputeUtils.impute_dropout(groundTruth_data, drop_rate=drop_rate)
print('dataset: {}, drop rate: {}'.format(dataset_name, drop_rate))

dataset: Zeisel, drop rate: 0.4


In [4]:
## Step 3: obtain the embedding model
# np.copy duplicates drop_data before it is wrapped in a float tensor and
# moved to `device`.
dropout_copy = np.copy(drop_data)
X = torch.FloatTensor(dropout_copy).to(device)

# Step 3.1 (option A): load the provided trained model.
# NOTE(review): the .pkl extension suggests a pickled checkpoint; only load
# files from a trusted source.
pretrained_path = 'CLIMP/data/Zeisel/Zeisel_saved_model.pkl'
model = CLImputeUtils.load_pretained_model(X, load_path=pretrained_path)

# Step 3.1 (option B): train a model instead of loading one.
# model = CLImputeUtils.training(X, hidden_size=128, epoch=100, aug_rate=0.4)

loading pre-train model


In [5]:
## Step 4: select k similar cells, then impute
# No filter_noise argument is passed to LS_imputation here (the verification
# cell further down passes filter_noise=2).
neighbour_count = 20
choose_cell = CLImputeUtils.select_neighbours(model, X, k=neighbour_count)
imputed_data = CLImputeUtils.LS_imputation(drop_data, choose_cell, device)

# To save the imputed matrix to disk:
# pd.DataFrame(imputed_data.T, index=genes, columns=cells).to_csv('saved path')

In [6]:
## Step 5: evaluation
# Each entry pairs a report template with the metric that fills it. For every
# metric the dropout data is scored against the ground truth first, then the
# imputed data, and the line is printed before the next metric is computed.
metric_reports = (
    ('dropout data PCCs: {:.4f}, imputed data PCCs: {:.4f}', CLImputeUtils.pearson_corr),
    ('dropout data L1: {:.4f}, imputed data L1: {:.4f}', CLImputeUtils.l1_distance),
    ('dropout data RMSE: {:.4f}, imputed data RMSE: {:.4f}', CLImputeUtils.RMSE),
)
for template, metric in metric_reports:
    score_dropout = metric(drop_data, groundTruth_data)
    score_imputed = metric(imputed_data, groundTruth_data)
    print(template.format(score_dropout, score_imputed))

dropout data PCCs: 0.7735, imputed data PCCs: 0.9538
dropout data L1: 1.8393, imputed data L1: 1.1245
dropout data RMSE: 17.0827, imputed data RMSE: 8.1165


In [None]:
########### 2. Verify the experimental results for the Zeisel dataset reported in the paper ##########

In [7]:
## 1. Load the ground truth and the simulated 40% dropout data used in our experiment
groundTruth_data, cells, genes = CLImputeUtils.load_data('CLIMP/data/Zeisel/Zeisel_top2000.csv')
drop_data, _, _ = CLImputeUtils.load_data('CLIMP/data/Zeisel/Zeisel_d40.csv')
X = torch.FloatTensor(np.copy(drop_data)).to(device)

# Drop rate = relative decrease in the number of non-zero entries going from
# the ground truth to the dropout data.
nonzero_truth = len(groundTruth_data.nonzero()[0])
nonzero_dropout = len(drop_data.nonzero()[0])
drop_rate = (nonzero_truth - nonzero_dropout) / nonzero_truth
print('drop rate: {:.2f}'.format(drop_rate))

## 2. Load the saved model
saved_model_path = 'CLIMP/data/Zeisel/Zeisel_saved_model.pkl'
model = CLImputeUtils.load_pretained_model(X, load_path=saved_model_path)

## 3. Imputation
# Neighbour selection feeds LS_imputation, which is called here with
# filter_noise=2.
choose_cell = CLImputeUtils.select_neighbours(model, X, k=20)
imputed_data = CLImputeUtils.LS_imputation(drop_data, choose_cell, device, filter_noise=2)

# To save the imputed matrix to disk:
# imputed_saved = pd.DataFrame(imputed_data.T, index=genes, columns=cells)
# imputed_saved.to_csv('CLIMP/data/Zeisel/Zeisel_Imputed.csv')

# Report each metric for the dropout data and for the imputed data, both
# measured against the ground truth. One line is printed per metric, in the
# order PCCs, L1, RMSE.
verification_reports = (
    ('dropout data PCCs: {:.4f}, imputed data PCCs: {:.4f}', CLImputeUtils.pearson_corr),
    ('dropout data L1: {:.4f}, imputed data L1: {:.4f}', CLImputeUtils.l1_distance),
    ('dropout data RMSE: {:.4f}, imputed data RMSE: {:.4f}', CLImputeUtils.RMSE),
)
for report_template, metric_fn in verification_reports:
    dropout_score = metric_fn(drop_data, groundTruth_data)
    imputed_score = metric_fn(imputed_data, groundTruth_data)
    print(report_template.format(dropout_score, imputed_score))

drop rate: 0.40
loading pre-train model
dropout data PCCs: 0.7688, imputed data PCCs: 0.9483
dropout data L1: 1.8372, imputed data L1: 1.0810
dropout data RMSE: 17.2358, imputed data RMSE: 8.5091


In [8]:
# Verify the clustering results of the Zeisel dataset.
# Both CSVs are read with their first column as the index; .values.squeeze()
# then drops length-1 axes from the remaining values.
clusterResults = pd.read_csv('CLIMP/data/Zeisel/Zeisel_d40_Clustering.csv', index_col=0).values.squeeze()
labels = pd.read_csv('CLIMP/data/Zeisel/Zeisel_cell_label.csv', index_col=0).values.squeeze()

# The third score comes from getPurityScore, so it is labelled "Purity"
# (it was previously printed under a second "NMI" label).
print('ARI: {:.3f}, NMI: {:.3f}, Purity: {:.3f}'.
      format(CLImputeUtils.adjusted_rand_score(clusterResults, labels),
             CLImputeUtils.normalized_mutual_info_score(clusterResults, labels),
             CLImputeUtils.getPurityScore(clusterResults, labels)))

ARI: 0.879, NMI: 0.841, NMI: 0.938
