In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Original label encoding (cell type -> integer code):
#{'B_cells':0, 'CD4_T_helper':1, 'cd14_monocytes':2, 'cd56_NK':3, 'memory_T':4, 'naive_T':5, 'naive_cytotoxic_T':6, 
#'regulatory_T':7}

In [4]:
def split_and_save_data_single_layer(layer, norm_features, encoded_labels, relation_dict,
                                     output_dir='/home/tdeng/PBMC/tree-cnn/train/'):
    """Relabel the samples belonging to one tree layer and save them as .npy files.

    For every ``new_code -> original codes`` entry of ``relation_dict``, the rows of
    ``norm_features`` whose entry in ``encoded_labels`` is one of the original codes
    are selected and given the label ``new_code``. Samples whose label is listed in
    no entry are left out of this layer's data.

    The result is written to ``<output_dir>/layer<layer>/features.npy`` and
    ``<output_dir>/layer<layer>/labels.npy``; labels are stored as a float column
    of shape ``(n_selected, 1)``.

    Args:
        layer: Layer identifier used to build the ``layer<layer>`` sub-directory name.
        norm_features: Array of samples, indexed along its first axis.
        encoded_labels: Original integer labels; must produce a boolean mask that can
            index ``norm_features``.
        relation_dict: Mapping ``new_code -> list of original codes``.
        output_dir: Existing directory under which the ``layer<layer>`` folder lives.
            Defaults to the path that was previously hard-coded.

    Returns:
        None.

    Raises:
        ValueError: If ``relation_dict`` is empty (there would be nothing to save).

    Side effects:
        Changes the process working directory to ``output_dir`` (kept from the
        original implementation so that code relying on it keeps working).
    """
    if not relation_dict:
        raise ValueError('relation_dict must contain at least one class mapping')

    # NOTE: this mutates global process state; callers that use relative paths
    # afterwards will resolve them against output_dir.
    os.chdir(output_dir)

    # np.save does not create missing directories, so make sure the layer folder exists.
    layer_dir = 'layer' + str(layer)
    os.makedirs(layer_dir, exist_ok=True)

    feature_list = []
    label_list = []
    for new_code, origin_code in relation_dict.items():
        # Boolean mask of the samples whose original label belongs to this new class.
        features = norm_features[np.isin(encoded_labels, origin_code)]
        feature_list.append(features)
        # One float label per selected sample, stored as a column vector.
        label_list.append(np.full((features.shape[0], 1), new_code, dtype=np.float64))

    np.save(os.path.join(layer_dir, 'features.npy'), np.concatenate(feature_list))
    np.save(os.path.join(layer_dir, 'labels.npy'), np.concatenate(label_list))
    print('Layer{}:Data has been saved！'.format(layer))
    return None

def split_and_save_data_all_layers():
    """Split the data into train/test sets and save the per-layer training data.

    Working from '/home/tdeng/PBMC/', this loads 'norm_features.npy' and
    'encoded_labels.npy', performs a shuffled 70/30 train/test split with a fixed
    random state, saves the test part under 'tree-cnn/test/', and then passes the
    training part to split_and_save_data_single_layer once per tree layer (1-4).

    Each layer is described by a dict mapping ``new code -> original codes``.
    Original codes: B_cells 0, CD4_T_helper 1, cd14_monocytes 2, cd56_NK 3,
    memory_T 4, naive_T 5, naive_cytotoxic_T 6, regulatory_T 7.

    Returns:
        None.
    """
    os.chdir('/home/tdeng/PBMC/')
    all_features = np.load('norm_features.npy')
    all_labels = np.load('encoded_labels.npy')
    print('Data has been read！')

    train_x, test_x, train_y, test_y = train_test_split(
        all_features,
        all_labels,
        test_size=0.3,
        random_state=2020,
        shuffle=True,
    )
    # The held-out part is stored once; only the training part is split by layer.
    for file_name, held_out in (('X_test.npy', test_x), ('y_test.npy', test_y)):
        np.save('tree-cnn/test/' + file_name, held_out)

    layer_relations = [
        # Layer 1: Hematopoietic Stem ->
        #   0: Lymphoid Progenitor (all other cell types)
        #   1: Myeloid Progenitor (cd14_monocytes)
        {0: [0, 1, 3, 4, 5, 6, 7], 1: [2]},
        # Layer 2: Lymphoid Progenitor ->
        #   0: B_cells
        #   1: T cell progenitor (naive_cytotoxic_T, naive_T, regulatory_T,
        #      CD4_T_helper, memory_T)
        #   2: cd56_NK
        {0: [0], 1: [1, 4, 5, 6, 7], 2: [3]},
        # Layer 3: T cell progenitor ->
        #   0: naive_cytotoxic_T
        #   1: naive_T group (naive_T, regulatory_T, CD4_T_helper, memory_T)
        {0: [6], 1: [1, 4, 5, 7]},
        # Layer 4: naive_T group ->
        #   0: regulatory_T
        #   1: CD4_T_helper
        #   2: memory_T
        # NOTE(review): original code 5 (naive_T) has no class here, so those
        # samples are not part of the layer-4 data; confirm this is intended.
        {0: [7], 1: [1], 2: [4]},
    ]
    for layer_index, relation in enumerate(layer_relations, start=1):
        split_and_save_data_single_layer(layer_index, train_x, train_y, relation)

In [5]:
# Run the full pipeline: load the arrays from /home/tdeng/PBMC/, save the test
# split, and write the relabelled training data for layers 1-4.
split_and_save_data_all_layers()

Data has been read！
Layer1:Data has been saved！
Layer2:Data has been saved！
Layer3:Data has been saved！
Layer4:Data has been saved！
