In [1]:
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

import torch
torch.use_deterministic_algorithms(True)
import numpy as np
import pandas as pd
import random




def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)

set_seed(42)

In [2]:
# from trainer import get_net_trainer
# from getNETtest import NetTester
# from getSMOTE import SmoteGenerator
# from getPrioritizeGene import getPrioritizeGene
# from Cart2Pixel import Cart2Pixel
# from ConvPixel import ConvPixel
# from sklearn.preprocessing import LabelBinarizer
# from sklearn.preprocessing import LabelEncoder
from model.Vec2Image_py.get_matrix import get_matrix
from model.Vec2Image_py.trainer import get_net_trainer
from model.Vec2Image_py.getNETtest import NetTester
from model.Vec2Image_py.getSMOTE import SmoteGenerator
from model.Vec2Image_py.getPrioritizeGene import getPrioritizeGene
from model.Vec2Image_py.Cart2Pixel import Cart2Pixel
from model.Vec2Image_py.ConvPixel import ConvPixel
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder



In [3]:
print("\nLoading full gene expression dataset...")
df = pd.read_csv("data/deng-reads-RawCount-modefied.csv")
X_full = df.iloc[:, 1:].to_numpy(dtype=np.float32)
labels_full = [col.split('.')[0] for col in df.columns[1:]]
label_map = {label: idx for idx, label in enumerate(set(labels_full))}
y_full = np.array([label_map[l] for l in labels_full])



Loading full gene expression dataset...


In [4]:
df.shape

(21297, 269)

In [5]:

dset = {
    'Xtrain': X_full,
    'train_labels': y_full.astype(int),
}

Parm = {
    'Method': 'tSNE',
    'Max_Px_Size': 30,
    'MPS_Fix': 1,
    'ValidRatio': 0.2,
    'Seed': 42,
    'NORM': 1
}


In [6]:

print("\nRunning get_matrix...")
Out = get_matrix(dset, Parm)

print("\nApplying SMOTE...")
smote = SmoteGenerator(dset['Xtrain'].T, dset['train_labels'], seed=42)
X_aug, y_aug = smote.fit_resample()
X_aug = X_aug.T

print("\nTesting ConvPixel...")
sample_vec = dset['Xtrain'][:, 0]
image = ConvPixel(sample_vec, dset['xp'], dset['yp'], dset['A'], dset['B'], dset['Base'])
print("Image shape:", image.shape)

# lb = LabelBinarizer()
# y_train_oh = lb.fit_transform(y_aug)
# y_val_oh = lb.transform(dset['Validation_labels'])
# dset['label_encoder'] = lb

le = LabelEncoder()
y_aug_int = le.fit_transform(y_aug)
y_val_int = le.transform(dset['Validation_labels'])

# One-hot 编码
y_train_oh = np.eye(len(le.classes_))[y_aug_int]
y_val_oh = np.eye(len(le.classes_))[y_val_int]
dset['label_encoder'] = le



Running get_matrix...

NORM-1
Applying PCA dimensionality reduction...


  dset['Xtrain'] = (dset['Xtrain'] - Out['Min']) / (Out['Max'] - Out['Min'])
  dset['XValidation'] = (dset['XValidation'] - Out['Min']) / (Out['Max'] - Out['Min'])
  dset['XValidation'] = (dset['XValidation'] - Out['Min']) / (Out['Max'] - Out['Min'])


tSNE with exact algorithm is used

 Pixels: 31 x 31

Applying SMOTE...

Testing ConvPixel...
Image shape: (31, 31)


In [9]:
y_train_oh.shape, y_val_oh.shape

((673, 6), (51, 6))

In [6]:
print("\nConverting augmented data to images...")
X_train_imgs = np.zeros((X_aug.shape[1], 1, dset['A'], dset['B']), dtype=np.float32)
for i in range(X_aug.shape[1]):
    fvec = X_aug[:, i][Out['feature_order']] 
    X_train_imgs[i, 0, :, :] = ConvPixel(fvec, dset['xp'], dset['yp'], dset['A'], dset['B'], dset['Base'], 0)

X_val_imgs = dset['XValidation'].transpose(3, 2, 0, 1)



Converting augmented data to images...


In [7]:

print("\nTraining model...")
model, train_loader, val_loader, criterion, optimizer, device = get_net_trainer(
    X_train_imgs, y_train_oh,
    X_val_imgs, y_val_oh
)



Training model...
Epoch 01: Loss=34.3109 | Train Acc=35.66% | Val Acc=25.49%
Epoch 02: Loss=26.8006 | Train Acc=51.71% | Val Acc=35.29%
Epoch 03: Loss=20.1979 | Train Acc=66.42% | Val Acc=39.22%
Epoch 04: Loss=15.5361 | Train Acc=84.70% | Val Acc=39.22%
Epoch 05: Loss=11.5180 | Train Acc=90.94% | Val Acc=74.51%
Epoch 06: Loss=9.6048 | Train Acc=88.41% | Val Acc=90.20%
Epoch 07: Loss=7.1850 | Train Acc=95.54% | Val Acc=94.12%
Epoch 08: Loss=5.1552 | Train Acc=96.14% | Val Acc=90.20%
Epoch 09: Loss=5.5258 | Train Acc=95.25% | Val Acc=96.08%
Epoch 10: Loss=4.4546 | Train Acc=98.37% | Val Acc=96.08%
Epoch 11: Loss=3.9138 | Train Acc=98.51% | Val Acc=96.08%
Epoch 12: Loss=3.9541 | Train Acc=98.66% | Val Acc=96.08%
Epoch 13: Loss=2.9025 | Train Acc=99.11% | Val Acc=94.12%
Epoch 14: Loss=2.9146 | Train Acc=98.37% | Val Acc=98.04%
Epoch 15: Loss=3.3790 | Train Acc=98.81% | Val Acc=96.08%
Epoch 16: Loss=2.7204 | Train Acc=99.41% | Val Acc=98.04%
Epoch 17: Loss=0.8808 | Train Acc=99.85% | Val A

In [8]:
Out.update({
    'model': {'net': model}
})

# n_val = dset['XValidation'].shape[3]
# dset['Xtest'] = Out['ValidationRawdata'][:, :n_val]
# dset['test_labels'] = dset['Validation_labels'][:n_val]  



In [9]:
X_val_raw = Out['ValidationRawdata']  
X_val_raw = np.nan_to_num(X_val_raw) 

X_val_raw_T = X_val_raw.T  
X_val_pca = Out['pca'].transform(X_val_raw_T)  
dset['Xtest'] = X_val_pca.T 

n_val = dset['Xtest'].shape[1]
dset['test_labels'] = Out['ValidationLabelsOrdered'][:n_val]

tester = NetTester(dset, Out, device='cpu')
acc, XTest_tensor, Y_pred = tester.run_test()
print(f"\nTest accuracy: {acc:.2%}")


print("XP hash:", hash(tuple(dset['xp'])))
print("YP hash:", hash(tuple(dset['yp'])))
print("Test Out XP hash:", hash(tuple(Out['xp'])))

#gene_rank = getPrioritizeGene(dset, Out, k=5)
#print("Gene ranking shape:", gene_rank.shape)
print("First few test labels:", dset['test_labels'][:5])
print("Xtest shape:", dset['Xtest'].shape)



Using Norm-1 ...

Test accuracy: 96.08%
XP hash: 8795343342413000627
YP hash: 7626115951859486666
Test Out XP hash: 8795343342413000627
First few test labels: [0 0 0 0 0]
Xtest shape: (100, 51)


In [65]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# 假设ConvPixel和模型分类函数已提前定义好
def get_prioritize_gene(dest, Out, k, device='cpu'):
    if Out['Norm'] == 1:
        print('\nNORM-1\n')
        Out['Max'] = np.max(dest['Xtest'], axis=1, keepdims=True)
        Out['Min'] = np.min(dest['Xtest'], axis=1, keepdims=True)
        dest['Xtest'] = (dest['Xtest'] - Out['Min']) / (Out['Max'] - Out['Min'])
        dest['Xtest'][np.isnan(dest['Xtest'])] = 0

    elif Out['Norm'] == 2:
        print('\nNORM-2\n')
        Out['Min'] = np.min(dest['Xtest'], axis=1, keepdims=True)
        dest['Xtest'] = np.log(dest['Xtest'] + np.abs(Out['Min']) + 1)
        Out['Max'] = np.max(dest['Xtest'])
        dest['Xtest'] = dest['Xtest'] / Out['Max']

    countgene = min(dest['Xtest'].shape[0], len(Out['xp']), len(Out['yp']))
    error = np.zeros(countgene)

    for i in range(countgene):
        shuffledata = np.copy(dest['Xtest'])
        neigh = NearestNeighbors(n_neighbors=k, p=5, metric='minkowski')
        neigh.fit(np.column_stack((Out['xp'], Out['yp'])))
        mIdx = neigh.kneighbors([[Out['xp'][i], Out['yp'][i]]], return_distance=False)[0]
        mIdx = mIdx[mIdx < shuffledata.shape[0]]  # 额外保险措施
        shuffledata[mIdx, :] = 1

        num_test_labels = len(dest['test_labels'])
        sample_pixel = ConvPixel(shuffledata[:, 0], Out['xp'], Out['yp'], Out['A'], Out['B'], Out['Base'], 0)
        height, width = sample_pixel.shape
        M = np.zeros((height, width, 1, num_test_labels))

        M[:, :, 0, 0] = sample_pixel

        for j in range(1, num_test_labels):
            M[:, :, 0, j] = ConvPixel(shuffledata[:, j], Out['xp'], Out['yp'], Out['A'], Out['B'], Out['Base'], 0)

        print(M.shape)
        X_test_tensor = torch.from_numpy(M).permute(3, 2, 0, 1).float().to(device)

        Y_test_tensor = torch.from_numpy(dest['test_labels']).long().to(device)

        Y_pred = Out['model']['net'](X_test_tensor)
        Y_pred = torch.argmax(Y_pred, dim=1).cpu().numpy()

        valError = np.mean(Y_pred == dest['test_labels'])


        error[i] = valError
        print(f'the running gene number is {i}')

    GeneRank = error
    return GeneRank


In [66]:
GeneRank = get_prioritize_gene(dset, Out, k=5)


NORM-1

(31, 31, 1, 51)
the running gene number is 0
(31, 31, 1, 51)
the running gene number is 1
(31, 31, 1, 51)
the running gene number is 2
(31, 31, 1, 51)
the running gene number is 3
(31, 31, 1, 51)
the running gene number is 4
(31, 31, 1, 51)
the running gene number is 5
(31, 31, 1, 51)
the running gene number is 6
(31, 31, 1, 51)
the running gene number is 7
(31, 31, 1, 51)
the running gene number is 8
(31, 31, 1, 51)
the running gene number is 9
(31, 31, 1, 51)
the running gene number is 10
(31, 31, 1, 51)
the running gene number is 11
(31, 31, 1, 51)
the running gene number is 12
(31, 31, 1, 51)
the running gene number is 13
(31, 31, 1, 51)
the running gene number is 14
(31, 31, 1, 51)
the running gene number is 15
(31, 31, 1, 51)
the running gene number is 16
(31, 31, 1, 51)
the running gene number is 17
(31, 31, 1, 51)
the running gene number is 18
(31, 31, 1, 51)
the running gene number is 19
(31, 31, 1, 51)
the running gene number is 20
(31, 31, 1, 51)
the running gene nu

In [67]:
GeneRank

array([0.19607843, 0.19607843, 0.21568627, 0.21568627, 0.19607843,
       0.21568627, 0.19607843, 0.19607843, 0.21568627, 0.21568627,
       0.21568627, 0.21568627, 0.21568627, 0.21568627, 0.19607843,
       0.21568627, 0.21568627, 0.21568627, 0.19607843, 0.21568627,
       0.21568627, 0.19607843, 0.21568627, 0.21568627, 0.21568627,
       0.21568627, 0.21568627, 0.21568627, 0.21568627, 0.21568627,
       0.21568627, 0.19607843, 0.21568627, 0.21568627, 0.21568627,
       0.21568627, 0.21568627, 0.21568627, 0.21568627, 0.19607843,
       0.19607843, 0.19607843, 0.19607843, 0.19607843, 0.19607843,
       0.19607843, 0.19607843, 0.19607843, 0.19607843, 0.19607843,
       0.19607843, 0.19607843, 0.21568627, 0.21568627, 0.21568627,
       0.23529412, 0.21568627, 0.21568627, 0.21568627, 0.21568627,
       0.21568627, 0.19607843, 0.21568627, 0.21568627, 0.21568627,
       0.21568627, 0.21568627, 0.21568627, 0.21568627, 0.21568627,
       0.21568627, 0.21568627, 0.21568627, 0.21568627, 0.21568

In [69]:
# 挑选出前5个基因
top_k_indices = np.argsort(GeneRank)[:5]

print("Top 5 genes:", top_k_indices)


Top 5 genes: [0 1 6 4 7]
