In [1]:
#安装相关依赖库 如果是windows系统，cmd命令框中输入pip安装，参考上述环境配置
#!pip install sklearn
#!pip install pandas
#---------------------------------------------------
#导入库
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

import os
from tqdm import tqdm
import wandb
import math
import pickle

from torch.utils.data import TensorDataset, DataLoader, Dataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re

from IPython.display import Audio, display
def allDone():
    display(Audio(url='https://www.mediacollege.com/downloads/sound-effects/beep/beep-10.wav', autoplay=True))


## 建立数据集

In [2]:
protein_seq = pd.read_csv('../data/mode1_for_embed.csv')

# add space between each amino aicds
protein_seq['wt_seq'] = protein_seq['wt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))
protein_seq['mt_seq'] = protein_seq['mt_seq'].apply(lambda x: ' '.join(x)).apply(
        lambda x: re.sub(r"[UZOB]", "X", x))

protein_seq['label'].astype(str)
label_names = set(protein_seq['label'])

In [3]:
protein_seq['label'].value_counts()

0    20655
1     7095
Name: label, dtype: int64

In [4]:
def data_for_downstream(embed_path):
    path = embed_path + '/'
    concat = []
    for pkl in os.listdir(path):
        if(".pkl" in pkl):
            file_path = path + pkl
            with open(file_path, 'rb') as file:
                y = pickle.load(file)
                concat += y
    data_y = []
    data_X = []
    for i in range(len(concat)):
        data_X.append(concat[i]['x'][0])
        data_y.append(int(concat[i]['label']))
    data_X = np.array(data_X)
    return data_X, data_y

In [5]:
# def train_test_dataset(situation, gene_constrain, mode):
#     if 'gene_balance' in situation:
#         mode_train = np.load(f'../data/{gene_constrain}/gene_balance/For_ML/{mode}_train.npy', allow_pickle=True)
#         mode_test = np.load(f'../data/{gene_constrain}/gene_balance/For_ML/{mode}_test.npy', allow_pickle=True)
        
#         X_train = mode_train[:,:-1]
#         y_train = mode_train[:,-1].astype(int).tolist()
#         X_test = mode_test[:,:-1]
#         y_test = mode_test[:,-1].astype(int).tolist()
        
#         return X_train, y_train, X_test, y_test
    
#     elif 'gene_not_balance' in situation:
#         mode_train = np.load(f'../data/{gene_constrain}/gene_not_balance/For_ML/{mode}_train.npy')
#         mode_test = np.load(f'../data/{gene_constrain}/gene_not_balance/For_ML/{mode}_test.npy')
        
#         X_train = mode_train[:,:-1]
#         y_train = mode_train[:,-1].astype(int).tolist()
#         X_test = mode_test[:,:-1]
#         y_test = mode_test[:,-1].astype(int).tolist()
        
#         return X_train, y_train, X_test, y_test

#     elif 'imbalance' in situation:
#         mode_train = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_train.npy')
#         mode_test = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_test.npy')

#         X_train = mode_train[:,:-1]
#         y_train = mode_train[:,-1].astype(int).tolist()
#         X_test = mode_test[:,:-1]
#         y_test = mode_test[:,-1].astype(int).tolist()
        
#         return X_train, y_train, X_test, y_test
    
#     else:
#         mode_train1 = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_train_1.npy')
#         mode_train2 = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_train_2.npy')
#         mode_train3 = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_train_2.npy')
#         mode_test = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_test.npy')
        
#         X_train1, X_train2, X_train3 = mode_train1[:,:-1], mode_train2[:,:-1], mode_train3[:,:-1]
#         y_train1, y_train2, y_train3 = mode_train1[:,-1].astype(int).tolist(), mode_train2[:,-1].astype(int).tolist(), mode_train3[:,-1].astype(int).tolist()
#         X_test, y_test = mode_test[:,:-1], mode_test[:,-1].astype(int).tolist()
        
        
#         return X_train1, X_train2, X_train3, y_train1, y_train2, y_train3, X_test, y_test

In [2]:
# 测试用
def train_test_dataset(situation, gene_constrain, mode):
    if 'gene_balance' in situation:
        mode_train = np.load(f'../data/{gene_constrain}/gene_balance/For_ML/{mode}_train.npy', allow_pickle=True)
        mode_test = np.load(f'../data/{gene_constrain}/gene_balance/For_ML/{mode}_test.npy', allow_pickle=True)
        
        X_train = mode_train[:,:-1]
        y_train = mode_train[:,-1].astype(int).tolist()
        X_test = mode_test[:,:-1]
        y_test = mode_test[:,-1].astype(int).tolist()
        
        return X_train, y_train, X_test, y_test
    
    elif 'gene_not_balance' in situation:
        mode_train = np.load(f'../data/{gene_constrain}/gene_not_balance/For_ML/{mode}_train.npy')
        mode_test = np.load(f'../data/{gene_constrain}/gene_not_balance/For_ML/{mode}_test.npy')
        
        X_wt_train = mode_train[:,0:1024]
        X_mt_train = mode_train[:,1024:-1]
        X_train = X_mt_train - X_wt_train
        y_train = mode_train[:,-1].astype(int).tolist()
        X_wt_test = mode_test[:,0:1024]
        X_mt_test = mode_test[:,1024:-1]
        X_test = X_mt_test - X_wt_test
        y_test = mode_test[:,-1].astype(int).tolist()
        
        return X_train, y_train, X_test, y_test

    elif 'imbalance' in situation:
        mode_train = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_train.npy')
        mode_test = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_test.npy')

        X_train = mode_train[:,:-1]
        y_train = mode_train[:,-1].astype(int).tolist()
        X_test = mode_test[:,:-1]
        y_test = mode_test[:,-1].astype(int).tolist()
        
        return X_train, y_train, X_test, y_test
    
    else:
        mode_train1 = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_train_1.npy')
        mode_train2 = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_train_2.npy')
        mode_train3 = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_train_2.npy')
        mode_test = np.load(f'../data/{gene_constrain}/{situation}/For_ML/{mode}_test.npy')
        
        X_train1, X_train2, X_train3 = mode_train1[:,:-1], mode_train2[:,:-1], mode_train3[:,:-1]
        y_train1, y_train2, y_train3 = mode_train1[:,-1].astype(int).tolist(), mode_train2[:,-1].astype(int).tolist(), mode_train3[:,-1].astype(int).tolist()
        X_test, y_test = mode_test[:,:-1], mode_test[:,-1].astype(int).tolist()
        
        
        return X_train1, X_train2, X_train3, y_train1, y_train2, y_train3, X_test, y_test

In [3]:
#使用GPU
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
elif torch.has_mps:
    torch.cuda.manual_seed(2022)
    device = torch.device('mps')
    print('Device name: MPS')
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Device name: MPS


In [4]:
from transformers import T5Tokenizer, T5EncoderModel
tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)
model = model.eval()

In [5]:
# param_list = ['encoder.block.23','final_layer_norm']

# for name, param in model.named_parameters():
#     if 'encoder.block.23' in name:
#         print(name,param.requires_grad)
#     elif 'final_layer_norm' in name:
#         print(name, param.requires_grad)
#     # if 'encoder.block.23' in name:
#     #     print(name)

In [6]:
# for name, param in model.named_parameters():
#     # if 'encoder.block.23' in name:
#         print(name, param.requires_grad)
#         # print(param)

In [7]:
# for param in model.parameters():
#     print(param)
#     break

In [8]:
# print(model.encoder.block[23])

In [9]:
SEED = 2022
np.random.seed(SEED)
torch.manual_seed(SEED)
gene_constrain = 'gene_constrain'
situation = 'gene_not_balance'
mode = 'mode2'

# data_X, data_y= data_for_downstream('../data/imbalance_same_seq/Embedding_results_csv/mode_2_embeds')

X_train, y_train, X_test, y_test = train_test_dataset(situation, gene_constrain,mode)

# 切分数据集
# X_train, X_test, y_train, y_test= train_test_split(data_X, data_y,
#                                                     test_size=0.2,
#                                                     stratify=data_y,
#                                                    random_state=SEED)
# 切分出valid数据集
X_valid, X_test, y_valid, y_test = train_test_split(X_test,y_test,
                                               test_size=0.3,
                                               shuffle=True,
                                               stratify=y_test,
                                               random_state=SEED)

len(X_train), len(y_train),len(X_test),len(y_test),len(X_valid), len(y_valid)

(10913, 10913, 368, 368, 858, 858)

In [10]:
X_test.shape

(368, 1024)

In [11]:
X_test.max()

0.4428303390741348

In [18]:

class TestDataset(Dataset):
    def __init__(self, X, y):
        super().__init__()
        X = np.array(X, dtype=np.float16)

        self.seq = torch.tensor(X)
        self.seq = self.seq.to(torch.float32)
        self.label = torch.tensor(y)
    
    def __len__(self):
        return len(self.seq)

    def __getitem__(self, index):
        return self.seq[index], self.label[index]

batch_size = 3
accumulation_step = 2

train_dataset = TestDataset(X_train, y_train)
test_dataset = TestDataset(X_test, y_test)
val_dataset = TestDataset(X_valid, y_valid)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [19]:
for batch in train_loader:
    print(batch[0])
    print(batch[1])
    break

tensor([[ 0.0164, -0.0124, -0.0073,  ..., -0.0098, -0.0099,  0.0568],
        [ 0.0095, -0.0090, -0.0118,  ..., -0.0101,  0.0257,  0.0007],
        [-0.0142, -0.0640,  0.0014,  ..., -0.0370, -0.0154,  0.0340]])
tensor([1, 0, 1])


## 建立模型

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Create the MLPClassfier class
class MLPClassifier(nn.Module):
    """Simple MLP Model for Classification Tasks.
    """
    def __init__(self,num_input,num_hidden,num_output):
        super(MLPClassifier, self).__init__()

        # Instantiate an one-layer feed-forward classifier
        self.hidden=nn.Linear(num_input,num_hidden)
        # self.dropout=nn.Dropout(0.1,inplace= False)
        self.predict = nn.Sequential(
            nn.Dropout(0.5),
            # nn.Linear(num_hidden, num_output)
            nn.Linear(num_hidden, 768),
            nn.Dropout(0.1),
            # # nn.ReLU(inplace = True),
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.Dropout(0.1),
            # # nn.ReLU(),
            # nn.Linear(256, 64),
            nn.Linear(256, num_output)
        )
           
    def forward(self,x):
        x=self.hidden(x)
        # x = self.dropout(x)
        x=self.predict(x)
        return x

In [15]:
class MLP_Attention_Classifier(nn.Module):
    """Simple MLP Model for Classification Tasks.
    """
    def __init__(self,num_input,num_hidden,num_output):
        super(MLP_Attention_Classifier, self).__init__()

        # Instantiate an one-layer feed-forward classifier
        self.hidden=nn.Linear(num_input,num_hidden)
        # self.dropout=nn.Dropout(0.1,inplace= False)
        self.predict = nn.Sequential(
            nn.Dropout(0.1),
            # nn.Linear(num_hidden, 2)
            nn.Linear(num_hidden, 768),
            nn.Dropout(0.1),
            # # nn.ReLU(inplace = True),
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.Dropout(0.1),
            # # nn.ReLU(),
            # nn.Linear(256, 64),
            nn.Linear(256, num_output)
        )
        self.layer_norm = nn.LayerNorm(256, eps=1e-6)
           
    def forward(self,x):
        x=self.hidden(x)
        x=self.layer_norm(x)
        # x = self.dropout(x)
        x=self.predict(x)
        return x

In [16]:
if gene_constrain == 'ohe_constrain':
    input_size = 2088

elif gene_constrain == 'gene_constrain':
    input_size = 1024
else:
    print('constrain_error')
print(input_size)
    
model = MLPClassifier(num_input = input_size, num_hidden = 832, num_output = 2).to(device)


1024


## 定义训练

In [17]:
from transformers import get_linear_schedule_with_warmup
# 精度计算
def flat_accuracy(preds, labels):
    pred_flat = preds.detach().cpu().numpy()
    pred_flat = np.argmax(pred_flat, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


# 优化方法
n_epochs = 20
early_stop = 8
learning_rate = 1e-4
total_steps = len(train_loader) * 1
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader)*n_epochs)


# wandb
wandb.init(
        project="pytorch-intro",
        config={
            "epochs": n_epochs,
            "batch_size": batch_size,
            "lr": learning_rate,
            "dropout": 0.1,
            })

[34m[1mwandb[0m: Currently logged in as: [33mweininglin[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [33]:
def trainer(train_loader, val_loader, model=model, device = device, early_stop = early_stop, n_epochs = n_epochs, accumulation_step = accumulation_step):

    criterion = nn.CrossEntropyLoss() # Define your loss function, do not modify this.

    # Define your optimization algorithm. 
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0000) 
    writer = SummaryWriter() # Writer of tensoboard.

    if not os.path.isdir('./models'):
        os.mkdir('./models') # Create directory of saving models.

    n_epochs, best_loss, step, early_stop_count = n_epochs, math.inf, 0, early_stop

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)

        for batch in train_pbar:
           
            b_seq, b_labels = tuple(t.to(device) for t in batch) # Move your data to device. 
            # print('b_seq shape: ', b_seq.shape)
            pred = model(b_seq)  
            loss = criterion(pred, b_labels)
            
            loss = loss/step
            
            loss.backward()                     # Compute gradient(backpropagation).
            
            if ((i+1)% accumulation_steps ==0):
                
                optimizer.step()                    # Update parameters.
                # scheduler.step()
                optimizer.zero_grad()               # Set gradient to zero.

            step += 1
                
            loss_record.append(loss.detach().item())
            
            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})
            
            wandb.log({'epoch': n_epochs, 'loss': loss.detach().item(), 'step': step})
            # print(model.classifier[3].bias)

        mean_train_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/train', mean_train_loss, step)
        wandb.log({'epoch': n_epochs, 'mean_train_loss': mean_train_loss, 'epoch_step': step})

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        total_eval_accuracy = 0
        
        for batch in val_loader:
            
            b_seq, b_labels = tuple(t.to(device) for t in batch) # Move your data to device. 
            # pred = model(b_input_ids, b_attn_mask) 
            with torch.no_grad():
                pred = model(b_seq)
                loss = criterion(pred, b_labels)

            loss_record.append(loss.item())
            total_eval_accuracy += flat_accuracy(pred, b_labels)
            
        mean_valid_loss = sum(loss_record)/len(loss_record)
        avg_val_accuracy = total_eval_accuracy / len(val_loader)
        
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
        writer.add_scalar('Loss/valid', mean_valid_loss, step)
        wandb.log({'epoch': n_epochs,
            'val_loss': mean_valid_loss,
            'step': step,
            'accuracy': avg_val_accuracy}
                 )
        

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), './models/model.ckpt') # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else: 
            early_stop_count += 1

        if early_stop_count >= early_stop:
            print('\nModel is not improving, so we halt the training session.')
            
            
            return

In [34]:
trainer(train_loader, val_loader, model, device)
wandb.finish()

Epoch [1/20]: 100%|██████████████| 683/683 [00:13<00:00, 49.56it/s, loss=0.0347]


Epoch [1/20]: Train loss: 0.4744, Valid loss: 0.4139
Saving model with loss 0.414...


Epoch [2/20]: 100%|████████████████| 683/683 [00:13<00:00, 51.54it/s, loss=2.01]


Epoch [2/20]: Train loss: 0.4337, Valid loss: 0.4118
Saving model with loss 0.412...


Epoch [3/20]: 100%|███████████████| 683/683 [00:13<00:00, 50.43it/s, loss=0.199]


Epoch [3/20]: Train loss: 0.4218, Valid loss: 0.4107
Saving model with loss 0.411...


Epoch [4/20]: 100%|███████████████| 683/683 [00:13<00:00, 51.27it/s, loss=0.651]


Epoch [4/20]: Train loss: 0.4097, Valid loss: 0.4195


Epoch [5/20]: 100%|███████████████| 683/683 [00:13<00:00, 50.31it/s, loss=0.593]


Epoch [5/20]: Train loss: 0.3986, Valid loss: 0.4179


Epoch [6/20]: 100%|███████████████| 683/683 [00:13<00:00, 50.70it/s, loss=0.663]


Epoch [6/20]: Train loss: 0.3860, Valid loss: 0.4064
Saving model with loss 0.406...


Epoch [7/20]: 100%|██████████████| 683/683 [00:13<00:00, 50.89it/s, loss=0.0337]


Epoch [7/20]: Train loss: 0.3700, Valid loss: 0.3979
Saving model with loss 0.398...


Epoch [8/20]: 100%|██████████████| 683/683 [00:13<00:00, 50.67it/s, loss=0.0171]


Epoch [8/20]: Train loss: 0.3651, Valid loss: 0.4165


Epoch [9/20]: 100%|██████████████| 683/683 [00:13<00:00, 51.30it/s, loss=0.0804]


Epoch [9/20]: Train loss: 0.3547, Valid loss: 0.4363


Epoch [10/20]: 100%|███████████████| 683/683 [00:13<00:00, 50.40it/s, loss=1.32]


Epoch [10/20]: Train loss: 0.3491, Valid loss: 0.4433


Epoch [11/20]: 100%|█████████████| 683/683 [00:13<00:00, 50.98it/s, loss=0.0424]


Epoch [11/20]: Train loss: 0.3386, Valid loss: 0.4351


Epoch [12/20]: 100%|█████████████| 683/683 [00:13<00:00, 50.59it/s, loss=0.0101]


Epoch [12/20]: Train loss: 0.3312, Valid loss: 0.4160


Epoch [13/20]: 100%|██████████████| 683/683 [00:13<00:00, 50.20it/s, loss=0.239]


Epoch [13/20]: Train loss: 0.3199, Valid loss: 0.4440


Epoch [14/20]: 100%|███████████████| 683/683 [00:13<00:00, 50.85it/s, loss=0.77]


Epoch [14/20]: Train loss: 0.3170, Valid loss: 0.4293


Epoch [15/20]: 100%|████████████████| 683/683 [00:13<00:00, 50.98it/s, loss=1.5]


Epoch [15/20]: Train loss: 0.3059, Valid loss: 0.4722

Model is not improving, so we halt the training session.


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,▆▂▃▃▅▇▃▅▄▆█▃▁▆▃▄▄▆▇▆▄▅▃▆▁▂▂▁▃▃▂▃▅▃▅▁▂▂▁▅
mean_train_loss,█▆▆▅▅▄▄▃▃▃▂▂▂▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
val_loss,▃▂▂▃▃▂▁▃▅▅▅▃▅▄█

0,1
accuracy,0.0
epoch,20.0
epoch_step,10245.0
loss,1.50043
mean_train_loss,0.30586
step,10245.0
val_loss,0.47224


# 测试

In [20]:
def predict(test_loader, model, device):
    model.eval() # Set your model to evaluation mode.
    preds = []
    labels = []
    for batch in tqdm(test_loader):
        b_seq, b_labels = tuple(t.to(device) for t in batch)                   
        with torch.no_grad():                   
            pred = model(b_seq)                     
            preds.append(pred.detach().cpu()) # 放入cpu计算
            labels.append(b_labels.detach().cpu())
            
    preds = torch.cat(preds, dim=0)
    # print(preds)
    preds = torch.argmax(preds, dim=1)
    labels = torch.cat(labels, dim = 0)

    return preds, labels


In [21]:
model =  MLPClassifier(num_input = input_size, num_hidden = 832, num_output = 2).to(device)
model.load_state_dict(torch.load('./models/model.ckpt'))
preds, y_true = predict(test_loader, model, device)

100%|██████████████████████████████████████████| 23/23 [00:00<00:00, 201.87it/s]

tensor([[ 0.9131, -1.0042],
        [ 0.8193, -1.0125],
        [ 0.2308, -0.2367],
        [ 0.9075, -0.9684],
        [ 0.2734, -0.3054],
        [-1.5017,  1.6380],
        [-0.7443,  0.9126],
        [ 0.3497, -0.5148],
        [ 0.6169, -0.6524],
        [-0.0206,  0.1513],
        [-1.8187,  1.9764],
        [ 0.7252, -0.7710],
        [ 0.2287, -0.2368],
        [ 0.7413, -0.7891],
        [-1.8705,  2.1593],
        [ 2.4098, -2.5315],
        [-1.3849,  1.6613],
        [ 0.5396, -0.5688],
        [-1.1419,  1.2752],
        [ 0.6522, -0.6855],
        [ 1.0768, -1.1228],
        [-0.4351,  0.4865],
        [ 0.9461, -1.0016],
        [ 2.5369, -2.6882],
        [-1.7552,  1.9441],
        [ 0.7266, -0.7618],
        [ 0.6463, -0.6919],
        [ 0.7189, -0.7581],
        [ 0.5671, -0.5946],
        [ 1.0248, -1.0970],
        [ 0.5596, -0.5998],
        [ 0.0620, -0.0268],
        [-1.0331,  1.2105],
        [ 0.8712, -0.9129],
        [ 0.1978, -0.2020],
        [ 1.7609, -1




In [37]:
label_names = {'0':0, '1':1}

from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
report = classification_report(y_true, preds,target_names=label_names)
report_df = classification_report(y_true, preds,target_names=label_names, output_dict=True)
print(report)

print(report_df['weighted avg'])
MCC = matthews_corrcoef(y_true, preds)
print('MCC: ', MCC)

              precision    recall  f1-score   support

           0       0.86      0.92      0.89       268
           1       0.73      0.60      0.66       100

    accuracy                           0.83       368
   macro avg       0.80      0.76      0.77       368
weighted avg       0.83      0.83      0.83       368

{'precision': 0.8252397125673901, 'recall': 0.8315217391304348, 'f1-score': 0.8259274080181317, 'support': 368}
MCC:  0.5536459487403476


In [38]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
tn, fp, fn, tp = confusion_matrix(y_true, preds).ravel()
print(pd.DataFrame([{'True Positive':tp,'True Negative':tn,'False Positive':fp,'False Negative':fn}]))
# print(tn,fp,fn,tp)

   True Positive  True Negative  False Positive  False Negative
0             60            246              22              40


In [None]:
result_path = '../data' + '/' + gene_constrain + '/' + situation + '/' + f'ML_predicted_results/{mode}_results'
report_csv = pd.DataFrame(report_df).transpose()
report_csv['MCC'] = MCC
report_csv['True Positive'] = tp
report_csv['True Negative'] = tn
report_csv['False Positive'] = fp
report_csv['False Negative'] = fn

report_csv.to_csv(f'{result_path}/mlp_{mode}_report_save.csv')