In [21]:
import torch
import torch.nn as nn
# Per-layer convolution kernel sizes for the ConvBlocks assembled by get_convBlocks.
kernel_List = [12, 4, 4, 4, 4, 4]
# Per-layer output channel counts: entry i is the out_channel of block i and
# the in_channel of block i + 1.
channel_List = [128, 256, 512, 512, 512, 400]

class ConvBlock(nn.Module):
    """Strided 2-D convolution followed by batch norm, ReLU and dropout."""

    def __init__(self, in_channel, out_channel, kernel_sz, padding, stride = 2) -> None:
        super().__init__()
        # Sub-modules keep their attribute names and registration order, so
        # state_dict keys are the same as before.
        self.conv = nn.Conv2d(
            in_channels=in_channel,
            out_channels=out_channel,
            kernel_size=kernel_sz,
            stride=stride,
            padding=padding,
        )
        self.bn = nn.BatchNorm2d(num_features=out_channel)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()

    def forward(self, x):
        """Run ``x`` through conv -> batch norm -> ReLU -> dropout and return it."""
        features = self.conv(x)
        features = self.bn(features)
        activated = self.relu(features)
        return self.dropout(activated)

def get_convBlocks(in_channel):
    """Build the ConvBlocks described by ``kernel_List`` and ``channel_List``.

    Block 0 maps ``in_channel`` to ``channel_List[0]``; block i (i >= 1) maps
    ``channel_List[i-1]`` to ``channel_List[i]``. Every block uses kernel size
    ``kernel_List[i]`` and padding ``int(kernel_List[i] / 2 - 1)``.

    Returns:
        A list of ConvBlock instances, in layer order.
    """
    def block_at(idx):
        # The first block consumes the network input; later blocks consume
        # the previous block's output channels.
        c_in = in_channel if idx == 0 else channel_List[idx - 1]
        kernel = kernel_List[idx]
        return ConvBlock(c_in, channel_List[idx], kernel, int(kernel / 2 - 1))

    return [block_at(idx) for idx in range(len(kernel_List))]

class DeepFold(nn.Module):
    """Convolutional encoder that maps an input map to a unit-length vector.

    The convolution stack comes from ``get_convBlocks``. The forward pass
    takes the spatial diagonal of the final feature map, averages along it,
    and L2-normalises each row of the result.
    """

    def __init__(self, in_channel) -> None:
        super().__init__()
        self.convLayer = nn.Sequential(*get_convBlocks(in_channel))

    def forward(self, x):
        """Return one L2-normalised vector per sample.

        Shape comments below are for a [batch_size, 3, 256, 256] input.
        """
        # [batch_size, 400, 4, 4]
        x = self.convLayer(x)
        # [batch_size, 400, 4]: keep only the spatial diagonal entries
        x = torch.diagonal(x, dim1=2, dim2=3)
        # [batch_size, 400]: average along the diagonal
        x = torch.mean(x, dim=2)

        # Row-wise L2 normalisation. F.normalize divides by max(norm, eps), so
        # an all-zero row yields zeros instead of the NaNs that a plain
        # division by the norm would produce.
        return nn.functional.normalize(x, p=2, dim=1)

    # def hook(self, layer: nn.Module, input: torch.tensor, output)

# outputList = []
# def hook(self, layer: nn.Module,  output: torch.tensor):
#     outputList.append(output)

# Smoke test: push a random batch of two 3x256x256 inputs through the model.
x = torch.rand(2, 3, 256, 256)

model = DeepFold(3)

# for layer in model.convLayer:
#     layer.register_forward_hook(hook)

y = model(x)

# Shape of the per-row L2 norms of the output (one norm per sample).
print(torch.norm(y, dim = 1).shape)

# for ele in outputList:
#     print(ele.shape)

# print(model)
# print(help(model))
# print(len(list(model.named_modules())))
# for name,_ in model.convLayer.named_modules():
#     print(name)
#     print('-'*60)

# √
# print(x.shape)
# for layer in model.convLayer:
#     x = layer(x)
#     print(x.shape)
#     print('-'*60)


torch.Size([2])


In [119]:
import torch
import torch.nn.functional as F
x = torch.tensor([[3.0, 4.0], [5.0, 12.0], [3.0, 4.0]], dtype=torch.float32)
# Reference result: row-wise L2 normalisation done by the library.
print(F.normalize(x))

# Manual equivalent: transpose so the per-row norms broadcast along the last
# axis, divide, then transpose back.
normV = x.norm(dim=1)
print(normV)
x = x.t()
print(x)
res = x / normV
print(res)
res = res.t()
print(res)
# Every row should now have unit length.
normV2 = res.norm(dim=1)
print(normV2)

tensor([[0.6000, 0.8000],
        [0.3846, 0.9231],
        [0.6000, 0.8000]])
tensor([ 5., 13.,  5.])
tensor([[ 3.,  5.,  3.],
        [ 4., 12.,  4.]])
tensor([[0.6000, 0.3846, 0.6000],
        [0.8000, 0.9231, 0.8000]])
tensor([[0.6000, 0.8000],
        [0.3846, 0.9231],
        [0.6000, 0.8000]])
tensor([1., 1., 1.])


In [65]:
def funcy(x: torch.Tensor, k: int):
    """Return ``x`` raised element-wise to the power ``-2 * k``.

    Args:
        x: Input tensor. Integer and boolean tensors are converted to the
            default floating dtype first, because PyTorch rejects negative
            integer powers of integer tensors.
        k: Integer multiplier for the exponent.

    Returns:
        A tensor shaped like ``x``. Zero entries of ``x`` become ``inf`` when
        ``k`` is positive.
    """
    if not (x.is_floating_point() or x.is_complex()):
        x = x.to(torch.get_default_dtype())
    return x**(-2*k)

In [85]:
class Pretfm(torch.nn.Module):
    """Expand an input into ``in_channel`` channels of negative even powers.

    Channel ``i - 1`` of the output holds ``funcy(x, i)``, i.e. ``x ** (-2 * i)``,
    for ``i = 1 .. in_channel``.

    Args:
        in_channel: Number of output channels to produce.
        size: (height, width) of the output canvas. Defaults to (256, 256),
            the size that used to be hard-coded.
    """

    def __init__(self, in_channel, size=(256, 256)) -> None:
        super().__init__()
        self.in_channel = in_channel
        self.size = tuple(size)

    def forward(self, x: torch.Tensor):
        """Return a tensor of shape ``(in_channel, *size)`` built from ``x``."""
        # Every slot is overwritten below, so an uninitialised buffer is
        # enough. torch.rand would advance the global RNG for no reason.
        y = torch.empty(self.in_channel, *self.size)

        for i in range(1, self.in_channel+1):
            # Assignment broadcasts funcy's result into the channel slot and
            # casts it to y's dtype.
            y[i-1] = funcy(x, i)
        return y

In [94]:
in_channel = 3
import torchvision.transforms as T
# Preprocessing pipeline applied to each feature tensor; only the resize is active.
train_tfm = T.Compose(
    [
        T.Resize((256, 256)),
        # Take the inverse matrix to expand the channels
        # Pretfm(in_channel),
        # Whether data augmentation is needed is still an open question
        # Layer normalisation
        # nn.LayerNorm((in_channel, 256, 256))
    ]
)
# t = nn.LayerNorm((in_channel, 256, 256))
# # x = torch.rand(1, 188, 188)
# x = torch.tensor([[[1,1],
#                     [2,2]
#                     ]
#                     ], dtype=torch.float32)
# output = train_tfm(x)
# print(output)
# # print(output.shape)
# print(t(output))

In [95]:
import numpy as np
class Train_set(torch.utils.data.Dataset):
    """In-memory dataset of ``(feature, label)`` pairs loaded from ``.npy`` files.

    Args:
        dir: Path prefix that is concatenated directly with each ID, so it
            must end with a path separator.
        id_list: Iterable of ``(id, label)`` pairs. ``dir + id + ".npy"`` is
            loaded for every entry.
        tfm: Callable applied to the feature tensor on every access.
    """

    def __init__(self, dir, id_list, tfm) -> None:
        super().__init__()

        self.tensor_list = []
        for protein_id, label in id_list:
            # Load <dir><id>.npy and prepend an axis of size 1.
            # NOTE(review): allow_pickle=True is only safe for trusted files.
            feature = torch.from_numpy(np.load(dir+protein_id+".npy", allow_pickle=True))
            feature = torch.unsqueeze(feature, 0)
            self.tensor_list.append((feature, label))
        self.tfm = tfm

    def __getitem__(self, idx :int):
        """Return ``(tfm(feature), label)`` with the label as a float64 scalar tensor."""
        feature, raw_label = self.tensor_list[idx]
        y = self.tfm(feature)
        # torch.float64 is a dtype object and cannot be called, so the tensor
        # is built explicitly. float() also accepts string labels such as "0".
        # NOTE(review): features keep the dtype of the .npy file; cast before
        # feeding a float32 model if the files hold float64 - confirm.
        label = torch.tensor(float(raw_label), dtype=torch.float64)
        return y, label

    def __len__(self):
        """Number of loaded samples."""
        return len(self.tensor_list)


In [99]:
dir = "../distance_matrix/distance_matrix_inf/"
id_path = "../pair/pair_bool/d1a0aa_.txt"
id_list = []

# Each line of the pair file is "<id>\t<label>"; collect (id, label) tuples.
with open(id_path, "r") as f_r:
    for lines in f_r:
        line = lines.rstrip('\n').split('\t')
        id_list.append((line[0], line[1]))

# Build a dataset from the first ten pairs and inspect one transformed sample.
dataset = Train_set(dir, id_list[:10], train_tfm)
print(id_list[:10])
print(dataset[0][0].shape)



[('d1a0pa2', '0'), ('d2a0ua1', '0'), ('d3a04a_', '0'), ('d5a0ya1', '0'), ('d5a0ya2', '0'), ('d1a1ia1', '0'), ('d1a1va2', '0'), ('d2a14a1', '0'), ('d2a19a2', '0'), ('d2a1jb1', '0')]
torch.Size([1, 256, 256])


In [111]:
# Load one distance matrix as a tensor and report its element type.
data = torch.from_numpy(
    np.load("../distance_matrix/distance_matrix_inf/d1a0pa2.npy", allow_pickle=True)
)
print(data.dtype)

torch.float64


In [5]:
# Get the right-hand id_list that corresponds to the IDs in the left column
def get_id_list(pair_path):
    """Read a tab-separated pair file into a list of ``(first, second)`` tuples.

    Each non-blank line contributes its first two tab-separated fields, with
    the trailing newline removed from the second one. Blank lines (for
    example a stray empty line at the end of the file) are skipped instead
    of raising ``IndexError``.

    Args:
        pair_path: Path of the text file to read.

    Returns:
        List of 2-tuples of strings, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    id_list = []
    with open(pair_path, "r") as f_r:
        for raw_line in f_r:
            if not raw_line.strip():
                continue
            # Split once; only the first two columns are used.
            fields = raw_line.rstrip("\n").split("\t")
            id_list.append((fields[0], fields[1]))
    return id_list

In [4]:
# Get the distance_matrix that corresponds to an ID
def get_feature(data_path, tfm):
    """Load a ``.npy`` file and return it as a transformed float32 tensor.

    The loaded array gets a new leading axis of size 1 and is cast to
    ``torch.float`` before ``tfm`` is applied.

    Args:
        data_path: Path of the ``.npy`` file.
        tfm: Callable applied to the prepared tensor.

    Returns:
        Whatever ``tfm`` returns for the prepared tensor.
    """
    raw_array = np.load(data_path, allow_pickle=True)
    prepared = torch.from_numpy(raw_array).unsqueeze(0).float()
    return tfm(prepared)

In [6]:
# Quick check that reverse=True yields descending order.
lis = sorted([3, 2, 63, 9], reverse=True)
print(lis)

[63, 9, 3, 2]


In [3]:
def compute_loss(posi_cosList, nega_cosList, K = 10, m = 0.1):
    """Margin ranking loss of the hardest positive against the top-K negatives.

    The smallest positive similarity is compared with each of the ``K``
    largest negative similarities; every comparison contributes
    ``max(0, negative - positive + m)``.

    Args:
        posi_cosList: Similarities of positive pairs (floats or 0-dim tensors).
        nega_cosList: Similarities of negative pairs (floats or 0-dim tensors).
        K: Maximum number of negatives to use. Fewer are used when the list
            is shorter, instead of raising ``IndexError``.
        m: Margin.

    Returns:
        The summed loss. It is the integer ``0`` when either list is empty or
        no term is positive, so callers that backpropagate must check for a
        tensor first.
    """
    if len(posi_cosList) == 0 or len(nega_cosList) == 0:
        return 0

    # min()/sorted() leave the caller's lists untouched; the previous
    # in-place sorts reordered them as a side effect.
    posi_cos = min(posi_cosList)  # only the hardest positive is used
    hardest_negatives = sorted(nega_cosList, reverse=True)[:max(K, 0)]

    loss = 0
    for nega_cos in hardest_negatives:
        loss += max(0, nega_cos - posi_cos + m)

    return loss


In [2]:
import numpy as np
# Load the saved training selection; the training cell slices its first 64 entries.
# presumably an array of protein IDs - verify against the file's producer.
# NOTE(review): allow_pickle=True is only safe for trusted files.
theSelecTrainList = np.load("/home/wngys/lab/DeepFold/pair/train.npy", allow_pickle=True)

In [5]:
# Training loop
# Project code lives in /home/wngys/lab/DeepFold/Code
from model import *
from data import *
from torch.utils.data import DataLoader
from torch import cosine_similarity
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DFold_model = DeepFold(in_channel = 3)
DFold_model.to(device)

train_tfm = build_transform(in_channel = 3)

optimizer = torch.optim.SGD(DFold_model.parameters(), lr = 1e-3)

total_epochs = 10
batch_size = 64

# Resuming from a checkpoint is not implemented yet. st_epoch is set up front
# so that a non-None resume_dir no longer leaves it undefined.
st_epoch = 0
resume_dir = None
if resume_dir is not None:
    pass

pair_dir = "/home/wngys/lab/DeepFold/pair/train_pair_bool_90/"  
data_dir = "/home/wngys/lab/DeepFold/distance_matrix_r/distance_matrix_mine_r_3/" 

trainIDlist = theSelecTrainList[:64]

leftTrain_ds = LeftTrainSet(data_dir, trainIDlist, train_tfm)
leftTrain_dl = DataLoader(leftTrain_ds, batch_size, shuffle = True)

for epoch in range(st_epoch, total_epochs):
    DFold_model.train()

    # Iterate over every protein ID in the left column (the queries).
    for IDBatch, feature1 in leftTrain_dl:

        feature1 = feature1.to(device)

        for batch_idx in range(len(IDBatch)):
            id_list = get_id_list(pair_dir + IDBatch[batch_idx] +".txt")

            train_ds = Train_set(data_dir, id_list, train_tfm)
            train_dl = DataLoader(train_ds, batch_size, shuffle=False, num_workers=2, pin_memory=True)

            IDtotalLoss = 0.0
            for feature2, label in train_dl:
                feature2 = feature2.to(device)
                label = label.to(device)

                # Recompute the query fingerprint for every optimisation step.
                # Reusing one forward pass across steps backpropagates through
                # a graph whose weights optimizer.step() has already modified
                # in place, which autograd rejects.
                fingerpbatch = DFold_model(feature1)
                fingerpvec1 = fingerpbatch[batch_idx]
                fingerpvec2 = DFold_model(feature2)

                # Split the similarities by label: 1 = positive, 0 = negative.
                posi_cos_smi_list = []
                nega_cos_smi_list = []
                for number_inbatch in range(fingerpvec2.shape[0]):
                    cos_smi = F.cosine_similarity(fingerpvec1, fingerpvec2[number_inbatch], dim = 0)
                    if label[number_inbatch] == 0:
                        nega_cos_smi_list.append(cos_smi)
                    elif label[number_inbatch] == 1:
                        posi_cos_smi_list.append(cos_smi)
                    else:
                        print("ERROR")

                # The ranking loss needs at least one positive and one negative.
                if not posi_cos_smi_list or not nega_cos_smi_list:
                    continue

                # Batch loss
                loss = compute_loss(posi_cos_smi_list, nega_cos_smi_list)

                # compute_loss returns a plain 0 when no margin is violated;
                # there is nothing to backpropagate in that case.
                if torch.is_tensor(loss) and loss.requires_grad:
                    optimizer.zero_grad()
                    # One backward pass only: the old second call doubled the
                    # gradients, and retain_graph is unnecessary with a fresh
                    # graph per step.
                    loss.backward()
                    optimizer.step()

                # Accumulate a Python float so the autograd graph is released.
                IDtotalLoss += float(loss)

            print(f"Epoch: {epoch} | queryID: {IDBatch[batch_idx]} | avg_loss: {IDtotalLoss / len(train_dl):.4f}")

        # DFold_model.eval()




            



            








NameError: name 'theSelecTrainList' is not defined

In [123]:
import torch.nn.functional as F
# Cosine similarity of two float64 vectors along their only axis.
test_t = torch.ones(2, dtype=torch.float64)
test_t2 = torch.tensor([2.0, 4.0], dtype=torch.float64)

print(F.cosine_similarity(test_t, test_t2, dim=-1))

tensor(0.9487, dtype=torch.float64)


In [2]:
import torch 

class test_dataset(torch.utils.data.Dataset):
    """Toy dataset that pairs up items of two sequences by position."""

    def __init__(self, left, right) -> None:
        super().__init__()
        self.left, self.right = left, right

    def __getitem__(self, index):
        """Return ``(left[index], right[index])``."""
        left_item = self.left[index]
        right_item = self.right[index]
        return left_item, right_item

    def __len__(self):
        """Length of the dataset, taken from ``left`` only."""
        return len(self.left)

# String IDs on the left, matching integer tensors on the right.
left = [str(n) for n in range(1, 9)]
right = [torch.tensor(n + 1) for n in range(8)]

ds = test_dataset(left, right)
dl = torch.utils.data.DataLoader(ds, batch_size=4, shuffle = True)

# Show how the default collation batches strings versus tensors.
for x, y in dl:
    print(type(x))
    print(len(x))
    print(x, y)

<class 'tuple'>
4
('5', '8', '3', '1') tensor([5, 8, 3, 1])
<class 'tuple'>
4
('7', '6', '4', '2') tensor([7, 6, 4, 2])


In [1]:
from torch.utils import data
from model import *
from data import *
from torch.utils.data import DataLoader
import os


def read_data(data_dir="/home/wngys/lab/DeepFold/distance_matrix_r/distance_matrix_mine_r_3",
              out_path="/home/wngys/lab/DeepFold/distance_matrix_r/matrix_data.npy",
              max_files=1):
    """Collect ``.npy`` arrays from a directory into one dict and save it.

    Keys are the file names up to the first ``'.'``; values are the loaded
    arrays. The dict is written with ``np.save``, which pickles it.

    Args:
        data_dir: Directory whose entries are loaded.
        out_path: Destination file for the saved dict.
        max_files: Maximum number of directory entries to load, taken in
            ``os.listdir`` order. The default of 1 matches the earlier
            behaviour, which stopped after the first file. Pass ``None`` to
            load every entry.
    """
    file_names = os.listdir(data_dir)
    if max_files is not None:
        file_names = file_names[:max(max_files, 0)]

    data_dict = {}
    for file_name in file_names:
        file_path = os.path.join(data_dir, file_name)
        # NOTE(review): allow_pickle=True is only safe for trusted files.
        data = np.load(file_path, allow_pickle=True)
        ID = file_name.split('.')[0]
        data_dict[ID] = data

    np.save(out_path, data_dict)

# Build and save the dictionary file using the function's built-in paths.
read_data()

In [2]:
fileName = "/home/wngys/lab/DeepFold/distance_matrix_r/matrix_data.npy"
# np.save stored the dict inside a 0-d object array; .item() unwraps it.
d = np.load(fileName, allow_pickle=True).item()

In [5]:
# Shape of the array stored under ID 'd3m2pa1' in the reloaded dict.
d['d3m2pa1'].shape

(3, 285, 285)