In [30]:
import pandas as pd
import torch
import os
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
import matplotlib.pyplot as plt
import random



In [31]:
!pip install torch_geometric

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


## dataloader

In [32]:
def resize_tensor(input_tensor, target_length):
    input_tensor = input_tensor.unsqueeze(1) 
    resized_tensor = F.interpolate(
        input_tensor, 
        size=target_length, 
        mode='linear', 
        align_corners=False
    )
    resized_tensor = resized_tensor.squeeze(1)
    
    return resized_tensor


In [33]:
## labels: day45 baseline 0, day 45 others 1, day90 baseline 2, day others 90 3, day 120 baseline 4, day 120 others 5
import pandas as pd
import torch
import os

## labels: day45 baseline 0, day 45 others 1, day90 baseline 2, day others 90 3, day 120 baseline 4, day 120 others 5
file_folder = "/home/featurize/work/ylx/MEA/overfitting"
sub_file_list = os.listdir(file_folder)
all_data = []
for file_name in sub_file_list:  
    file_path = os.path.join(file_folder, file_name)
    data_sample = {}
    cls_mea = file_name.split("_")[-2]
    if cls_mea == "baseline":
        label = 0
    else:
        label = 1
    data_sample["label"] = label
    df = pd.read_csv(file_path)
    data_np = df.values
    data_tensor = torch.tensor(data_np, dtype=torch.float32)
    target_length = 4570 
    data_tensor = resize_tensor(data_tensor, target_length)
    data_sample["data"] = data_tensor
    data_sample["label"] = label
    data_sample["data_name"] = file_name
    all_data.append(data_sample)


In [34]:
train_loader = DataLoader(all_data, batch_size=1, shuffle=True)
test_loader = DataLoader(all_data, batch_size=1, shuffle=False)



# Model

In [35]:
def min_max_normalize(x):
  min_val = x.min()
  max_val = x.max()
  normalized_x = (x - min_val) / (max_val - min_val)
  return normalized_x

# print("Min-Max Normalization (0-1):\n", x_normalized_minmax)

In [36]:
class Encoder(torch.nn.Module):
    def __init__(self, num_node_features, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_channels)#GCN需要改数据形式
        
        #预测输出形式预判问题
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, hidden_channels)
      
    def forward(self, x, edge_index, batch):
        num_sample = x.shape[0]
        # print(num_sample)
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.conv3(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        x = self.conv4(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = global_mean_pool(x, batch)
        max_values, _ = torch.max(x, dim=0)  
        
        
        # print(max_values.unsqueeze(0))
        return min_max_normalize(max_values).unsqueeze(0), num_sample
    

In [37]:
import torch.nn as nn
class Decoder(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(256,4570)
        self.linear2 = nn.Linear(4570,8000)
        self.linear3 = nn.Linear(8000,4570)
        self.linear4 = nn.Linear(4570,4570)
        
        

    def forward(self, x, num_samples):
        # noise  = 0.1 * torch.randn(num_samples, 256)
        # noise  = noise.to("cuda")
        # x = x + noise
        #x复制n份（repeat）变成一个tensor，
        #decode的结果打印出来
        #算meansquare，按道理是一样的，任务是验证这一点？
        #reproducable，minimal example
        x = self.linear1(x) # 
        x1 = x
        x = F.relu(x) 
        x = self.linear2(x)  # 
        x = F.relu(x)
        x = self.linear3(x) + x1# 
        x = F.relu(x) 
        x = self.linear4(x)  # 
        # 激活函数
        
        return x


In [38]:
# class Decoder(torch.nn.Module):
#     def __init__(self, num_node_features, hidden_channels):
#         super().__init__()
        
#     def forward(self, x):
        
#         return x
     # 生成标准正态分布的噪声，形状为 (num_samples, 256) 
# import torch.nn as nn
# class Decoder(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.linear1 = nn.Linear(256,4570)
#         self.linear2 = nn.Linear(4570,8000)
#         self.linear3 = nn.Linear(8000,4570)
#         self.linear4 = nn.Linear(4570,4570)
        
        

#     def forward(self, x, num_samples):
#         noise  = 0.1 * torch.randn(num_samples, 256)
#         noise  = noise.to("cuda")
#         x = x+noise
#         #x复制n份（repeat）变成一个tensor，
#         #decode的结果打印出来
#         #算meansquare，按道理是一样的，任务是验证这一点？
#         reproducable，minimal example
#         x = self.linear1(x) # 
#         x1 = x
#         x = F.relu(x) 
#         x = self.linear2(x)  # 
#         x = F.relu(x)
#         x = self.linear3(x) + x1# 
#         x = F.relu(x) 
#         x = self.linear4(x)  # 
#         # 激活函数
        
#         return x

    

In [39]:
class Autoencoder(torch.nn.Module):  
    def __init__(self, num_node_features, hidden_channels):  
        super().__init__()  
        self.encoder = Encoder(num_node_features, hidden_channels)  
        self.decoder = Decoder() 
    def forward(self, x, edge_index, batch):  
        # Encode the input  
        encoded, num_sample = self.encoder(x, edge_index, batch)
        #added
        print("encode_shape:", encoded.shape)
        print("encoded.unsqueeze(0).shape:", encoded.unsqueeze(0).shape)
        print("x.shape: ", x.shape)
        print("repeat_shape:", x.shape[0])
        encode_copy = encoded.repeat(x.shape[0], 1)
        def add_indices(tensor):
            """
            在每一行末端添加一个index.
            
            """
            device = tensor.device
            rows = tensor.shape[0]
            indices = torch.arange(1, rows + 1,device = device).float().reshape(-1, 1)
            return torch.cat((tensor, indices),  dim=1)
        
        encode_copy_added_index = add_indices(encode_copy)
            
             
        print("encode_copy:", encode_copy)  
        print("encode_copy.shape", encode_copy.shape)
        decoded = self.decoder(encoded.unsqueeze(0),num_sample)
        
        
        #added
        print("decode:", decoded) 
        print("decoded.shape", decoded.shape)
        decoded_of_encode_copy = self.decoder(encode_copy_added_index, num_sample)
        print("decoded_of_encode_copy_added_index:", decoded_of_encode_copy)
        print("decoded_of_encode_copy_added_index.shape", decoded_of_encode_copy.shape)
        return decoded  
  

In [40]:
latent_dim = 16
num_node_features = 4570
hidden_channels = 256
model = Autoencoder(num_node_features, hidden_channels) 

# Train


In [41]:
num_graphs = len(all_data) 
graphs = []
for i in range(num_graphs):
    graph = all_data[i]
    num_nodes = int(graph['data'].shape[0])  # 每个图的节点数目
    print("num_nodes:", num_nodes)
    node_features = graph['data']            # 节点特征矩阵
    print("node_features.shape:", node_features.shape)
    # 创建依次连接的边缘索引
    edge_index = []
    for i in range(num_nodes - 1):
        edge_index.append([i, i+1])
    # 转换为PyTorch张量
    edge_index = torch.tensor(edge_index).t().contiguous()# 转置并连接，[[0,1],[1,2],[2,3],...]变成[[0,1,2,3,...],[1,2,3,4,...]]
    # edge_index.append([i+1, i])                      # 如果图是无向的，添加反方向的边
    print(edge_index.shape)
    y = torch.tensor([graph['label']], dtype=torch.long)
    graph_data = Data(x=node_features, edge_index=edge_index, y=y)
    graphs.append(graph_data)
random.seed(42)
random.shuffle(graphs)
train_graphs = graphs 
# test_graphs = graphs[int(num_graphs*0.8):]    # 后20%作为测试集
train_loader = DataLoader(train_graphs, batch_size=1, shuffle=False)
# test_loader = DataLoader(test_graphs, batch_size=32, shuffle=False)

num_nodes: 111
node_features.shape: torch.Size([111, 4570])
torch.Size([2, 110])


In [None]:
from tqdm import tqdm
epoch = 10000   
lerning_rate = 0.01
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lerning_rate, capturable=True)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epoch, eta_min=0.000001)
mse_loss = torch.nn.MSELoss() 
loss_values = []
tttt=torch.randn(1, 256)
for epoch in tqdm(range(epoch)):
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        # print(f"Shape of data.x: {data.x.shape}")
        # encode= encoder(data.x, data.edge_index, data.batch)
        # print(f"Encode shape: {encode.shape}")
        
        decode = model(data.x, data.edge_index, data.batch)
        
        # decode = decoder(encode_copy, data.edge_index, data.batch)
        # print(f"Decoded output shape: {decode.shape}")

        loss = mse_loss(decode, data.x)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    avg_loss = total_loss / len(train_loader)
    loss_values.append(avg_loss)
    if epoch%500==0:
        print(f'Epoch {epoch+1}, Loss: {avg_loss}')

    # 训练完成后保存模型权重
model_path = f'gcn_model_ep_{epoch}_lr_{lerning_rate}_hidden_{hidden_channels}.pth'
torch.save(model.state_dict(), model_path)
print(f'Model saved to {model_path}')

# 绘制损失曲线
plt.figure(figsize=(10, 5))
plt.plot(loss_values, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
# 保存图像到本地文件
loss_curve_path = f'training_loss_curve_ep_{epoch}_lr_{lerning_rate}_hidden_{hidden_channels}.png'
plt.savefig(loss_curve_path)
plt.show()

# 测试模式
# model.eval()
# correct = 0
# total = 0
# with torch.no_grad():
#     # for data in test_loader:
#     for data in train_loader:

#         data = data.to(device)
#         out = model(data.x, data.edge_index, data.batch)
#         pred = out.argmax(dim=1)
#         correct += int((pred == data.y).sum())
#         total += data.num_graphs
# with torch.no_grad():
#     correct = 0
#     total = 0
#     # 遍历训练集或测试集
#     for data in train_loader:
#         data = data.to(device)
        
#         # 编码器进行编码
#         encode = encoder(data.x, data.edge_index, data.batch)
        
#         # 假设最终模型需要对编码结果进行分类 (也许解码器不是必要的，取决于你的模型设计)
#         # 如果你想对节点进行分类，可以直接用编码结果进行分类
#         # 例如，你可能有一个全连接层来将编码结果映射到类别空间：
#         # out = model_classifier(encode)
        
#         # 对于分类任务，我们通常会使用 `argmax` 选择最大概率的类别
#         pred = encode.argmax(dim=1)
        
#         # 计算正确的预测和总预测数
#         correct += (pred == data.y).sum().item()
#         total += data.num_graphs

#     accuracy = correct / total
#     print(f'Test Accuracy: {accuracy}')

# accuracy = correct / total
# print(f'Test Accuracy: {accuracy}')


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

encode_shape: torch.Size([1, 256])
encoded.unsqueeze(0).shape: torch.Size([1, 1, 256])
x.shape:  torch.Size([111, 4570])
repeat_shape: 111
encode_copy: tensor([[0.0826, 0.3475, 0.3146,  ..., 0.1937, 0.5253, 0.1286],
        [0.0826, 0.3475, 0.3146,  ..., 0.1937, 0.5253, 0.1286],
        [0.0826, 0.3475, 0.3146,  ..., 0.1937, 0.5253, 0.1286],
        ...,
        [0.0826, 0.3475, 0.3146,  ..., 0.1937, 0.5253, 0.1286],
        [0.0826, 0.3475, 0.3146,  ..., 0.1937, 0.5253, 0.1286],
        [0.0826, 0.3475, 0.3146,  ..., 0.1937, 0.5253, 0.1286]],
       device='cuda:0', grad_fn=<RepeatBackward0>)
encode_copy.shape torch.Size([111, 256])
decode: tensor([[[ 0.2288, -0.1293,  0.1112,  ..., -0.1652,  0.0214, -0.0374]]],
       device='cuda:0', grad_fn=<ViewBackward0>)
decoded.shape torch.Size([1, 1, 4570])





RuntimeError: mat1 and mat2 shapes cannot be multiplied (111x257 and 256x4570)

In [14]:
# import torch
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt
# import numpy as np

# # 加载模型权重进行测试
# model.load_state_dict(torch.load(model_path))
# print(f'Model loaded from {model_path}')

# # 收集预测和标签
# all_preds = []
# all_labels = []

# with torch.no_grad():
#     # for data in test_loader:     # 确保这里使用的是测试数据加载器
#     for data in train_loader:  # 确保这里使用的是测试数据加载器

#         # data = data.to(device)
#         data = {key: value.to(device) for key, value in data.items()}
#         out = model(data.x, data.edge_index, data.batch)
#         pred = out.argmax(dim=1)
#         all_preds.extend(pred.cpu().tolist())
#         all_labels.extend(data.y.cpu().tolist())

# # 计算混淆矩阵
# conf_matrix = confusion_matrix(all_labels, all_preds)

# # 提取混淆矩阵的值
# TN, FP, FN, TP = conf_matrix.ravel()

# # 计算准确率 (Accuracy)
# accuracy = (TP + TN) / (TP + TN + FP + FN)
# print(f'Accuracy: {accuracy:.4f}')

# # 计算精确率 (Precision)
# precision = TP / (TP + FP)
# print(f'Precision: {precision:.4f}')

# # 计算召回率 (Recall)
# recall = TP / (TP + FN)
# print(f'Recall: {recall:.4f}')

# # 可视化混淆矩阵
# fig, ax = plt.subplots(figsize=(4, 3))
# sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)

# # 标注 TP, FP, TN, FN
# labels = ['TN', 'FP', 'FN', 'TP']
# for i in range(2):
#     for j in range(2):
#         ax.text(j, i, f"{labels[i * 2 + j]}={conf_matrix[i, j]}", ha='center', va='center', color='red')

# ax.set_xlabel('Predicted Labels')
# ax.set_ylabel('True Labels')
# ax.set_title('Confusion Matrix with Annotations')
# plt.show()

# accuracy = np.sum(np.diag(conf_matrix)) / np.sum(conf_matrix)
# print(f'Test Accuracy: {accuracy}')
# print(f'all_labels: {all_labels}')
# print(f'all_preds: {all_preds}')
