# 轉向量

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import pandas as pd 
from tqdm import tqdm
import numpy as np

# Use CUDA when a GPU is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BERT model and its tokenizer, and move the model to
# the selected device. `tokenizer`, `model` and `device` are read as globals
# by get_sentence_vector().
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# Switch the model to evaluation (inference) mode. Note that eval() does not
# by itself disable gradient tracking; that is done with torch.no_grad()
# inside get_sentence_vector().
model.eval()

def sentences2list(path, key_1, key_2, label_key="label"):
    """Read a sentence-pair CSV and return its columns as plain lists.

    Args:
        path: Path of the CSV file to read.
        key_1: Name of the column holding the first sentence of each pair.
        key_2: Name of the column holding the second sentence of each pair.
        label_key: Name of the column holding the labels. Defaults to
            "label", which was previously hard-coded.

    Returns:
        A tuple ``(sentences_1, sentences_2, label)`` of three lists, one
        entry per CSV row, in file order.

    Raises:
        KeyError: If any of the requested columns is missing from the CSV.
    """
    df = pd.read_csv(path)
    sentences_1 = df[key_1].tolist()
    sentences_2 = df[key_2].tolist()
    label = df[label_key].tolist()

    return sentences_1, sentences_2, label



def get_sentence_vector(sentence):
    """Embed a sentence (or a batch of sentences) by mean-pooling BERT tokens.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sentence: A string, or a list of strings, accepted by the tokenizer.

    Returns:
        A nested Python list with one inner list per input sentence; each
        inner list is the average of that sentence's last-layer token
        vectors. A single string therefore yields ``[[...]]``.
    """
    # Tokenize the sentence and convert it to tensors on the target device.
    # (Called by turning().)
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # Vectors of every token from the last layer.
    last_hidden_state = outputs.last_hidden_state

    # Average over real tokens only. Dividing by the padded sequence length
    # would let [PAD] positions dilute the mean whenever a batch contains
    # sentences of different lengths. For a single sentence the mask is all
    # ones, so the result is the same as a plain mean over the sequence.
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)
    token_sum = (last_hidden_state * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)
    sentence_vector = (token_sum / token_count).cpu().numpy().tolist()

    return sentence_vector



def turning(sentences_list):
    """Convert every sentence to its vector, serialized as a string.

    Each sentence is passed to get_sentence_vector() and the resulting
    nested list is stringified with str() so it can be stored in a CSV cell.
    A tqdm progress bar tracks the conversion.

    Args:
        sentences_list: Sequence of sentences to embed.

    Returns:
        A list of strings, one per input sentence, in input order.
    """
    progress = tqdm(sentences_list, total=len(sentences_list), desc="sentence to vector ...")
    return [str(get_sentence_vector(sentence)) for sentence in progress]

# Dataset CSV holding the sentence pairs. It is read here and then
# overwritten in place with the two added vector columns.
path_ = r"C:\Users\YuCheng_Ch\Desktop\accton\hr_datasets_5798.csv"
sentence_1, sentence_2, label = sentences2list(path=path_, key_1="sen1", key_2="sen2")
vector_1 = turning(sentence_1)
vector_2 = turning(sentence_2)

# index=False: the file is re-read and re-written by later steps; writing the
# DataFrame index would add a stray "Unnamed: 0" column on every round trip.
pd.DataFrame(
    {
        "sen1": sentence_1,
        "vector1": vector_1,
        "sen2": sentence_2,
        "vector2": vector_2,
        "label": label,
    }
).to_csv(path_, encoding="utf-8-sig", index=False)


# 移除前面檔案的左右括弧

In [None]:



import pandas as pd
csv_file = path_

df = pd.read_csv(csv_file)

sen1 = df['sen1'].values.tolist()
sen2 = df['sen2'].values.tolist()
label = df['label'].values.tolist()

# The vectors were stored as stringified nested lists ("[[...]]"); strip the
# outer pair of brackets so each cell parses as a flat list ("[...]").
vector_1 = [
    vectors.replace('[[', '[').replace(']]', ']')
    for vectors in df['vector1'].values.tolist()
]
vector_2 = [
    vectors.replace('[[', '[').replace(']]', ']')
    for vectors in df['vector2'].values.tolist()
]

# Write back the sentences that were just loaded from the file (sen1/sen2),
# not variables left over from an earlier cell. index=False keeps a stray
# "Unnamed: 0" column from being added each time the file is rewritten.
pd.DataFrame(
    {
        "sen1": sen1,
        "vector1": vector_1,
        "sen2": sen2,
        "vector2": vector_2,
        "label": label,
    }
).to_csv(csv_file, encoding='utf-8-sig', index=False)

# 訓練

In [None]:
import torch
from torch.utils import data as data_
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt
import torchvision
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch import optim
import ast
import os 
# Input CSV with the sentence vectors and labels (reuses path_ from the
# embedding step), and the directory where model checkpoints are written.
input_csv_path = path_ # replace with your own path
model_save_directory = r'C:\Users\YuCheng_Ch\Desktop\datasets\onetime_train'
# Mini-batch size and number of training epochs.
batch_size = 128
epochs = 500


# 定義模型架構
# Model architecture
class NN(nn.Module):
    """Fully connected classifier: 768 input features -> 2 output scores.

    Six hidden linear layers (768-512-256-128-64-16-8), each followed by a
    ReLU, feed a final linear layer that produces two raw scores per sample
    (no softmax is applied here).
    """

    def __init__(self):
        super().__init__()

        # Layers are created in this order and under these attribute names,
        # which determine the keys of the saved state_dict.
        self.linear1 = nn.Linear(768, 512)
        self.linear2 = nn.Linear(512, 256)
        self.linear3 = nn.Linear(256, 128)
        self.linear4 = nn.Linear(128, 64)
        self.linear5 = nn.Linear(64, 16)
        self.linear6 = nn.Linear(16, 8)
        self.output = nn.Linear(8, 2)

    def forward(self, x):
        """Apply the hidden layers with ReLU, then the output layer."""
        hidden_layers = (
            self.linear1,
            self.linear2,
            self.linear3,
            self.linear4,
            self.linear5,
            self.linear6,
        )
        for layer in hidden_layers:
            x = F.relu(layer(x))

        return self.output(x)



# Run on the GPU when one is available, otherwise fall back to the CPU
# (the previous hard-coded .cuda() calls failed on CPU-only machines).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv(input_csv_path)
sent1 = df['sen1'].values.tolist()
sent2 = df['sen2'].values.tolist()
# The vector columns hold stringified Python lists; parse them back.
vector1 = df['vector1'].apply(ast.literal_eval).tolist()
vector2 = df['vector2'].apply(ast.literal_eval).tolist()

# Scale each set of vectors by its global maximum, then average the two
# sentence vectors of every pair into one feature vector.
tensor_vector1 = torch.tensor(vector1, dtype=torch.float32).to(device)
tensor_vector1 = tensor_vector1 / torch.max(tensor_vector1)
tensor_vector2 = torch.tensor(vector2, dtype=torch.float32).to(device)
tensor_vector2 = tensor_vector2 / torch.max(tensor_vector2)
merged_tensor = (tensor_vector1 + tensor_vector2) / 2
# The vectors were already divided by the token count when they were
# created, so no further division is needed here.

merged_label = df['label'].values.tolist()
# CrossEntropyLoss expects integer (long) class indices.
target_tensor = torch.tensor(merged_label).long().to(device)

# Split row indices rather than the tensors themselves: this keeps
# scikit-learn away from (possibly CUDA) tensors and avoids re-wrapping
# existing tensors with torch.tensor(), which warns and copies.
all_indices = np.arange(len(merged_label))
train_idx, test_idx = train_test_split(all_indices, test_size=0.1, shuffle=True)
train_idx = torch.as_tensor(train_idx, dtype=torch.long, device=device)
test_idx = torch.as_tensor(test_idx, dtype=torch.long, device=device)

train_x = merged_tensor[train_idx]
test_x = merged_tensor[test_idx]
train_y = target_tensor[train_idx]
test_y = target_tensor[test_idx]


# Bundle features and labels.
train = torch.utils.data.TensorDataset(train_x, train_y)
test = torch.utils.data.TensorDataset(test_x, test_y)
# Iterate over the bundled data in mini-batches. Shuffling the evaluation
# set has no effect on the aggregated metrics, so it is left in order.
train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)



LR = 5e-5
##### Weighted loss function
# Weight each class by the inverse of its frequency in the training split.
# minlength=2 keeps one weight per output class even if a class is absent
# from the split; the max(…, 1) guard avoids a division by zero (inf weight).
class_counts = np.bincount(train_y.cpu().numpy(), minlength=2)
class_weights = 1. / np.maximum(class_counts, 1)
weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
loss_fun = nn.CrossEntropyLoss(weight=weights)
##### Weighted loss function


# NOTE: this rebinds the global name `model`. If an earlier cell stored the
# BERT encoder under the same name (get_sentence_vector reads it as a
# global), re-run that cell before embedding more sentences.
model = NN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)



save_interval = 25
log_interval = 25


# Make sure the checkpoint directory exists.
os.makedirs(model_save_directory, exist_ok=True)

for epoch in tqdm(range(epochs)):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    # The datasets already live on `device`, so batches need no transfer.
    for tensor, target in train_dataloader:
        optimizer.zero_grad()
        result = model(tensor)
        loss = loss_fun(result, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = result.argmax(dim=1)
        total_train += target.size(0)
        correct_train += (predicted == target).sum().item()

    if (epoch + 1) % log_interval == 0:
        average_loss = running_loss / len(train_dataloader)
        train_accuracy = 100 * correct_train / total_train
        print("train:",f'Epoch [{epoch + 1}/{epochs}], Loss: {average_loss:.4f}, Accuracy: {train_accuracy:.2f}%')

    # Every `save_interval` epochs: evaluate on the held-out split and save
    # a checkpoint whose file name records the epoch, accuracy and loss.
    if (epoch + 1) % save_interval == 0:
        model.eval()
        correct = 0
        total = 0
        test_loss = 0.0

        with torch.no_grad():
            for tensor, target in test_dataloader:
                output = model(tensor)
                loss = loss_fun(output, target)
                test_loss += loss.item()
                predicted = output.argmax(dim=1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

        accuracy = 100 * correct / total
        test_loss /= len(test_dataloader)
        print("test:",f'Epoch [{epoch + 1}/{epochs}] - Accuracy on test data: {accuracy:.2f}%, Loss: {test_loss:.4f}')

        # Save the model
        model_path = os.path.join(model_save_directory, f'model_epoch_{epoch + 1}_acc_{accuracy:.2f}_loss_{test_loss:.4f}.pth')
        torch.save(model.state_dict(), model_path)
        print(f'Model saved to {model_path}')

