# 轉向量

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import pandas as pd 
from tqdm import tqdm
import numpy as np

# 使用 cuda，如果有 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 載入 BERT 模型和斷詞機制
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# 設置模型為評估模式，以防止梯度計算
model.eval()

def sentences2list(path,key_1, key_2):
    df = pd.read_csv(path)
    sentence_merged = df[key_1]+df[key_2]
    label = df['label']
    
    
    return sentence_merged, label



def get_sentence_vector(sentence):
    # Tokenize 句子並轉換為張量
    # turning 會引用到
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)

    # 用於不進行梯度計算的上下文管理器
    with torch.no_grad():
        # 獲取 BERT 模型的最後一層的輸出
        outputs = model(**inputs)
    
    # 最後一層的所有 token 的向量
    last_hidden_state = outputs.last_hidden_state

    
    
    # 將所有 token 的向量相加，並除以 token 的數量，得到句子的向量
    sentence_vector = (last_hidden_state.sum(dim=1)/last_hidden_state.shape[1]).cpu().numpy().tolist()

    return sentence_vector



def turning(sentences_list):
    vectors = []
    total = len(sentences_list)
    for num, sen in tqdm(enumerate(sentences_list),total = total, desc="sentence to vector ..."):
        vector = str(get_sentence_vector(sen))
        vectors.append(vector)
    return vectors

path_=r"C:\Users\YuCheng_Ch\Desktop\accton\hr_datasets_5798 1.csv"
sentence_merged,label = sentences2list(path_, "sen1", "sen2")
vector = turning(sentence_merged)


pd.DataFrame({"sentence_merged":sentence_merged,"vector":vector,"label":label}).to_csv(path_,encoding="utf-8-sig")


# 移除前面檔案的左右括弧

In [None]:


import pandas as pd 
csv_file = path_

df = pd.read_csv(csv_file)

sentences_merged = df['sentence_merged'].values.tolist()
vectors = df['vector'].values.tolist()
labels = df['label'].values.tolist()


for num, vector in enumerate(vectors):
    vectors[num] = vector.replace('[[','[').replace(']]',']')



pd.DataFrame({"sentences_merged":sentences_merged,"vector":vectors,"label":labels}).to_csv(csv_file,encoding='utf-8-sig')

# 兩向量之間運算

In [None]:
import pandas as pd
import ast

# 讀取 CSV 檔案

csv_file = path_
df = pd.read_csv(csv_file, encoding='utf-8-sig')

# 定義一個函數來解析向量，並進行向量相加和除以字數總和
def process_row(row):
    # 解析 list 字符串為真正的 lis
    sen_vector = ast.literal_eval(row['vector1'])
    pre_vector = ast.literal_eval(row['vector2'])

    # 向量相加
    merged_vector = [s + p for s, p in zip(sen_vector, pre_vector)]

    # 計算字數總和
    total_characters = len(row['sen1']) + len(row['sen2'])

    # 將相加的向量除以字數總和
    final_vector = [v / total_characters for v in merged_vector]

    return final_vector

# 應用函數處理每一行
df['final_vector'] = df.apply(process_row, axis=1)

# 保存結果到新的 CSV 檔案
df.to_csv(csv_file, index=False, encoding='utf-8-sig')

print("向量處理完成，已保存到{}'。".format(csv_file))

# 訓練

In [10]:
import torch
from torch.utils import data as data_
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt
import torchvision
import pandas as pd 
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from torch import optim
import ast

# 定義輸出 CSV 檔案路徑
input_csv_path = csv_file # 替換成你的路徑
model_save_directory = r'C:\Users\YuCheng_Ch\Desktop\accton\onetime_train'
batch_size = 128



# 定義模型架構
class NN(nn.Module):
    def __init__(self):
        super(NN, self).__init__()
        


        self.linear1 = nn.Linear(768, 512)
        self.linear2 = nn.Linear(512, 256)
        self.linear3 = nn.Linear(256, 128)        
        self.linear4 = nn.Linear(128, 64)
        self.linear5 = nn.Linear(64, 16)
        self.linear6 = nn.Linear(16, 8)
        self.output = nn.Linear(8, 2)   

        
    def forward(self, x):
        x = self.linear1(x)
        #print(f'After linear1: {x.shape}')  # 調試輸出
        x = F.relu(x)
        #print(f'After relu1: {x.shape}')  # 調試輸出
        x = self.linear2(x)
        #print(f'After linear2: {x.shape}')  # 調試輸出
        x = F.relu(x)
        x = self.linear3(x)
        x = F.relu(x)
        x = self.linear4(x)
        x = F.relu(x)
        x = self.linear5(x)
        x = F.relu(x)
        x = self.linear6(x)
        x = F.relu(x)
        x = self.output(x)
  
        return x



df = pd.read_csv(input_csv_path)
sentences_merged = df['sentences_merged'].values.tolist()
vector = df['vector'].apply(ast.literal_eval).tolist()
label = df['label'].values.tolist()


vector = torch.tensor(vector,dtype=torch.float32).cuda()
vector = vector / torch.max(vector)



merged_label = df['label'].values.tolist()
target_tensor = torch.tensor(merged_label,dtype=torch.float64).cuda()
train_x, test_x, train_y, test_y = train_test_split(vector,target_tensor, test_size=0.1,shuffle=True)

train_x = torch.tensor(train_x, dtype=torch.float32).cuda()
test_x = torch.tensor(test_x, dtype=torch.float32).cuda()
train_y = torch.tensor(train_y, dtype=torch.long).cuda()
test_y = torch.tensor(test_y, dtype=torch.long).cuda()

#xy打包
train = torch.utils.data.TensorDataset(train_x, train_y)
test = torch.utils.data.TensorDataset(test_x, test_y)
#打包後迭帶
train_dataloader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=True)



LR = 5e-5
##### 加權損失函數
class_counts = np.bincount(train_y.cpu().numpy())
class_weights = 1. / class_counts
weights = torch.tensor(class_weights, dtype=torch.float32).cuda()
loss_fun = nn.CrossEntropyLoss(weight=weights)
##### 加權損失函數


model = NN().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)


import os 
epochs = 100
save_interval = 25
log_interval = 25


# 確保儲存資料夾存在
os.makedirs(model_save_directory, exist_ok=True)

for epoch in tqdm(range(epochs)):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for tensor, target in train_dataloader:   
        tensor = tensor.cuda()
        target = target.cuda()
        optimizer.zero_grad()
        result = model(tensor)
        loss = loss_fun(result, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(result.data, 1)
        total_train += target.size(0)
        correct_train += (predicted == target).sum().item()

    if (epoch + 1) % log_interval == 0:
        average_loss = running_loss / len(train_dataloader)
        train_accuracy = 100 * correct_train / total_train
        print("train:",f'Epoch [{epoch + 1}/{epochs}], Loss: {average_loss:.4f}, Accuracy: {train_accuracy:.2f}%')

    if (epoch + 1) % save_interval == 0:
        model.eval()
        correct = 0
        total = 0
        test_loss = 0.0

        with torch.no_grad():
            for tensor, target in test_dataloader:
                tensor = tensor.cuda()
                target = target.cuda()
                output = model(tensor)
                # for num in range(0,25):
                #     print(output[num][1]) #eval probability.
                loss = loss_fun(output, target)
                test_loss += loss.item()
                _, predicted = torch.max(output.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

        accuracy = 100 * correct / total
        test_loss /= len(test_dataloader)
        print("test:",f'Epoch [{epoch + 1}/{epochs}] - Accuracy on test data: {accuracy:.2f}%, Loss: {test_loss:.4f}')
        
        # Save the model
        model_path = os.path.join(model_save_directory, f'model_epoch_{epoch + 1}_acc_{accuracy:.2f}_loss_{test_loss:.4f}.pth')
        torch.save(model.state_dict(), model_path)
        print(f'Model saved to {model_path}')



  train_x = torch.tensor(train_x, dtype=torch.float32).cuda()
  test_x = torch.tensor(test_x, dtype=torch.float32).cuda()
  train_y = torch.tensor(train_y, dtype=torch.long).cuda()
  test_y = torch.tensor(test_y, dtype=torch.long).cuda()
 27%|██▋       | 27/100 [00:02<00:06, 12.14it/s]

train: Epoch [25/100], Loss: 0.4744, Accuracy: 78.17%
test: Epoch [25/100] - Accuracy on test data: 77.59%, Loss: 0.4981
Model saved to C:\Users\YuCheng_Ch\Desktop\accton\onetime_train\model_epoch_25_acc_77.59_loss_0.4981.pth


 51%|█████     | 51/100 [00:04<00:04, 11.98it/s]

train: Epoch [50/100], Loss: 0.4280, Accuracy: 80.37%
test: Epoch [50/100] - Accuracy on test data: 78.70%, Loss: 0.4602
Model saved to C:\Users\YuCheng_Ch\Desktop\accton\onetime_train\model_epoch_50_acc_78.70_loss_0.4602.pth


 77%|███████▋  | 77/100 [00:06<00:01, 12.18it/s]

train: Epoch [75/100], Loss: 0.3969, Accuracy: 82.17%
test: Epoch [75/100] - Accuracy on test data: 79.63%, Loss: 0.4509
Model saved to C:\Users\YuCheng_Ch\Desktop\accton\onetime_train\model_epoch_75_acc_79.63_loss_0.4509.pth


100%|██████████| 100/100 [00:08<00:00, 12.15it/s]

train: Epoch [100/100], Loss: 0.3718, Accuracy: 83.32%
test: Epoch [100/100] - Accuracy on test data: 80.19%, Loss: 0.4608
Model saved to C:\Users\YuCheng_Ch\Desktop\accton\onetime_train\model_epoch_100_acc_80.19_loss_0.4608.pth



