# 轉向量

In [None]:
import torch
from transformers import BertModel, BertTokenizer
import pandas as pd 
from tqdm import tqdm
import numpy as np

# Run on CUDA when a GPU is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BERT model and its matching tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# Put the model in evaluation mode (inference behaviour for layers such as
# dropout). Gradient tracking is switched off separately with torch.no_grad()
# inside get_sentence_vector.
model.eval()

def sentences2list(path, key_1, key_2):
    """Read a CSV file and return two of its columns as Python lists.

    Args:
        path: Path of the CSV file to read.
        key_1: Name of the first column to extract.
        key_2: Name of the second column to extract.

    Returns:
        A ``(values_of_key_1, values_of_key_2)`` tuple of lists, in row order.
    """
    frame = pd.read_csv(path)
    first_column, second_column = (
        frame[column_name].values.tolist() for column_name in (key_1, key_2)
    )
    return first_column, second_column



def get_sentence_vector(sentence):
    """Embed a sentence as the mean of BERT's last-layer token vectors.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. The mean is
    weighted by the attention mask, so padding positions (which only appear
    when a batch of sentences of different lengths is passed) do not dilute
    the average. For a single sentence the mask is all ones and the result is
    the plain mean over every token, special tokens included.

    Args:
        sentence: A sentence string, or a list of sentence strings.

    Returns:
        A nested list with one row per input sentence; each row holds the
        sentence vector as floats (so a single sentence yields ``[[...]]``).
    """
    # Tokenize the sentence(s) and move the resulting tensors to the device
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)

    # Inference only: no gradient bookkeeping needed
    with torch.no_grad():
        outputs = model(**inputs)

    # Vectors of every token from the last layer: (batch, tokens, hidden)
    last_hidden_state = outputs.last_hidden_state

    # 1.0 for real tokens, 0.0 for padding; trailing axis added for broadcasting
    mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_state.dtype)

    # Sum the real-token vectors and divide by the number of real tokens
    summed = (last_hidden_state * mask).sum(dim=1)
    token_counts = mask.sum(dim=1)
    sentence_vector = (summed / token_counts).cpu().numpy().tolist()

    return sentence_vector



def turning(sentences_list):
    """Convert every sentence into the string form of its sentence vector.

    Args:
        sentences_list: List of sentences to embed with get_sentence_vector.

    Returns:
        A list of strings, one per sentence, each being ``str()`` of the
        nested list returned by get_sentence_vector.
    """
    progress = tqdm(
        sentences_list,
        total=len(sentences_list),
        desc="sentence to vector ...",
    )
    return [str(get_sentence_vector(sentence)) for sentence in progress]

# CSV holding the sentence pairs; the results are written back to this same file
path_=r"C:\Users\YuCheng_Ch\Desktop\accton\甲方vs全部.csv"
sentence_1, sentence_2 = sentences2list(path=path_, key_1="sen1", key_2="sen2")
vector_1 = turning(sentence_1)
vector_2 = turning(sentence_2)

# index=False keeps the row index out of the file; otherwise every later
# read_csv/to_csv round trip picks up an extra "Unnamed: 0" column.
pd.DataFrame(
    {"sen1": sentence_1, "vector1": vector_1, "sen2": sentence_2, "vector2": vector_2}
).to_csv(path_, index=False, encoding="utf-8-sig")


# 移除前面檔案的左右括弧

In [None]:

import pandas as pd 
csv_file = path_

df = pd.read_csv(csv_file)

sen1 = df['sen1'].values.tolist()
vector_1 = df['vector1'].values.tolist()
sen2 = df['sen2'].values.tolist()
vector_2 = df['vector2'].values.tolist()

# Each stored vector is the text of a one-row nested list ("[[...]]");
# collapse the doubled brackets so it parses as a flat list later on.
vector_1 = [text.replace('[[', '[').replace(']]', ']') for text in vector_1]
vector_2 = [text.replace('[[', '[').replace(']]', ']') for text in vector_2]

# Write back the sentences that were just read from the file (sen1/sen2)
# rather than variables left over from an earlier cell, and keep the row
# index out of the CSV.
pd.DataFrame(
    {"sen1": sen1, "vector1": vector_1, "sen2": sen2, "vector2": vector_2}
).to_csv(csv_file, index=False, encoding='utf-8-sig')

# Add the two vectors and divide by the total character count of both sentences

In [None]:
import pandas as pd
import ast

# Read the CSV file (same path as the earlier cells)

csv_file = path_
df = pd.read_csv(csv_file, encoding='utf-8-sig')

def process_row(row):
    """Combine the two sentence vectors of one row into a single vector.

    The vectors stored as list-literal text under ``'vector1'`` and
    ``'vector2'`` are parsed, added element by element, and every sum is
    divided by the combined length of the ``'sen1'`` and ``'sen2'`` strings.

    Args:
        row: Mapping-like row (for example a DataFrame row) providing the
            keys ``'vector1'``, ``'vector2'``, ``'sen1'`` and ``'sen2'``.

    Returns:
        A list of floats; its length is that of the shorter parsed vector.
    """
    # Denominator: total number of characters in both sentences
    char_count = len(row['sen1']) + len(row['sen2'])

    # Turn the stored list literals back into real lists
    first_vec = ast.literal_eval(row['vector1'])
    second_vec = ast.literal_eval(row['vector2'])

    # Element-wise sum, scaled by the character count
    return [(a + b) / char_count for a, b in zip(first_vec, second_vec)]

# Apply process_row to every row and store the result in a new column
df['final_vector'] = df.apply(process_row, axis=1)

# Write the result back to the same CSV file (the input is overwritten)
df.to_csv(csv_file, index=False, encoding='utf-8-sig')

# The original message had an unbalanced quote after the path; quote it on both sides
print("向量處理完成，已保存到'{}'。".format(csv_file))

# 推論

In [None]:
import torch
import pandas as pd
import os
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np 

# Paths and batch size used by the inference cell:
# input CSV with the combined vectors, trained model weights, prediction output CSV
input_csv_path = path_ # replace with your own path
model_path = r'C:\Users\YuCheng_Ch\Desktop\datasets\onetime_train\model_epoch_175_acc_70.93_loss_0.7378.pth'
csv_output_path = r'C:\Users\YuCheng_Ch\Desktop\accton\預測結果\甲方vs全部.csv'
batch_size = 128



# Model architecture
class NN(nn.Module):
    """Fully connected classifier: 768 inputs, six hidden layers, 2 outputs.

    The layer attributes keep the names ``linear1`` .. ``linear6`` and
    ``output``. The parameters of ``linear6`` and ``output`` are created with
    ``requires_grad`` set to False.
    """

    def __init__(self):
        super().__init__()

        self.linear1 = nn.Linear(768, 512)
        self.linear2 = nn.Linear(512, 256)
        self.linear3 = nn.Linear(256, 128)
        self.linear4 = nn.Linear(128, 64)
        self.linear5 = nn.Linear(64, 16)
        self.linear6 = nn.Linear(16, 8)
        self.output = nn.Linear(8, 2)

        # Freeze the last hidden layer and the output layer
        for frozen_layer in (self.linear6, self.output):
            for weight in frozen_layer.parameters():
                weight.requires_grad = False

    def forward(self, x):
        """Return softmax scores over the 2 outputs, taken along dim 1.

        Args:
            x: Float tensor whose last dimension is 768; presumably shaped
                (batch, 768), since the softmax is applied along dim 1.

        Returns:
            Tensor of softmax-normalised scores (already probabilities, not
            raw logits).
        """
        hidden_layers = (
            self.linear1,
            self.linear2,
            self.linear3,
            self.linear4,
            self.linear5,
            self.linear6,
        )
        # Every hidden layer is followed by a ReLU
        for layer in hidden_layers:
            x = F.relu(layer(x))

        return F.softmax(self.output(x), dim=1)



import ast  # cell-local import so this cell does not rely on an earlier cell's import

df = pd.read_csv(input_csv_path)

# Sentence text (sen1 and sen2 concatenated) and the combined vectors.
# The vectors are stored as list literals, so they are parsed with
# ast.literal_eval (as process_row does) instead of eval, which would
# execute arbitrary code found in the CSV.
sen = (df['sen1']+df['sen2']).values.tolist()
final_vector = df['final_vector'].apply(ast.literal_eval).tolist()

# Use the GPU when present, otherwise the CPU (same rule as the embedding cell)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): this rebinds the module-level name `model`, which the
# embedding cell uses for the BERT model; re-run that cell before calling
# get_sentence_vector again.
model = NN()
# map_location lets weights saved on a GPU load on a CPU-only machine
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
model = model.to(device)
model.eval()

# Convert the vectors to a tensor and scale by the largest value in the data.
# NOTE(review): the scale depends on the data being predicted; confirm it
# matches the normalisation used during training.
merged_vectors = torch.tensor(final_vector, dtype=torch.float32).to(device)
max_num = torch.max(merged_vectors)
merged_vectors = merged_vectors/max_num

# Predict in batches with a DataLoader (order preserved: shuffle=False)
dataset = TensorDataset(merged_vectors)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

class_1_probabilities = []

with torch.no_grad():
    for batch in dataloader:
        batch = batch[0].to(device)
        # NOTE(review): NN.forward already ends with a softmax, so `logits`
        # holds probabilities and the softmax below is applied a second time.
        # Kept as-is because the intended scores depend on how the model was
        # trained; confirm before removing either softmax.
        logits = model(batch)
        print(f'Logits: {logits[:2]}')  # first two rows of the model output
        logits = F.softmax(logits, dim=1)
        print(f'Softmax: {logits[:2]}')  # first two rows after the extra softmax
        class_1_probabilities.extend(logits[:, 1].cpu().numpy())



# Save the results to CSV, creating the output directory first so that
# to_csv does not fail when the directory is missing
output_dir = os.path.dirname(csv_output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
pd.DataFrame({'sentence': sen, 'class_1_probabilities': class_1_probabilities}).to_csv(csv_output_path, encoding='utf-8-sig', index=False)