# 轉向量

In [5]:
import torch
from transformers import BertModel, BertTokenizer
import pandas as pd 
from tqdm import tqdm
import numpy as np

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained BERT encoder and its matching tokenizer
# NOTE(review): "bert-base-uncased" is an English checkpoint, while the CSV file name used
# later in this notebook is Chinese — confirm this is the intended model for the data.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name).to(device)

# Switch to evaluation mode (disables dropout). This does NOT turn off gradient
# tracking; get_sentence_vector does that separately with torch.no_grad().
model.eval()

def sentences2list(path,key_1, key_2):
    """Read a CSV file and concatenate two text columns row by row.

    Args:
        path: Path of the CSV file to read.
        key_1: Name of the column that supplies the first half of each string.
        key_2: Name of the column that supplies the second half of each string.

    Returns:
        A list of ``str`` with one entry per CSV row: the value of ``key_1``
        immediately followed by the value of ``key_2`` (no separator).

    Raises:
        KeyError: If either column is missing from the file.
    """
    df = pd.read_csv(path)

    def _as_text(column):
        # read_csv turns empty cells into NaN; NaN + str would leave a float NaN
        # in the result, which is not valid tokenizer input. Treat a missing cell
        # as empty text instead. Casting to object first keeps fillna("") valid
        # even when pandas inferred a numeric dtype for the column.
        return column.astype(object).fillna("").astype(str)

    sentence_merged = (_as_text(df[key_1]) + _as_text(df[key_2])).tolist()
    return sentence_merged



def get_sentence_vector(batch_sentences):
    """Encode a batch of sentences with BERT and pool each one into a single vector.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch_sentences: Sentences to encode; passed straight to the tokenizer.

    Returns:
        A list with one list of floats per input sentence.

    Notes:
        The pooling is NOT a plain per-sentence mean over real tokens:
        * the sum over ``dim=1`` runs over every position, padding included,
          because no attention mask is applied;
        * the sum is divided by ``torch.max`` of the whole batch output, so a
          sentence's vector depends on which other sentences share its batch;
        * the final divisor is the padded sequence length of the batch.
        NOTE(review): presumably the downstream classifier was trained on vectors
        produced exactly this way — confirm before changing the maths.
    """
    # Tokenize (pad to the longest sentence, truncate over-long ones) and move to the device
    encoded = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True).to(device)

    # Forward pass without gradient tracking; keep only the final layer's token vectors
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state

    token_sum = hidden.sum(dim=1)
    scaled = token_sum / torch.max(hidden)
    pooled = scaled / hidden.shape[1]

    return pooled.cpu().numpy().tolist()

def turning(sentences_list, batch_size=512):
    """Convert sentences to stringified vectors, processing them in batches.

    Args:
        sentences_list: Sentences to encode.
        batch_size: Number of sentences handed to ``get_sentence_vector`` per call.

    Returns:
        A list with one string per sentence: ``str()`` of the vector returned by
        ``get_sentence_vector`` for that sentence. An empty input gives ``[]``.
    """
    vectors = []
    batch_starts = range(0, len(sentences_list), batch_size)

    # tqdm takes its total from len(batch_starts), which is the exact batch count.
    # The previous `total // batch_size + 1` reported one batch too many whenever
    # the number of sentences was an exact multiple of batch_size.
    for start in tqdm(batch_starts, desc="sentence to vector ..."):
        batch_vectors = get_sentence_vector(sentences_list[start:start + batch_size])
        vectors.extend(str(vector) for vector in batch_vectors)

    return vectors


# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path_= r"C:\Users\YuCheng_Ch\Desktop\thesis\因此，檢察官對於起訴之犯罪事實，應負提出證據及說服之實質舉證責任。.csv"
sentence_merged = sentences2list(path_, "sen1", "sen2")
vector = turning(sentence_merged,batch_size=256)


# NOTE(review): this overwrites the input file, so its "sen1"/"sen2" columns are lost
# and this cell cannot be re-run against the same file. Later cells read path_, so the
# destination is left unchanged here.
# index=False keeps the row index from being written as an extra unnamed column.
pd.DataFrame({"sentence_merged":sentence_merged,"vector":vector}).to_csv(path_,encoding="utf-8-sig",index=False)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
sentence to vector ...:   0%|          | 0/2048 [00:00<?, ?it/s]

# 移除前面檔案的左右括弧

In [None]:


import pandas as pd 
# path_ is defined by the vectorisation cell earlier in the notebook
csv_file = path_

df = pd.read_csv(csv_file)

# This cell renames the sentence column when it writes the file back, so accept either
# spelling; that way running the cell a second time does not fail with a KeyError.
sentence_column = 'sentence_merged' if 'sentence_merged' in df.columns else 'sentences_merged'
sentences_merged = df[sentence_column].values.tolist()

# Collapse doubled outer brackets ("[[...]]" -> "[...]") in every stringified vector.
# A comprehension avoids a loop variable named `vector`, which would overwrite the
# `vector` list created by the previous cell.
vectors = [text.replace('[[','[').replace(']]',']') for text in df['vector'].values.tolist()]

# index=False keeps the row index from being written as an extra unnamed column
pd.DataFrame({"sentences_merged":sentences_merged,"vector":vectors}).to_csv(csv_file,encoding='utf-8-sig',index=False)

# 推論

In [None]:
import torch
import pandas as pd
import os
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np 
# Input CSV holding the sentences and their stringified vectors
input_csv_path = path_ # replace with your own path
# Checkpoint with the trained classifier weights
# NOTE(review): absolute, machine-specific path; consider a configurable model directory.
model_path = r'C:\Users\YuCheng_Ch\Desktop\thesis\onetime_train\model_epoch_200_acc_76.64_loss_0.3114.pth'
# Base of the output path; csv_file comes from the bracket clean-up cell
# (the no-op `csv_file=csv_file` self-assignment was removed)
csv_output_path = csv_file
# Number of vectors scored per forward pass
batch_size = 20480



# 定義模型架構
class NN(nn.Module):
    """Fully connected two-class classifier over 768-dimensional input vectors.

    Layer widths: 768 -> 512 -> 256 -> 128 -> 64 -> 16 -> 8 -> 2. The hidden layers
    are registered as ``linear1`` .. ``linear6`` and the last layer as ``output``.
    """

    # Input width followed by the width of each hidden layer
    _WIDTHS = (768, 512, 256, 128, 64, 16, 8)

    def __init__(self):
        super().__init__()
        # Register linear1..linear6 in order, so parameter names and initialisation
        # order are the same as when each layer is assigned explicitly
        for index, (fan_in, fan_out) in enumerate(zip(self._WIDTHS, self._WIDTHS[1:]), start=1):
            setattr(self, f"linear{index}", nn.Linear(fan_in, fan_out))
        self.output = nn.Linear(self._WIDTHS[-1], 2)

    def forward(self, x):
        """Return class probabilities for a batch of input vectors.

        Args:
            x: Float tensor whose last dimension is 768; softmax is taken over
               ``dim=1``, so a 2-D ``(batch, 768)`` input is expected.

        Returns:
            Tensor with 2 values per row that sum to 1. These are already softmax
            probabilities, not logits, so callers must not apply softmax again.
        """
        for index in range(1, len(self._WIDTHS)):
            x = F.relu(getattr(self, f"linear{index}")(x))
        return F.softmax(self.output(x), dim=1)



import ast

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv(input_csv_path)

# Sentences and their vectors (stored in the CSV as stringified Python lists)
sen = (df['sentences_merged']).values.tolist()
# literal_eval only parses Python literals, so CSV content can never execute code
# the way eval() could
final_vector = df['vector'].apply(ast.literal_eval).tolist()


# NOTE(review): this rebinds the global `model`, replacing the BERT encoder loaded in
# the first cell; re-run that cell before calling get_sentence_vector again.
model = NN()
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine
model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
model = model.to(device)
model.eval()

# Convert the vectors to a tensor and scale by the largest value in the whole data set
# NOTE(review): the scale depends on the data being scored — confirm it matches the
# normalisation used when the classifier was trained.
merged_vectors = torch.tensor(final_vector, dtype=torch.float32).to(device)
max_num = torch.max(merged_vectors)
merged_vectors = merged_vectors/max_num

# Score the vectors in batches, keeping the original row order
dataset = TensorDataset(merged_vectors)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

class_1_probabilities = []

with torch.no_grad():
    for batch in dataloader:
        batch = batch[0].to(device)
        # NN.forward already ends with softmax, so its output is used as the
        # probabilities directly. Applying softmax a second time would squeeze the
        # two-class probabilities into roughly [0.27, 0.73].
        probabilities = model(batch)
        print(f'Probabilities: {probabilities[:2]}')  # first 2 rows of each batch
        class_1_probabilities.extend(probabilities[:, 1].cpu().numpy())



# Save the results; the output file name is csv_output_path with "_" appended
pd.DataFrame({'sentence': sen, 'class_1_probabilities': class_1_probabilities}).to_csv(csv_output_path+"_", encoding='utf-8-sig', index=False)