In [1]:
import spacy
import torch
import json
import numpy as np
from tqdm import tqdm
from transformers import RobertaTokenizer, RobertaModel
from scipy.sparse import csr_matrix

In [2]:
import subprocess
import os

# Source /etc/network_turbo inside a bash subshell and list every environment
# entry whose line contains "proxy" (presumably proxy settings provided by the
# host image — confirm).
proxy_probe = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)

# Copy each NAME=value line into this kernel's environment; lines without an
# '=' are ignored, and only the first '=' separates name from value.
for env_line in proxy_probe.stdout.splitlines():
    name, separator, setting = env_line.partition('=')
    if separator:
        os.environ[name] = setting

In [3]:
# Load the English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

# The RoBERTa tokenizer and encoder are read from a local checkpoint directory.
# Hub-based alternative, kept for reference:
# model = RobertaModel.from_pretrained("roberta-base")
# tokenizer = RobertaTokenizer.from_pretrained("roberta-base", do_lower_case=False)
tokenizer = RobertaTokenizer(
    vocab_file="./roberta-base/vocab.json",
    merges_file="./roberta-base/merges.txt",
    use_fast=False,
)
model = RobertaModel.from_pretrained("./roberta-base/")

# Number of entries known to the tokenizer.
vocab_size = len(tokenizer)

Some weights of the model checkpoint at ./roberta-base/ were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
# texts = list()
# with open("original_text/hc3human.json", "r", encoding="utf-8") as f:
#     for line in f.readlines():
#         texts.append(line)
# with open("original_text/hc3chatgpt.json", "r", encoding="utf-8") as f:
#     for line in f.readlines():
#         texts.append(line)
# print(len(texts))

10000


In [17]:
# import random
# random.seed(42)

# random.shuffle(texts)

# with open("original_text/hc3_train.json", "w", encoding="utf-8") as f:
#     for i in range(0, 8000):
#         f.write(texts[i])
# with open("original_text/hc3_val.json", "w", encoding="utf-8") as f:
#     for i in range(8000, 9000):
#         f.write(texts[i])
# with open("original_text/hc3_test.json", "w", encoding="utf-8") as f:
#     for i in range(9000, 10000):
#         f.write(texts[i])

In [4]:
# Read the corpus: every non-blank line of each file is one JSON object whose
# 'text' field is collected. The hc3human file is read before the hc3chatgpt
# file, so `texts` keeps that order (the label vector built later in the
# notebook appears to rely on it).
texts = list()
for corpus_path in ("original_text/hc3human.json", "original_text/hc3chatgpt.json"):
    with open(corpus_path, "r", encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # A blank line is not a JSON document; json.loads("") would raise.
                continue
            texts.append(json.loads(line)['text'])
print(len(texts))

10000


In [53]:
# Per-text results. The three lists below are appended together, so index k in
# each of them always refers to the same text.
tokenized_sentences = list()
all_token_embeddings = list()
all_edge_index = list()
# Never populated in this cell; the name is kept so existing references still resolve.
all_sparse_adj_matrix = list()
# Positions in `texts` for which no graph could be built. Any label vector that
# is aligned with `texts` must drop these positions before it is paired with
# the lists above.
failed_text_indices = list()

# Number of tokens sent through the encoder per forward pass.
max_length = 512


def _text_to_graph(text):
    """Turn one text into (tokens, node embeddings, edge index).

    The text is parsed with spaCy; the token strings are looked up in the
    RoBERTa vocabulary, encoded in chunks of at most `max_length` tokens, and
    the last hidden states of all chunks are concatenated so that there is one
    embedding row per spaCy token. Edges follow the dependency parse.

    Raises:
        ValueError: if spaCy yields no tokens for the text.
        Exception: anything raised by spaCy, the tokenizer or the encoder.
    """
    doc = nlp(text)
    tokenized_sentence = [token.text for token in doc]
    if not tokenized_sentence:
        # Without tokens there is nothing to encode or concatenate.
        raise ValueError("text produced no tokens")

    chunk_outputs = []
    for start in range(0, len(tokenized_sentence), max_length):
        chunk = tokenized_sentence[start:start + max_length]
        # NOTE(review): spaCy token strings are looked up directly in the
        # tokenizer vocabulary; strings that are not vocabulary entries are
        # expected to map to the unknown-token id, and no special tokens are
        # added. Confirm this is intended rather than sub-word tokenisation.
        token_ids = tokenizer.convert_tokens_to_ids(chunk)
        input_ids = torch.tensor(token_ids).unsqueeze(0)
        with torch.no_grad():
            output = model(input_ids)
        # Drop the batch dimension added by unsqueeze(0).
        chunk_outputs.append(output.last_hidden_state[0])
    token_embeddings = torch.cat(chunk_outputs, dim=0)

    # One edge from each token to its syntactic head, plus an explicit
    # self-loop for every token that is not its own head (a token that is its
    # own head already gets a self-loop from the first edge).
    edge0 = list()
    edge1 = list()
    for word in doc:
        edge0.append(word.i)
        edge1.append(word.head.i)
        if word.i != word.head.i:
            edge0.append(word.i)
            edge1.append(word.i)
    edge_index = torch.tensor([edge0, edge1], dtype=torch.long)
    return tokenized_sentence, token_embeddings, edge_index


for text_index, text in enumerate(tqdm(texts)):
    try:
        tokenized_sentence, token_embeddings, edge_index = _text_to_graph(text)
    except Exception as e:
        # Best effort: report the failing text and keep going, but remember its
        # position so downstream labels can be realigned.
        failed_text_indices.append(text_index)
        print(text)
        print(e)
        continue
    # Append only after every step succeeded so the lists never drift apart.
    tokenized_sentences.append(tokenized_sentence)
    all_token_embeddings.append(token_embeddings)
    all_edge_index.append(edge_index)

if failed_text_indices:
    print(f"{len(failed_text_indices)} of {len(texts)} texts were skipped; "
          f"their positions are stored in failed_text_indices")

100%|██████████| 10000/10000 [27:04<00:00,  6.16it/s] 


In [19]:
# Float label vector of length 10000: the first 5000 entries are 0, the last 5000 are 1.
# NOTE(review): presumably 0 = human and 1 = ChatGPT, matching the order in
# which the two corpus files are read and assuming 5000 usable texts from each
# — confirm against the data.
y = torch.zeros(10000)
y[5000:] = 1.0
y

tensor([0., 0., 0.,  ..., 1., 1., 1.])

In [31]:
import pickle
# Persist the node embeddings, the edge indices and the label vector as three
# separate pickle files under graph_data/, written in that order.
for pickle_path, payload in (
    ("graph_data/hc3_all_token_embeddings.pkl", all_token_embeddings),
    ("graph_data/hc3_all_edge_index.pkl", all_edge_index),
    ("graph_data/hc3_y.pkl", y),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

训练模型

In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from tqdm import tqdm

# GCN-based classifier that scores one graph at a time
class GCN(nn.Module):
    """Two GCNConv layers, a mean over dimension 0, and a sigmoid-activated linear head.

    Args:
        input_dim: input feature size of the first graph convolution.
        hidden_dim: output size of the first graph convolution.
        output_dim: output size of the second graph convolution, i.e. the
            width fed into the final linear layer (which emits a single value).
    """

    def __init__(self,  input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.fc = nn.Linear(output_dim, 1)

    def forward(self, data):
        """Return the sigmoid score for the graph held in `data`.

        `data` must expose `x` and `edge_index`. The features are averaged
        over dimension 0 (the node dimension, assuming `x` is a 2-D
        node-feature matrix) before the linear head is applied.
        """
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        hidden = F.relu(self.conv2(hidden, data.edge_index))
        pooled = hidden.mean(dim=0, keepdim=True)
        return torch.sigmoid(self.fc(pooled))

In [60]:
import pickle
import random
random.seed(42)

# with open("graph_data/hc3_all_token_embeddings.pkl", "rb") as f:
#     all_token_embeddings = pickle.load(f)
# with open("graph_data/hc3_all_edge_index.pkl", "rb") as f:
#     all_edge_index = pickle.load(f)
# with open("graph_data/hc3_y.pkl", "rb") as f:
#     y = pickle.load(f)
# print(len(all_token_embeddings), len(all_edge_index), len(y))

# zip() silently stops at its shortest input, so a length mismatch would drop
# samples and could pair graphs with the wrong labels. Fail loudly instead.
n_embeddings = len(all_token_embeddings)
n_edge_indices = len(all_edge_index)
n_labels = len(y)
if not (n_embeddings == n_edge_indices == n_labels):
    raise ValueError(
        "embeddings, edge indices and labels are misaligned: "
        f"{n_embeddings} / {n_edge_indices} / {n_labels}"
    )
if n_embeddings == 0:
    raise ValueError("there are no samples to shuffle")

# Shuffle the three sequences jointly so that index k still refers to the same
# sample in each of them.
combined_list = list(zip(all_token_embeddings, all_edge_index, y))
random.shuffle(combined_list)
# The names are rebound to tuples (for a 1-D tensor `y`, a tuple of 0-dim
# tensors). Re-running this cell shuffles the already shuffled data again.
all_token_embeddings, all_edge_index, y = zip(*combined_list)

In [61]:
# Training setup
# Seed torch's RNG so that the weight initialisation of the GCN below is
# repeatable from run to run.
torch.manual_seed(42)

input_dim = 768   # input feature size of the first GCN layer; must equal the width of the token embeddings
hidden_dim = 256  # output width of the first GCN layer
output_dim = 2    # output width of the second GCN layer (the model's final linear layer emits one value)

# The first 80% of the samples are used for training, the remainder for validation.
train_len = int(len(all_token_embeddings)*0.8)
val_len = len(all_token_embeddings) - train_len

gcnmodel = GCN(input_dim, hidden_dim, output_dim)
optimizer = optim.Adam(gcnmodel.parameters(), lr=0.001)
# Binary cross-entropy on probabilities; the model applies the sigmoid itself.
criterion = nn.BCELoss()

In [62]:
epochs = 20
# Per-epoch history, one entry appended per epoch.
train_loss = []
val_loss = []
train_acc = []
val_acc = []


def _run_split(sample_indices, desc, training):
    """Make one pass over the given samples and return (mean loss, accuracy).

    Args:
        sample_indices: sized iterable of positions into all_token_embeddings,
            all_edge_index and y.
        desc: label shown on the progress bar.
        training: when True the optimizer is stepped after every sample; when
            False the pass runs with autograd disabled, so no computation
            graph is built for samples that are only being evaluated.

    Returns:
        (mean loss per sample, fraction of samples predicted correctly), or
        (nan, nan) when `sample_indices` is empty.
    """
    gcnmodel.train(training)
    total_loss = 0.0
    correct_predictions = 0
    with torch.set_grad_enabled(training):
        for i in tqdm(sample_indices, desc):
            data = Data(x=all_token_embeddings[i], edge_index=all_edge_index[i], y=y[i])
            if training:
                optimizer.zero_grad()
            outputs = gcnmodel(data)
            loss = criterion(outputs, data.y.float().view(-1, 1))
            if training:
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
            # A score of at least 0.5 counts as class 1.
            predictions = (outputs >= 0.5).long()
            correct_predictions += (predictions == data.y.view(-1, 1)).sum().item()
    n_samples = len(sample_indices)
    if n_samples == 0:
        # Avoid a ZeroDivisionError on an empty split.
        return float("nan"), float("nan")
    return total_loss / n_samples, correct_predictions / n_samples


for epoch in range(epochs):
    # Training split: the first train_len samples.
    epoch_loss, epoch_acc = _run_split(
        range(train_len), f"epoch: {epoch+1}, Training", training=True
    )
    print(f"epoch: {epoch+1}, train_loss: {epoch_loss}, train_acc: {epoch_acc}")
    train_loss.append(epoch_loss)
    train_acc.append(epoch_acc)

    # Validation split: every sample after the training split.
    epoch_loss, epoch_acc = _run_split(
        range(train_len, len(all_token_embeddings)), f"epoch: {epoch+1}, Validation", training=False
    )
    print(f"epoch: {epoch+1}, val_loss: {epoch_loss}, val_acc: {epoch_acc}")
    val_loss.append(epoch_loss)
    val_acc.append(epoch_acc)

epoch: 1, Training: 100%|██████████| 8000/8000 [00:31<00:00, 252.31it/s]


epoch: 1, train_loss: 0.34135247974603705, train_acc: 0.850125


epoch: 1, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 623.50it/s]


epoch: 1, val_loss: 0.17168363584279023, val_acc: 0.9565


epoch: 2, Training: 100%|██████████| 8000/8000 [00:31<00:00, 252.04it/s]


epoch: 2, train_loss: 0.16874683047063344, train_acc: 0.944875


epoch: 2, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 614.52it/s]


epoch: 2, val_loss: 0.1183810183384536, val_acc: 0.9635


epoch: 3, Training: 100%|██████████| 8000/8000 [00:32<00:00, 249.77it/s]


epoch: 3, train_loss: 0.12373776972758295, train_acc: 0.961


epoch: 3, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 623.25it/s]


epoch: 3, val_loss: 0.09451926180805875, val_acc: 0.9685


epoch: 4, Training: 100%|██████████| 8000/8000 [00:31<00:00, 253.82it/s]


epoch: 4, train_loss: 0.09552198397244936, train_acc: 0.9715


epoch: 4, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 631.40it/s]


epoch: 4, val_loss: 0.08200260208252942, val_acc: 0.9725


epoch: 5, Training: 100%|██████████| 8000/8000 [00:31<00:00, 253.88it/s]


epoch: 5, train_loss: 0.07692710811232956, train_acc: 0.977375


epoch: 5, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 624.59it/s]


epoch: 5, val_loss: 0.08224168126330697, val_acc: 0.972


epoch: 6, Training: 100%|██████████| 8000/8000 [00:31<00:00, 253.34it/s]


epoch: 6, train_loss: 0.06167388150758772, train_acc: 0.983375


epoch: 6, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 622.46it/s]


epoch: 6, val_loss: 0.08533512484207548, val_acc: 0.972


epoch: 7, Training: 100%|██████████| 8000/8000 [00:31<00:00, 252.56it/s]


epoch: 7, train_loss: 0.04846011193181407, train_acc: 0.9875


epoch: 7, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 623.88it/s]


epoch: 7, val_loss: 0.09491106975913294, val_acc: 0.9725


epoch: 8, Training: 100%|██████████| 8000/8000 [00:32<00:00, 249.68it/s]


epoch: 8, train_loss: 0.04037994268652748, train_acc: 0.98975


epoch: 8, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 621.93it/s]


epoch: 8, val_loss: 0.09670232109449446, val_acc: 0.9705


epoch: 9, Training: 100%|██████████| 8000/8000 [00:32<00:00, 242.48it/s]


epoch: 9, train_loss: 0.03317827793079549, train_acc: 0.993


epoch: 9, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 619.55it/s]


epoch: 9, val_loss: 0.09012448973740483, val_acc: 0.9745


epoch: 10, Training: 100%|██████████| 8000/8000 [00:33<00:00, 241.39it/s]


epoch: 10, train_loss: 0.03157115008249196, train_acc: 0.99275


epoch: 10, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 622.29it/s]


epoch: 10, val_loss: 0.1191908633668014, val_acc: 0.964


epoch: 11, Training: 100%|██████████| 8000/8000 [00:33<00:00, 239.28it/s]


epoch: 11, train_loss: 0.03253350948001371, train_acc: 0.992125


epoch: 11, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 618.20it/s]


epoch: 11, val_loss: 0.111037132947899, val_acc: 0.969


epoch: 12, Training: 100%|██████████| 8000/8000 [00:33<00:00, 239.40it/s]


epoch: 12, train_loss: 0.025883183552748032, train_acc: 0.9945


epoch: 12, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 617.98it/s]


epoch: 12, val_loss: 0.1696719238373939, val_acc: 0.968


epoch: 13, Training: 100%|██████████| 8000/8000 [00:33<00:00, 241.30it/s]


epoch: 13, train_loss: 0.022459803556593834, train_acc: 0.995875


epoch: 13, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 618.86it/s]


epoch: 13, val_loss: 0.10392842729426462, val_acc: 0.9695


epoch: 14, Training: 100%|██████████| 8000/8000 [00:33<00:00, 241.19it/s]


epoch: 14, train_loss: 0.023162085708960715, train_acc: 0.995625


epoch: 14, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 622.59it/s]


epoch: 14, val_loss: 0.16084086552554713, val_acc: 0.967


epoch: 15, Training: 100%|██████████| 8000/8000 [00:33<00:00, 239.93it/s]


epoch: 15, train_loss: 0.02067339141732088, train_acc: 0.995625


epoch: 15, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 617.66it/s]


epoch: 15, val_loss: 0.15628841045508812, val_acc: 0.9675


epoch: 16, Training: 100%|██████████| 8000/8000 [00:33<00:00, 239.90it/s]


epoch: 16, train_loss: 0.02216562623076597, train_acc: 0.99525


epoch: 16, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 624.74it/s]


epoch: 16, val_loss: 0.1103282908285541, val_acc: 0.968


epoch: 17, Training: 100%|██████████| 8000/8000 [00:33<00:00, 239.54it/s]


epoch: 17, train_loss: 0.019099550314535363, train_acc: 0.996125


epoch: 17, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 612.46it/s]


epoch: 17, val_loss: 0.1286017796114064, val_acc: 0.964


epoch: 18, Training: 100%|██████████| 8000/8000 [00:33<00:00, 239.38it/s]


epoch: 18, train_loss: 0.017194813125385316, train_acc: 0.997


epoch: 18, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 619.50it/s]


epoch: 18, val_loss: 0.11227779213108033, val_acc: 0.9705


epoch: 19, Training: 100%|██████████| 8000/8000 [00:33<00:00, 239.72it/s]


epoch: 19, train_loss: 0.01681230262956831, train_acc: 0.99725


epoch: 19, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 619.28it/s]


epoch: 19, val_loss: 0.2680402272533756, val_acc: 0.969


epoch: 20, Training: 100%|██████████| 8000/8000 [00:34<00:00, 234.50it/s]


epoch: 20, train_loss: 0.017770611136053473, train_acc: 0.996625


epoch: 20, Validation: 100%|██████████| 2000/2000 [00:03<00:00, 623.58it/s]

epoch: 20, val_loss: 0.27335069010438745, val_acc: 0.968





In [36]:
# Write the GCN's parameters (its state_dict) to disk.
# NOTE(review): this stores whatever weights the model holds when the cell is
# executed; run after the training loop, that is the state after the last
# epoch, not the epoch with the best validation score — confirm that is intended.
torch.save(obj=gcnmodel.state_dict(), f='./model/gcn_model.pth')

In [63]:
# Display the highest validation accuracy among the per-epoch values collected in val_acc.
max(val_acc)

0.9745