In [1]:
import json
from tqdm import tqdm
import pandas as pd

# Entity-type codes used by the annotation data.
unique_labels = ["dis", "sym", "pro", "equ", "dru", "ite", "bod", "dep", "mic"]

# Tag vocabulary: every prefix crossed with every entity type (prefix-major
# order), followed by the bare "O" tag.
# NOTE(review): the "O-<type>" tags are part of this vocabulary, but
# handle_raw_data only ever writes "S-", "B-", "I-", "E-" tags and bare "O".
# Confirm whether they are intentional before removing them, since dropping
# them would change every id in the maps below.
bio_unique_labels = [
    f"{prefix}-{label}"
    for prefix in ("B", "I", "O", "E", "S")
    for label in unique_labels
]
bio_unique_labels.append("O")

# Ids are assigned by position in the alphabetically sorted vocabulary.
ids_to_labels = dict(enumerate(sorted(bio_unique_labels)))
labels_to_ids = {tag: idx for idx, tag in ids_to_labels.items()}

def handle_raw_data(raw_path:str,csv_path:str,max_len:int=512):
    """Convert entity-annotated JSON records into per-character BIOES tags.

    The file at ``raw_path`` must contain a JSON list of records. Each record
    has a ``"text"`` string and an ``"entities"`` list whose items carry
    ``"type"``, ``"start_idx"`` and ``"end_idx"``. A span is tagged as the
    half-open character range ``[start_idx, end_idx)``:

    * length 1  -> ``S-<type>``
    * length >= 2 -> ``B-<type>``, zero or more ``I-<type>``, ``E-<type>``

    Characters not covered by any span keep the tag ``"O"``; when spans
    overlap, later entities overwrite earlier ones.

    NOTE(review): this treats ``end_idx`` as exclusive — confirm that this
    matches the annotation convention of the dataset being loaded.

    Args:
        raw_path: Path of the JSON file to read (decoded as UTF-8).
        csv_path: Path the resulting table is written to with ``to_csv``.
        max_len: Maximum number of characters kept per record. Longer texts
            are truncated and entities ending beyond ``max_len`` are skipped.

    Returns:
        A ``pandas.DataFrame`` with one row per record and two columns:
        ``"W"`` (the possibly truncated text) and ``"B"`` (a list of tags,
        one per character of ``"W"``).

    Raises:
        IndexError: If an entity that passes the ``max_len`` check still
            points past the end of its text.
    """
    records = []
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(raw_path, "r", encoding="utf-8") as file:
        raw_data = json.load(file)

    for r in tqdm(raw_data):
        words = r["text"][:max_len]
        # One tag per *kept* character so that "W" and "B" always have the
        # same length, including for truncated texts.
        bio = ["O"] * len(words)
        for e in r["entities"]:
            e_type = e["type"]
            e_start = int(e["start_idx"])
            e_end = int(e["end_idx"])
            if e_end > max_len:
                # Span reaches into the truncated part of the text.
                continue
            span = e_end - e_start
            if span < 1:
                # Empty or inverted span: report it and leave tags untouched.
                print("error")
                continue
            if span == 1:
                bio[e_start] = "S-" + e_type
            else:
                bio[e_start] = "B-" + e_type
                # Empty range for two-character spans.
                for i in range(e_start + 1, e_end - 1):
                    bio[i] = "I-" + e_type
                bio[e_end - 1] = "E-" + e_type
        records.append({"W": words, "B": bio})

    # Build the frame once instead of appending row by row (quadratic).
    pre_pd = pd.DataFrame(records, columns=["W", "B"])
    pre_pd.to_csv(csv_path)
    return pre_pd
                            


In [3]:
# Show the full tag vocabulary built in the first cell.
# NOTE(review): the output recorded below lists only the nine bare entity
# types, which is not what the current definition of bio_unique_labels
# produces — it looks stale; re-run this cell to refresh it.
print(bio_unique_labels)

['dis', 'sym', 'pro', 'equ', 'dru', 'ite', 'bod', 'dep', 'mic']


In [None]:

# Build the training table (columns "W" and "B") from the raw JSON and also
# write it to data.csv in the working directory.
train_pd = handle_raw_data("./data/train.json","data.csv")


In [None]:
from transformers import BertModel, BertTokenizer ,BertTokenizerFast

# Load the encoder and its fast tokenizer from the "chinese-roberta-wwm-ext"
# checkpoint. `bert` is not referenced again in the visible cells; a later
# cell loads the same checkpoint once more as `model`.
bert = BertModel.from_pretrained("chinese-roberta-wwm-ext")
# tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
tokenizer =BertTokenizerFast.from_pretrained('chinese-roberta-wwm-ext')

# All texts from the "W" column; the sample print requires at least 34 rows.
test_sentence = train_pd["W"].to_list()
print(test_sentence[33])
# Tokenize every sentence at once, truncating/padding each one to exactly
# 512 token ids and returning PyTorch tensors.
# NOTE(review): the texts are passed as raw strings, so the tokenizer decides
# the token boundaries itself; presumably this is not always one token per
# character, in which case the per-character tags in column "B" would not
# line up with the token ids — confirm before training on these encodings.
text_tokenized = tokenizer(
    test_sentence,
    padding='max_length',
    max_length=512,
    truncation=True,
    return_tensors="pt"
    )
print(text_tokenized)


In [None]:
# Decode the first encoded sentence back to text as a sanity check.
# NOTE(review): the ids were produced by the "chinese-roberta-wwm-ext"
# tokenizer but are decoded here with the "bert-base-chinese" vocabulary;
# this is only meaningful if both checkpoints share the same vocab — confirm,
# or decode with `tokenizer` instead.
pytokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
input_ids = text_tokenized.input_ids[0]
print(pytokenizer.decode(input_ids))
# Number of ids in the first row of the padded batch.
print(len(input_ids))


In [None]:

# List the individual tokens of the first encoded sentence (padding included)
# to inspect how the text was split.
print(pytokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))
# print(word_ids)  # disabled: `word_ids` is not defined in the visible cells


In [None]:
import torch
from transformers import BertModel

# Load the encoder and move it to the GPU when one is available. Calling
# .cuda() unconditionally raises on CPU-only machines, so fall back to the
# CPU there instead of failing the cell.
model = BertModel.from_pretrained("chinese-roberta-wwm-ext")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# .to() returns the module itself, so the cell still displays the model.
model.to(device)

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Minimal end-to-end demo: one optimisation step of a two-class sequence
# classifier followed by a prediction on new sentences.
# Note: this cell rebinds the notebook-level names `tokenizer` and `model`
# that earlier cells assigned to the chinese-roberta-wwm-ext objects.

# Load the pretrained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

# Prepare the training data
sentences = ['I love apples', 'I dislike onions']
labels = [1, 0]  # positive class: 1, negative class: 0

# Convert the text into the encoded representation BERT accepts
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# from_pretrained() returns the model in evaluation mode (dropout disabled);
# switch to training mode before running the training step.
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
optimizer.zero_grad()

# Forward pass
outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
logits = outputs[0]  # take the logits

# Compute the loss
loss_fn = torch.nn.CrossEntropyLoss()
loss = loss_fn(logits, torch.tensor(labels))

# Backward pass and parameter update
loss.backward()
optimizer.step()

# Predict on new sentences: back to evaluation mode, and no autograd graph
# is needed for inference.
model.eval()
new_sentences = ['I like bananas', 'I hate spiders']
new_inputs = tokenizer(new_sentences, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    new_outputs = model(input_ids=new_inputs['input_ids'], attention_mask=new_inputs['attention_mask'])
new_logits = new_outputs[0]  # take the logits
predicted_labels = torch.argmax(new_logits, dim=1)

print(predicted_labels.tolist())  # print the predicted labels

In [None]:
from transformers import BertConfig

# Load only the model configuration from the local checkpoint directory;
# the bare `config` expression below displays it as the cell output.
config = BertConfig.from_pretrained("./chinese-roberta-wwm-ext/")

config

In [None]:
# Display the hidden_size attribute of the configuration loaded above.
config.hidden_size