# 情感分析的系统开发

### 首先自己制作数据集，看看有没有开源的数据集后续可以替换
（ChnSentiCorp、THUCNews等）

In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

# 假设数据集包含两列： 'text'（用户评论文本）和 'label'（情感标签，如 0 表示负向，1 表示中性，2 表示正向）
data = pd.DataFrame({
    "text": ["我很喜欢这个产品！", "服务态度差", "质量不错，但是有点贵", "非常满意，下次还会购买"],
    "label": [2, 0, 1, 2]
})

# 数据预处理
def preprocess_data(data, max_length=128):
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")  # 使用中文BERT
    # 对数据进行编码
    encodings = tokenizer(data['text'].tolist(), truncation=True, padding=True, max_length=max_length)
    return encodings

# 加载数据并分为训练集和测试集
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_encodings = preprocess_data(train_data)
test_encodings = preprocess_data(test_data)

# 转换为Dataset格式
train_dataset = Dataset.from_dict({"input_ids": train_encodings["input_ids"], "attention_mask": train_encodings["attention_mask"], "labels": train_data['label'].tolist()})
test_dataset = Dataset.from_dict({"input_ids": test_encodings["input_ids"], "attention_mask": test_encodings["attention_mask"], "labels": test_data['label'].tolist()})

In [3]:
# 模型初始化
model = BertForSequenceClassification.from_pretrained("bert-base-chinese", num_labels=3)    #这里使用中文的bert模型进行训练

# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",          # 输出目录
    eval_strategy="epoch",     # 每个epoch后进行评估
    per_device_train_batch_size=8,   # 每个设备的训练批次大小
    per_device_eval_batch_size=8,    # 每个设备的评估批次大小
    num_train_epochs=3,              # 训练的总epoch数
    logging_dir='./logs',            # 日志保存目录
)

# 使用 Trainer API 进行训练和评估
trainer = Trainer(
    model=model,                         # 模型
    args=training_args,                  # 训练参数
    train_dataset=train_dataset,         # 训练数据集
    eval_dataset=test_dataset            # 测试数据集
)

# 模型训练
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.333214
2,No log,1.346904
3,No log,1.378777




TrainOutput(global_step=3, training_loss=0.9053508440653483, metrics={'train_runtime': 3.5914, 'train_samples_per_second': 2.506, 'train_steps_per_second': 0.835, 'total_flos': 60125527098.0, 'train_loss': 0.9053508440653483, 'epoch': 3.0})

### 这个地方是做数据增强的的，其实无所谓，在原数据比较好的情况下也不需要做数据增强

In [None]:
# 模型评估
eval_results = trainer.evaluate()
print(f"模型评估结果: {eval_results}")


# 例12-2
import random
import jieba
from synonyms import synonyms
from opencc import OpenCC

# 加载原始数据
data = [
    {"text": "这款产品非常好用，功能强大且易操作", "label": "positive"},
    {"text": "服务态度差，体验非常糟糕", "label": "negative"},
    {"text": "产品质量一般，但价格实惠", "label": "neutral"}
]

# 近义词替换函数
def synonym_replacement(text, replace_prob=0.3):
    words = jieba.lcut(text)
    new_words = []
    for word in words:
        if random.random() < replace_prob:
            similar_words = synonyms.nearby(word)
            if similar_words:  # 如果有近义词
                word = random.choice(similar_words[0])
        new_words.append(word)
    return ''.join(new_words)

# 简繁体转换函数
cc = OpenCC('s2t')
def convert_simplified_to_traditional(text):
    return cc.convert(text)

# 拼写变化函数（适用于中文拼音相似的替换）
def typo_augmentation(text, typo_prob=0.2):
    typo_dict = {'好': '号', '差': '查', '强': '墙', '易': '依'}
    words = list(text)
    for i, word in enumerate(words):
        if random.random() < typo_prob and word in typo_dict:
            words[i] = typo_dict[word]
    return ''.join(words)

# 扩充数据
def augment_data(data):
    augmented_data = []
    for entry in data:
        text = entry['text']
        label = entry['label']

        # 原文数据
        augmented_data.append({"text": text, "label": label})
        
        # 近义词替换
        augmented_text = synonym_replacement(text)
        augmented_data.append({"text": augmented_text, "label": label})
        
        # 简繁体转换
        traditional_text = convert_simplified_to_traditional(text)
        augmented_data.append({"text": traditional_text, "label": label})
        
        # 拼写变化
        typo_text = typo_augmentation(text)
        augmented_data.append({"text": typo_text, "label": label})

    return augmented_data

# 运行增强代码并展示结果
augmented_data = augment_data(data)
for entry in augmented_data:
    print(f"Text: {entry['text']}, Label: {entry['label']}")

### 使用文本嵌入可以有效提高微调的效率

In [1]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'


In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

# 初始化SBERT模型
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# 样本数据，包含不同情感类别的句子
texts = [
    "这款产品非常好用，功能强大且易操作。",
    "服务态度差，体验非常糟糕。",
    "产品质量一般，但价格实惠。",
    "这是我用过的最好的一款应用。",
    "这家餐厅的服务真的很差劲。",
    "这件商品的性价比非常高，值得推荐！"
]

# 使用SBERT生成文本嵌入
embeddings = model.encode(texts)

# 输出每条文本的嵌入向量
for i, embedding in enumerate(embeddings):
    print(f"Text: {texts[i]}")
    print(f"Embedding: {embedding}\n")


'(ReadTimeoutError("HTTPSConnectionPool(host='hf-mirror.com', port=443): Read timed out. (read timeout=10)"), '(Request ID: 8e5dde0c-efb7-4ec8-b036-16e087918c49)')' thrown while requesting HEAD https://hf-mirror.com/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='hf-mirror.com', port=443): Read timed out. (read timeout=10)"), '(Request ID: af79c891-60d9-4853-9d90-3234ef3ce7ba)')' thrown while requesting HEAD https://hf-mirror.com/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/./modules.json
Retrying in 2s [Retry 2/5].
'(ProtocolError('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer')), '(Request ID: 8d507639-1a63-4c8b-ab37-c98b4fc4f800)')' thrown while requesting HEAD https://hf-mirror.com/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2/resolve/main/./modules.json
Retrying in 4s [Retry 3/5].
'(MaxRetryError('HT

KeyboardInterrupt: 

In [None]:
### 使用另一种预训练模型
model = SentenceTransformer('princeton-nlp/sup-simcse-bert-base-uncased')

texts = [
    "这款产品非常好用，功能强大且易操作。",
    "服务态度差，体验非常糟糕。",
    "产品质量一般，但价格实惠。",
    "这是我用过的最好的一款应用。",
    "这家餐厅的服务真的很差劲。",
    "这件商品的性价比非常高，值得推荐！"
]
embeddings=model.encode(texts)
for i, embedding in enumerate(embeddings):
    print(f"Text: {texts[i]}")
    print(f"Embedding: {embedding}\n")