In [None]:
# Mount Google Drive so later cells can read and write under /content/drive.
# NOTE(review): the ValueError recorded in this cell's output is what
# drive.mount reports when the mount-point directory is non-empty and not
# currently mounted -- presumably something was written under /content/drive
# before Drive was mounted; confirm and clear the stray files before re-running.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

ValueError: Mountpoint must not already contain files

In [None]:
!pip install transformers torch pandas scikit-learn optuna tqdm

In [None]:
# Create the project directory layout on Google Drive and write the two
# lexicon resources (sentiment dictionary and stop-word list) into sample_data/.
import os
import json

DRIVE_MOUNT_POINT = "/content/drive"
PROJECT_PATH = "/content/drive/MyDrive/hotel_sentiment"

# Fail fast when Drive is not mounted. Otherwise os.makedirs would silently
# create ordinary local folders under /content/drive: the files would be lost
# with the Colab VM, and a later drive.mount() would reject the non-empty
# mount point.
if not os.path.ismount(DRIVE_MOUNT_POINT):
    raise RuntimeError(
        f"Google Drive 尚未挂载到 {DRIVE_MOUNT_POINT}，请先运行 drive.mount 单元格。"
    )

# Create the project sub-directories (only announce the ones actually created).
for dir_name in ['sample_data', 'models', 'logs', 'results']:
    dir_path = os.path.join(PROJECT_PATH, dir_name)
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path, exist_ok=True)
        print(f"创建目录: {dir_path}")

# Sentiment dictionary: word -> polarity (+1 positive, -1 negative).
sentiment_dict = {
    "好": 1, "很好": 1, "不错": 1, "棒": 1, "优秀": 1, "满意": 1, "喜欢": 1, "干净": 1, "舒适": 1, "方便": 1,
    "差": -1, "糟糕": -1, "不好": -1, "差劲": -1, "失望": -1, "不满意": -1, "脏": -1, "吵": -1, "贵": -1, "慢": -1
}
sentiment_dict_path = os.path.join(PROJECT_PATH, 'sample_data/sentiment_dict.json')
with open(sentiment_dict_path, 'w', encoding='utf-8') as f:
    json.dump(sentiment_dict, f, ensure_ascii=False, indent=4)
print(f"创建情感词典: {sentiment_dict_path}")

# Stop-word candidates.
candidate_stopwords = [
    "的", "了", "在", "是", "我", "有", "和", "就", "不", "人", "都", "一", "一个", "上", "也", "很",
    "到", "说", "要", "去", "你", "会", "着", "没有", "看", "好", "自己", "这", "么", "来", "让",
    "给", "但", "并", "而", "却", "还", "等", "着", "把", "被", "比", "较", "这个", "那个", "这样"
]
# A word must not be both a sentiment cue and a stop word: anything scored in
# sentiment_dict (e.g. "好") and the negators "不" / "没有" (which flip
# polarity, cf. "不好", "不满意") are kept out of the stop-word list.
# dict.fromkeys drops duplicates (e.g. "着") while preserving order.
sentiment_bearing_words = set(sentiment_dict) | {"不", "没有"}
stopwords = [
    word for word in dict.fromkeys(candidate_stopwords)
    if word not in sentiment_bearing_words
]
stopwords_path = os.path.join(PROJECT_PATH, 'sample_data/stopwords.txt')
with open(stopwords_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(stopwords))
print(f"创建停用词表: {stopwords_path}")

创建情感词典: /content/drive/MyDrive/hotel_sentiment/sample_data/sentiment_dict.json
创建停用词表: /content/drive/MyDrive/hotel_sentiment/sample_data/stopwords.txt


In [None]:
import os

import pandas as pd

# Check that the raw dataset file is present, then show its schema and
# label distribution.
file_path = '/content/ChnSentiCorp_htl_all.csv'
if os.path.exists(file_path):
    print("数据集文件已找到")
    # Load the CSV and show basic information about it.
    df = pd.read_csv(file_path)
    print("\n数据集基本信息：")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() appended a stray "None" line to the output.
    df.info()
    print("\n标签分布：")
    print(df['label'].value_counts())
else:
    print("错误：找不到数据集文件！")

数据集文件已找到

数据集基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7766 entries, 0 to 7765
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   7766 non-null   int64 
 1   review  7765 non-null   object
dtypes: int64(1), object(1)
memory usage: 121.5+ KB
None

标签分布：
label
1    5322
0    2444
Name: count, dtype: int64


In [None]:
# 1. Imports
import os
import torch
from transformers import BertTokenizer
from data_processor import DataProcessor
from model import ChineseSentimentDataset
from hyperparameter_tuning import run_hyperparameter_search

# 2. Select the compute device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# 3. Initialise the data processor with the lexicon resources
PROJECT_PATH = "/content/drive/MyDrive/hotel_sentiment"
processor = DataProcessor(
    sentiment_dict_path=os.path.join(PROJECT_PATH, 'sample_data/sentiment_dict.json'),
    stopwords_path=os.path.join(PROJECT_PATH, 'sample_data/stopwords.txt')
)

# 4. Process the data (class balancing disabled)
print("\n处理数据...")
train_df, test_df = processor.process_data(
    '/content/ChnSentiCorp_htl_all.csv',
    balance_data=False
)

# 4b. Hold out a validation split from the TRAINING data for the search.
# Tuning hyperparameters against the test set would leak test information
# into model selection and make the final test score optimistic.
# The split is stratified by label and seeded so it is reproducible.
VALID_FRACTION = 0.1
SPLIT_SEED = 42
search_pool_df = train_df.reset_index(drop=True)  # unique index needed for drop()
valid_df = search_pool_df.groupby('label').sample(
    frac=VALID_FRACTION,
    random_state=SPLIT_SEED
)
search_train_df = search_pool_df.drop(index=valid_df.index)

# 5. Create the tokenizer
print("\n加载tokenizer...")
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext-large')

# 6. Build the datasets
print("\n创建数据集...")
train_dataset = ChineseSentimentDataset(
    texts=train_df['cleaned_text'].values,
    labels=train_df['label'].values,
    tokenizer=tokenizer
)

test_dataset = ChineseSentimentDataset(
    texts=test_df['cleaned_text'].values,
    labels=test_df['label'].values,
    tokenizer=tokenizer
)

# Datasets used only by the hyperparameter search (train minus validation).
search_train_dataset = ChineseSentimentDataset(
    texts=search_train_df['cleaned_text'].values,
    labels=search_train_df['label'].values,
    tokenizer=tokenizer
)

valid_dataset = ChineseSentimentDataset(
    texts=valid_df['cleaned_text'].values,
    labels=valid_df['label'].values,
    tokenizer=tokenizer
)

print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")
print(f"搜索用训练集大小: {len(search_train_dataset)}")
print(f"验证集大小: {len(valid_dataset)}")

# 7. Run the hyperparameter search (the test set is not touched here)
print("\n开始超参数搜索...")
best_params = run_hyperparameter_search(
    train_dataset=search_train_dataset,
    valid_dataset=valid_dataset,
    n_trials=5  # run 5 search trials
)

if best_params:
    print("\n最佳超参数：")
    for param, value in best_params.items():
        print(f"{param}: {value}")
else:
    print("\n超参数搜索失败！")

KeyboardInterrupt: 

In [None]:
# 1. Load the data
print("1. 加载数据...")
DATA_PATH = '/content/ChnSentiCorp_htl_all.csv'
# Try UTF-8 first (utf-8-sig also strips an optional BOM), then GB18030.
# GB18030 is designed as a superset of GBK and GB2312, so separate attempts
# with those codecs were redundant. latin1 is deliberately NOT used as a last
# resort: it accepts any byte sequence, so it could never fail and would
# silently load Chinese text as mojibake.
CANDIDATE_ENCODINGS = ('utf-8-sig', 'gb18030')

for encoding in CANDIDATE_ENCODINGS:
    try:
        df = pd.read_csv(DATA_PATH, encoding=encoding)
    except UnicodeDecodeError:
        # Only a decoding failure should trigger the next candidate; other
        # errors (missing file, parser errors, Ctrl-C) must surface.
        continue
    print(f"使用编码: {encoding}")
    break
else:
    raise ValueError(
        f"无法使用以下编码读取 {DATA_PATH}: {', '.join(CANDIDATE_ENCODINGS)}"
    )

# 2. Basic information about the frame
print("\n2. 数据基本信息：")
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that produced a stray "None" line).
df.info()

# 3. Missing-value check
print("\n3. 空值检查：")
print(df.isnull().sum())

# 4. Label distribution
print("\n4. 标签分布：")
print(df['label'].value_counts())
print("\n正负样本比例：")
print(df['label'].value_counts(normalize=True))

# 5. Text-length statistics (length in characters of each review)
print("\n5. 文本长度统计：")
df['text_length'] = df['review'].str.len()
print(df['text_length'].describe())

1. 加载数据...

2. 数据基本信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7766 entries, 0 to 7765
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   7766 non-null   int64 
 1   review  7765 non-null   object
dtypes: int64(1), object(1)
memory usage: 121.5+ KB
None

3. 空值检查：
label     0
review    1
dtype: int64

4. 标签分布：
label
1    5322
0    2444
Name: count, dtype: int64

正负样本比例：
label
1    0.685295
0    0.314705
Name: proportion, dtype: float64

5. 文本长度统计：
count    7765.000000
mean      370.546040
std       415.449642
min         6.000000
25%       132.000000
50%       241.000000
75%       444.000000
max      8424.000000
Name: text_length, dtype: float64


In [None]:
import pandas as pd
import numpy as np
from data_processor import DataProcessor
import os

# Step 1: read the raw CSV.
print("1. 加载数据...")
df = pd.read_csv('/content/ChnSentiCorp_htl_all.csv')

# Step 2: build the processor from the two lexicon files under PROJECT_PATH
# (PROJECT_PATH is expected to have been defined by an earlier cell).
print("\n2. 使用DataProcessor处理数据...")
sentiment_dict_file = os.path.join(PROJECT_PATH, 'sample_data/sentiment_dict.json')
stopwords_file = os.path.join(PROJECT_PATH, 'sample_data/stopwords.txt')
processor = DataProcessor(
    sentiment_dict_path=sentiment_dict_file,
    stopwords_path=stopwords_file
)

# Step 3: run the processing pipeline with class balancing switched on.
print("\n3. 数据处理和平衡...")
train_df, test_df = processor.process_data(
    '/content/ChnSentiCorp_htl_all.csv',
    balance_data=True
)

# Step 4: report split sizes, then the label distribution of each split.
print("\n4. 处理后的数据分布：")
print("训练集大小:", len(train_df))
print("测试集大小:", len(test_df))
for heading, split_frame in (
    ("\n训练集标签分布：", train_df),
    ("\n测试集标签分布：", test_df),
):
    print(heading)
    print(split_frame['label'].value_counts())

1. 加载数据...

2. 使用DataProcessor处理数据...

3. 数据处理和平衡...
1. 原始数据量: 7766
原始标签分布:
label
1    5322
0    2444
Name: count, dtype: int64

2. 转换为数值后的数据量: 7766

3. 文本清理后的数据量: 7765

执行数据平衡...
平衡前的训练集标签分布:
label
1    4258
0    1954
Name: count, dtype: int64

平衡后的训练集标签分布:
label
1    1954
0    1954
Name: count, dtype: int64

最终处理结果:
训练集大小: 3908
测试集大小: 1553

最终标签分布:
label
1    1954
0    1954
Name: count, dtype: int64

4. 处理后的数据分布：
训练集大小: 3908
测试集大小: 1553

训练集标签分布：
label
1    1954
0    1954
Name: count, dtype: int64

测试集标签分布：
label
1    1064
0     489
Name: count, dtype: int64


In [None]:
import torch
from transformers import BertTokenizer
from torch.utils.data import DataLoader
from model import MultiDimensionalSentimentModel, ChineseSentimentDataset
from tqdm import tqdm
import os
from config import Config

# Fine-tune the sentiment model on train_df and evaluate it on test_df after
# every epoch, keeping the weights of the epoch with the lowest evaluation loss.
# train_df / test_df are expected to come from an earlier cell.

# 1. Select the compute device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

# 2. Create the tokenizer
print("\n创建tokenizer...")
tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext-large')

# 3. Build the datasets
print("\n创建数据集...")
train_dataset = ChineseSentimentDataset(
    texts=train_df['cleaned_text'].values,
    labels=train_df['label'].values,
    tokenizer=tokenizer
)

test_dataset = ChineseSentimentDataset(
    texts=test_df['cleaned_text'].values,
    labels=test_df['label'].values,
    tokenizer=tokenizer
)

print(f"训练集大小: {len(train_dataset)}")
print(f"测试集大小: {len(test_dataset)}")

# 4. Build the model
print("\n创建模型...")
model = MultiDimensionalSentimentModel(
    pretrained_model_name='hfl/chinese-roberta-wwm-ext-large',
    num_dimensions=2  # binary classification
).to(device)

# 5. Build the data loaders
print("\n创建数据加载器...")
train_loader = DataLoader(
    train_dataset,
    batch_size=Config.BATCH_SIZE,
    shuffle=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=Config.BATCH_SIZE
)

# 6. Optimizer and loss function
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=Config.LEARNING_RATE,
    weight_decay=Config.WEIGHT_DECAY
)
criterion = torch.nn.CrossEntropyLoss()

# 7. Training loop
print("\n开始训练...")
best_val_loss = float('inf')
best_accuracy = 0.0  # accuracy (%) of the epoch that achieved best_val_loss

# Create the checkpoint directory up front so a missing folder fails (or is
# fixed) before training time is spent, not at the first torch.save.
os.makedirs(Config.MODEL_SAVE_PATH, exist_ok=True)
model_save_path = os.path.join(Config.MODEL_SAVE_PATH, 'best_model.pt')

# NOTE(review): test_loader doubles as the validation set for checkpoint
# selection, so the reported accuracy is optimistically biased; consider
# carving a separate validation split out of the training data.
for epoch in range(Config.NUM_EPOCHS):
    # ---- training phase ----
    model.train()
    train_loss = 0
    train_progress = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{Config.NUM_EPOCHS} [Train]')

    for batch in train_progress:
        # Move every tensor of the batch to the target device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch['input_ids'], batch['attention_mask'])
        loss = criterion(outputs, batch['labels'])

        # Backward pass with gradient-norm clipping
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), Config.GRADIENT_CLIP_VALUE)
        optimizer.step()

        train_loss += loss.item()
        train_progress.set_postfix({'loss': f'{loss.item():.4f}'})

    avg_train_loss = train_loss / len(train_loader)

    # ---- evaluation phase ----
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    print("\n验证中...")
    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Validation'):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(batch['input_ids'], batch['attention_mask'])
            loss = criterion(outputs, batch['labels'])
            val_loss += loss.item()

            # Accuracy: the predicted class is the arg-max over the class axis.
            predicted = outputs.argmax(dim=1)
            total += batch['labels'].size(0)
            correct += (predicted == batch['labels']).sum().item()

    avg_val_loss = val_loss / len(test_loader)
    accuracy = 100 * correct / total

    print(f'\nEpoch {epoch + 1} 结果:')
    print(f'平均训练损失: {avg_train_loss:.4f}')
    print(f'平均验证损失: {avg_val_loss:.4f}')
    print(f'验证准确率: {accuracy:.2f}%')

    # Keep the weights of the best epoch (lowest evaluation loss) and remember
    # its accuracy so it does not have to be copied by hand from the log.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_accuracy = accuracy
        torch.save(model.state_dict(), model_save_path)
        print(f"保存新的最佳模型到: {model_save_path}")

print("\n训练完成!")

使用设备: cuda

创建tokenizer...


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]


创建数据集...
训练集大小: 3908
测试集大小: 1553

创建模型...

创建数据加载器...

开始训练...


Epoch 1/3 [Train]: 100%|██████████| 489/489 [08:16<00:00,  1.02s/it, loss=0.0286]



验证中...


Validation: 100%|██████████| 195/195 [00:58<00:00,  3.34it/s]



Epoch 1 结果:
平均训练损失: 0.3572
平均验证损失: 0.2558
验证准确率: 90.47%
保存新的最佳模型到: /content/drive/MyDrive/hotel_sentiment/models/best_model.pt


Epoch 2/3 [Train]: 100%|██████████| 489/489 [08:16<00:00,  1.02s/it, loss=0.0122]



验证中...


Validation: 100%|██████████| 195/195 [00:58<00:00,  3.33it/s]



Epoch 2 结果:
平均训练损失: 0.2110
平均验证损失: 0.3594
验证准确率: 90.02%


Epoch 3/3 [Train]: 100%|██████████| 489/489 [08:16<00:00,  1.02s/it, loss=1.1763]



验证中...


Validation: 100%|██████████| 195/195 [00:58<00:00,  3.33it/s]


Epoch 3 结果:
平均训练损失: 0.1156
平均验证损失: 0.4568
验证准确率: 90.66%

训练完成!





In [None]:
# Re-save the best model as a full checkpoint (weights + optimizer + config +
# metrics) in place of the bare state_dict written by the training loop.
import os
import torch

# Make sure the target directory exists.
save_dir = "/content/drive/MyDrive/hotel_sentiment/models"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    print(f"创建目录: {save_dir}")

model_path = os.path.join(save_dir, 'best_model.pt')

# Accuracy (%) of the best epoch, copied by hand from the training log.
# TODO: record this during training instead of hard-coding it.
BEST_ACCURACY = 90.47

# After training, `model` holds the weights of the LAST epoch, not of the best
# one. Saving model.state_dict() here would overwrite the real best checkpoint
# with later (worse-validating) weights while labelling them with the best
# metrics. Re-use the weights the training loop already stored on disk instead.
if os.path.exists(model_path):
    previous_checkpoint = torch.load(model_path, map_location='cpu')
    # The file is either a bare state_dict (as written by the training loop)
    # or a checkpoint dict produced by an earlier run of this cell.
    best_state_dict = previous_checkpoint.get('model_state_dict', previous_checkpoint)
else:
    print("警告：未找到已保存的最佳权重，将保存当前模型（最后一个 epoch）的权重")
    best_state_dict = model.state_dict()

# Save the checkpoint.
torch.save({
    'model_state_dict': best_state_dict,
    # NOTE: the optimizer state reflects the END of training, not the best
    # epoch; it is kept only so the checkpoint layout stays unchanged.
    'optimizer_state_dict': optimizer.state_dict(),
    'config': {
        'batch_size': Config.BATCH_SIZE,
        'learning_rate': Config.LEARNING_RATE,
        'weight_decay': Config.WEIGHT_DECAY,
        'hidden_dropout': Config.HIDDEN_DROPOUT
    },
    'best_val_loss': best_val_loss,
    'best_accuracy': BEST_ACCURACY
}, model_path)

print(f"模型已保存到: {model_path}")

# Verify that the file was written and report its size.
if os.path.exists(model_path):
    print(f"文件大小: {os.path.getsize(model_path) / (1024*1024):.2f} MB")
else:
    print("错误：文件未能成功保存！")

模型已保存到: /content/drive/MyDrive/hotel_sentiment/models/best_model.pt
文件大小: 3763.41 MB


In [None]:
from google.colab import files

# Write the weights currently held by `model` to a file on Drive ...
model_path = "/content/drive/MyDrive/hotel_sentiment/models/model_state.pt"
torch.save(obj=model.state_dict(), f=model_path)

# ... and hand that file to the browser for download.
files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save a weights-plus-config checkpoint on the Colab VM's local disk and
# download it straight to the user's machine.
from google.colab import files

# 1. Target file on the local (temporary) disk rather than on Drive.
model_path = "/content/model_state.pt"
torch.save(
    obj=dict(
        model_state_dict=model.state_dict(),
        config=dict(
            batch_size=Config.BATCH_SIZE,
            learning_rate=Config.LEARNING_RATE,
            weight_decay=Config.WEIGHT_DECAY,
            hidden_dropout=Config.HIDDEN_DROPOUT,
        ),
    ),
    f=model_path,
)

# 2. Trigger the browser download of the file just written.
files.download(model_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# List every *.pt file under /content; per the recorded output this also
# covers the Drive tree mounted at /content/drive.
!find /content -name "*.pt"

/content/model_state.pt
/content/drive/MyDrive/hotel_sentiment/models/model_state.pt
/content/drive/MyDrive/hotel_sentiment/models/best_model.pt
