使用课堂示例cooking.stackexchange.txt，使用fasttext训练文本分类模型。

In [None]:
# 完整模型实现
import os
import fasttext
from sklearn.model_selection import train_test_split

# 配置参数
DATA_PATH = r'F:\NLP算法课程\正式课\0319\语言模型及词向量相关知识\cooking.stackexchange.txt'
MODEL_SAVE_PATH = r'F:\temp\cooking_model.bin'

def preprocess_data(input_path, output_path):
    """数据预处理函数"""
    with open(input_path, 'r', encoding='utf-8') as f_in, \
         open(output_path, 'w', encoding='utf-8') as f_out:
        
        for line in f_in:
            # 清洗和标准化处理
            line = line.strip().lower()
            parts = line.split()
            
            # 分离标签和文本
            labels = [p for p in parts if p.startswith('__label__')]
            text = ' '.join([p for p in parts if not p.startswith('__label__')])
            
            # 去除非字母数字字符
            text = ''.join([c if c.isalnum() or c.isspace() else ' ' for c in text])
            text = ' '.join(text.split())  # 合并多余空格
            
            if labels and text:  # 过滤空数据
                f_out.write(f'{" ".join(sorted(set(labels)))} {text}\n')

# 执行预处理
preprocessed_path = os.path.join(os.path.dirname(DATA_PATH), 'processed_cooking.txt')
preprocess_data(DATA_PATH, preprocessed_path)

# 划分训练集/测试集
with open(preprocessed_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()
train_lines, test_lines = train_test_split(lines, test_size=0.2, random_state=42)

# 保存数据集
train_path = os.path.join(r'F:\temp', 'cooking_train.txt')
test_path = os.path.join(r'F:\temp', 'cooking_test.txt')

with open(train_path, 'w', encoding='utf-8') as f:
    f.writelines(train_lines)
with open(test_path, 'w', encoding='utf-8') as f:
    f.writelines(test_lines)

# 训练优化模型
model = fasttext.train_supervised(
    input=train_path,
    loss='ova',               # 多标签分类模式
    lr=0.8,                   # 调高学习率
    epoch=150,                # 增加训练轮次
    wordNgrams=3,             # 使用3-gram特征
    minCount=3,               # 降低词频阈值
    dim=200,                  # 增加向量维度
    autotuneValidationFile=test_path,  # 自动调参
    autotuneDuration=600,     # 延长调参时间
    thread=8                  # 使用更多CPU核心
)

# 详细评估
print("【模型评估结果】")
print(f"- 准确率@1: {model.test(test_path)[1]*100:.2f}%")
print(f"- 数据总量: {len(lines)}条")
print(f"- 类别数量: {len(model.labels)}种")


【模型评估结果】
- 准确率@1: 61.88%
- 数据总量: 15404条
- 类别数量: 733种


ValueError: F:\NLP算法课程\正式课\0319\作业\cooking_model.bin cannot be opened for saving!

In [10]:
# 保存生产环境模型
MODEL_SAVE_PATH = r'F:\temp\cooking_model.bin'
model.save_model(MODEL_SAVE_PATH)


# 预测示例
test_cases = [
    "how to make beef stew with red wine?",
    "proper temperature for baking chocolate chip cookies",
    "storing fresh fish in refrigerator"
]

for text in test_cases:
    labels, probs = model.predict(text, k=3)
    print(f"\n预测 '{text}':")
    for label, prob in zip(labels, probs):
        print(f"{label.replace('__label__', ''):<25} {prob:.2%}")



预测 'how to make beef stew with red wine?':
stews                     96.15%
beef                      2.30%
jicama                    0.00%

预测 'proper temperature for baking chocolate chip cookies':
baking                    100.00%
cookies                   100.00%
temperature               73.72%

预测 'storing fresh fish in refrigerator':
fish                      100.00%
refrigerator              95.12%
storage-method            93.81%
