## 🧰 Step 1. 导入依赖与路径设置
本节导入所需依赖包，并配置基础路径。
- 若第一次运行，需要确保：
  - `lib/` 目录下包含 `glove.840B.300d.txt`
  - `corpus/imdb/` 目录下包含 Kaggle 提供的 `labeledTrainData.tsv` 与 `testData.tsv`（`unlabeledTrainData.tsv` 为可选文件）
## 💾 Step 2. 加载或生成 GloVe 向量（.kv 格式）
此步骤会检测 `lib/` 目录下的文件：
1. 若已有 `glove.840B.300d.kv`，则直接加载；
2. 若没有 `.kv`，则优先读取 `.gensim.txt`；若该文件不存在或读取失败，再以 `no_header=True` 读取原始 `.txt`，并自动保存为 `.kv`。
`.kv` 格式的优势：
- 加载更快（几秒）
- 占内存更少（mmap 方式）
- 避免 `EOFError` 及文件损坏风险


In [11]:
# -*- coding: utf-8 -*-
"""
glove_convert.py
用途：将 glove.840B.300d.txt 转换为高效的 gensim / kv 格式
支持自动检测文件、异常修复、日志输出
"""
import os
import logging
from gensim.models import KeyedVectors

# ========== Logging configuration ==========
logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s",
                    level=logging.INFO)
logger = logging.getLogger(__name__)

# ========== File paths ==========
glove_txt = "lib/glove.840B.300d.txt"          # original GloVe text file (read with no_header=True)
gensim_txt = "lib/glove.840B.300d.gensim.txt"  # converted text file (read as word2vec text format)
kv_bin = "lib/glove.840B.300d.kv"              # gensim KeyedVectors cache written by save()

# ========== Loading ==========
# Sources are tried from fastest to slowest: .kv cache -> .gensim.txt -> raw .txt.
# `wvmodel` stays None until one of them loads successfully.
logger.info("开始检测并加载 GloVe 文件...")

wvmodel = None

# 1) Prefer the .kv cache, memory-mapped read-only.
if os.path.exists(kv_bin):
    logger.info("检测到 .kv 文件，直接加载（mmap='r'）: %s", kv_bin)
    try:
        wvmodel = KeyedVectors.load(kv_bin, mmap='r')
    except Exception as e:
        # A damaged or incomplete cache must not be fatal: fall through to
        # the text sources below, which rebuild and overwrite the cache.
        logger.warning("加载 .kv 失败，将尝试从文本文件重新生成: %s", e)
        wvmodel = None

# 2) No usable .kv: try the .gensim.txt file.
if wvmodel is None and os.path.exists(gensim_txt):
    logger.info("检测到 .gensim.txt 文件，尝试加载: %s", gensim_txt)
    try:
        wvmodel = KeyedVectors.load_word2vec_format(gensim_txt, binary=False)
    except Exception as e:
        logger.warning("加载 .gensim.txt 失败: %s", e)
        wvmodel = None
    else:
        logger.info("加载成功，保存为 .kv 以便下次快速读取...")
        # Writing the cache is best-effort here: the vectors are already in
        # memory, so a failed save is reported as a save failure and the
        # script continues.
        try:
            wvmodel.save(kv_bin)
        except Exception as e:
            logger.warning("保存 .kv 失败: %s", e)
        else:
            logger.info("已保存至: %s", kv_bin)

# 3) Nothing loaded so far (missing or unreadable): read the raw .txt with
#    no_header=True and write the cache.
if wvmodel is None:
    if not os.path.exists(glove_txt):
        raise FileNotFoundError("未找到原始 GloVe 文件，请确认路径正确。")
    logger.warning("尝试从原始 glove.840B.300d.txt 加载（no_header=True）...")
    wvmodel = KeyedVectors.load_word2vec_format(glove_txt, binary=False, no_header=True)
    logger.info("加载成功！保存为 .kv 缓存中...")
    wvmodel.save(kv_bin)
    logger.info("保存完成：%s", kv_bin)

# ========== Report the result ==========
logger.info("✅ 加载完成：词汇量 = %d，维度 = %d",
            len(wvmodel.key_to_index), wvmodel.vector_size)


2025-10-19 13:15:35,603: INFO: 开始检测并加载 GloVe 文件...
2025-10-19 13:15:35,608: INFO: 检测到 .gensim.txt 文件，尝试加载: lib/glove.840B.300d.gensim.txt
2025-10-19 13:15:35,609: INFO: loading projection weights from lib/glove.840B.300d.gensim.txt
2025-10-19 13:20:30,262: INFO: loading projection weights from lib/glove.840B.300d.txt
2025-10-19 13:25:28,961: INFO: KeyedVectors lifecycle event {'msg': 'loaded (2196017, 300) matrix of type float32 from lib/glove.840B.300d.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-10-19T13:25:28.960865', 'gensim': '4.3.3', 'python': '3.12.3 | packaged by conda-forge | (main, Apr 15 2024, 18:20:11) [MSC v.1938 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26200-SP0', 'event': 'load_word2vec_format'}
2025-10-19 13:25:28,963: INFO: 加载成功！保存为 .kv 缓存中...
2025-10-19 13:25:28,967: INFO: KeyedVectors lifecycle event {'fname_or_handle': 'lib/glove.840B.300d.kv', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-10-19T13:25:28.

## 🎬 Step 3. 构建 IMDB 数据集
此步骤执行以下流程：
1. 加载 IMDB 训练 / 测试 `.tsv` 文件  
2. 使用 `BeautifulSoup` 清洗文本  
3. 构建词汇表  
4. 编码为索引序列并填充至固定长度  
5. 使用 GloVe 向量构建 embedding 矩阵  
6. 序列化输出到 `pickle/imdb_glove.pickle3`


In [13]:
# -*- coding: utf-8 -*-
"""
imdb_glove.py
功能：将 IMDB 影评数据集转换为基于 GloVe 预训练词向量的索引化张量数据
输出：pickle/imdb_glove.pickle3
"""

import csv
import logging
import os
import re
import sys
from itertools import chain

import pandas as pd
import torch
import pickle
from bs4 import BeautifulSoup
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

# ========= Basic parameters =========
embed_size = 300                          # width of each embedding row
max_len = 512                             # fixed length every sample is cut/padded to
imdb_dir = "./corpus/imdb"                # directory holding the IMDB .tsv files
pickle_out = "pickle/imdb_glove.pickle3"  # where the processed dataset is written

# ========= Dataset paths =========
# All three files live directly under imdb_dir; the unlabeled file is optional.
train_path, test_path, unlabeled_path = (
    os.path.join(imdb_dir, tsv_name)
    for tsv_name in ("labeledTrainData.tsv", "testData.tsv", "unlabeledTrainData.tsv")
)

# ========= GloVe path =========
# KeyedVectors cache expected to exist already.
kv_file = os.path.join("lib", "glove.840B.300d.kv")


# ========== 文本清洗函数 ==========
def review_to_wordlist(review, remove_stopwords=False):
    """Turn one raw review into a list of lowercase, letters-only tokens.

    Args:
        review: Raw review value; it is converted with ``str()`` before parsing.
        remove_stopwords: Accepted for interface compatibility; this function
            does not use it.

    Returns:
        The whitespace-separated tokens left after extracting the text with
        BeautifulSoup's ``html.parser``, replacing every character outside
        ``a-z`` / ``A-Z`` with a space, and lowercasing.
    """
    soup = BeautifulSoup(str(review), "html.parser")
    letters_only = re.sub(r"[^a-zA-Z]", " ", soup.get_text())
    return letters_only.lower().split()


# ========== 编码函数 ==========
def encode_samples(tokenized_samples, word_to_idx):
    """Map every token of every sample to its integer index.

    Args:
        tokenized_samples: Iterable of token sequences.
        word_to_idx: Mapping from token to index.

    Returns:
        A list with one list of indices per sample; tokens missing from
        ``word_to_idx`` are encoded as 0.
    """
    lookup = word_to_idx.get
    return [[lookup(token, 0) for token in sample] for sample in tokenized_samples]


# ========== Padding 函数 ==========
def pad_samples(features, maxlen=max_len, PAD=0):
    """Force every sequence to exactly ``maxlen`` items.

    Args:
        features: Iterable of index lists.
        maxlen: Target length; defaults to the module-level ``max_len``.
        PAD: Value appended to sequences shorter than ``maxlen``.

    Returns:
        A new list in which longer sequences are cut to their first
        ``maxlen`` items and shorter ones are right-padded with ``PAD``.
    """
    return [
        seq[:maxlen] if len(seq) >= maxlen else seq + [PAD] * (maxlen - len(seq))
        for seq in features
    ]


# ========== 主程序 ==========
def main():
    """Build the indexed IMDB dataset with GloVe embeddings and pickle it.

    Pipeline: check input paths, read the train/test TSV files, tokenize the
    reviews, build the vocabulary from train + test tokens, split train into
    train/validation (80/20, ``random_state=0``), encode and pad every review
    to ``max_len``, fill an embedding matrix from the GloVe KeyedVectors, and
    dump everything to ``pickle_out``.

    The pickled object is a list, in this order: train features, train
    labels, validation features, validation labels, test features, embedding
    weight matrix, ``word_to_idx``, ``idx_to_word``, ``vocab``.

    Raises:
        FileNotFoundError: If a required TSV file or the ``.kv`` file is missing.
    """
    # ========== Logging configuration ==========
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format="%(asctime)s: %(levelname)s: %(message)s")
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", " ".join(sys.argv))

    # ========== Path checks ==========
    for p in [train_path, test_path]:
        if not os.path.exists(p):
            raise FileNotFoundError(f"未找到数据文件：{p}，请确认 IMDB TSV 已放在 {imdb_dir}")

    if not os.path.exists(kv_file):
        raise FileNotFoundError(f"未找到 {kv_file}，请先运行 glove_convert.py 生成 .kv 文件")

    # os.makedirs("") raises, so only create the directory when the output
    # path actually has a directory component.
    out_dir = os.path.dirname(pickle_out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # ========== Read the IMDB data ==========
    train = pd.read_csv(train_path, header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
    test = pd.read_csv(test_path, header=0, delimiter="\t", quoting=csv.QUOTE_NONE)
    logger.info("train shape: %s | test shape: %s", train.shape, test.shape)

    # ========== Clean and tokenize ==========
    logger.info("cleaning & tokenizing train/test reviews ...")
    # Iterate the columns positionally so reviews and labels stay aligned
    # regardless of the DataFrame's index labels.
    clean_train_reviews = [review_to_wordlist(review) for review in train["review"]]
    train_labels = [int(label) for label in train["sentiment"]]
    clean_test_reviews = [review_to_wordlist(review) for review in test["review"]]

    # ========== Build the vocabulary ==========
    logger.info("building vocabulary ...")
    vocab = set(chain.from_iterable(clean_train_reviews))
    vocab.update(chain.from_iterable(clean_test_reviews))
    vocab_size = len(vocab)
    logger.info("vocab size: %d", vocab_size)

    # ========== Train / validation split ==========
    logger.info("train/val split ...")
    train_reviews, val_reviews, train_labels_arr, val_labels_arr = train_test_split(
        clean_train_reviews, train_labels, test_size=0.2, random_state=0
    )

    # ========== Load GloVe ==========
    logger.info("loading GloVe model from %s", kv_file)
    wvmodel = KeyedVectors.load(kv_file, mmap='r')
    logger.info("GloVe loaded: vocab=%d, dim=%d",
                len(wvmodel.key_to_index), wvmodel.vector_size)

    # ========== Build word <-> index maps ==========
    logger.info("building word_to_idx / idx_to_word ...")
    # Number the words in sorted order: iterating the set directly yields an
    # order that can change from one interpreter run to the next, which would
    # make the saved indices (and the embedding rows) non-reproducible.
    # Index 0 is reserved for "<unk>".
    idx_to_word = dict(enumerate(sorted(vocab), start=1))
    idx_to_word[0] = "<unk>"
    word_to_idx = {word: idx for idx, word in idx_to_word.items()}

    # ========== Encode and pad ==========
    logger.info("encoding & padding ...")
    train_features = torch.tensor(pad_samples(encode_samples(train_reviews, word_to_idx)))
    val_features = torch.tensor(pad_samples(encode_samples(val_reviews, word_to_idx)))
    test_features = torch.tensor(pad_samples(encode_samples(clean_test_reviews, word_to_idx)))

    train_labels_t = torch.tensor(train_labels_arr, dtype=torch.long)
    val_labels_t = torch.tensor(val_labels_arr, dtype=torch.long)

    # ========== Build the embedding weight matrix ==========
    logger.info("building embedding weight matrix ...")
    # Row 0 ("<unk>") and rows of words absent from GloVe stay all-zero.
    weight = torch.zeros(vocab_size + 1, embed_size, dtype=torch.float32)

    exist = 0
    for word, idx in word_to_idx.items():
        if word == "<unk>":
            continue
        if word in wvmodel.key_to_index:
            weight[idx, :] = torch.from_numpy(wvmodel.get_vector(word))
            exist += 1
    # Guard the percentage against an empty vocabulary.
    coverage = 100 * exist / vocab_size if vocab_size else 0.0
    logger.info("initialized embeddings for %d / %d words (%.2f%% covered)",
                exist, vocab_size, coverage)

    # ========== Save ==========
    logger.info("saving dataset to %s ...", pickle_out)
    with open(pickle_out, "wb") as f:
        pickle.dump(
            [
                train_features,
                train_labels_t,
                val_features,
                val_labels_t,
                test_features,
                weight,
                word_to_idx,
                idx_to_word,
                vocab,
            ],
            f,
        )
    logger.info("✅ Done! Saved preprocessed data to %s", pickle_out)


if __name__ == "__main__":
    main()


2025-10-19 13:43:08,923: INFO: running D:\Anaconda\Lib\site-packages\ipykernel_launcher.py -f C:\Users\22711\AppData\Roaming\jupyter\runtime\kernel-a488ede4-750d-4260-805b-7db9d5cf9eb4.json
2025-10-19 13:43:09,673: INFO: train shape: (25000, 3) | test shape: (25000, 2)
2025-10-19 13:43:09,674: INFO: cleaning & tokenizing train/test reviews ...
  review_text = BeautifulSoup(str(review), "html.parser").get_text()
2025-10-19 13:43:24,728: INFO: building vocabulary ...
2025-10-19 13:43:25,303: INFO: vocab size: 101399
2025-10-19 13:43:25,303: INFO: train/val split ...
2025-10-19 13:43:25,342: INFO: loading GloVe model from lib\glove.840B.300d.kv
2025-10-19 13:43:25,343: INFO: loading KeyedVectors object from lib\glove.840B.300d.kv
2025-10-19 13:43:26,273: INFO: loading vectors from lib\glove.840B.300d.kv.vectors.npy with mmap=r
2025-10-19 13:43:26,288: INFO: KeyedVectors lifecycle event {'fname': 'lib\\glove.840B.300d.kv', 'datetime': '2025-10-19T13:43:26.288106', 'gensim': '4.3.3', 'pytho

## 🔍 Step 4. 验证数据文件
加载并查看结构是否正确。


In [15]:
# -*- coding: utf-8 -*-
"""
check_imdb_pickle.py
快速检查 imdb_glove.pickle3 是否正确生成
"""

import pickle
import torch

# Location of the dataset pickle to inspect.
path = "pickle/imdb_glove.pickle3"

# NOTE: pickle.load executes whatever the file contains; only open pickles
# you generated yourself.
with open(path, "rb") as f:
    data = pickle.load(f)

# The pickle holds nine items in a fixed order; unpack them by position.
(
    train_features,
    train_labels,
    val_features,
    val_labels,
    test_features,
    weight,
    word_to_idx,
    idx_to_word,
    vocab,
) = data

# Print the shape of each split and of the embedding matrix.
print("✅ 数据文件加载成功！\n")
print(f"训练集特征: {train_features.shape}, 标签: {train_labels.shape}")
print(f"验证集特征: {val_features.shape}, 标签: {val_labels.shape}")
print(f"测试集特征: {test_features.shape}")
print(f"Embedding 矩阵: {weight.shape}")
print(f"词汇表大小: {len(vocab)}")
print(f"示例词向量（'good'）:")
# Falls back to row 0 when 'good' is not in the mapping; show the first 10 dims.
good_row = weight[word_to_idx.get('good', 0)]
print(good_row[:10])


✅ 数据文件加载成功！

训练集特征: torch.Size([20000, 512]), 标签: torch.Size([20000])
验证集特征: torch.Size([5000, 512]), 标签: torch.Size([5000])
测试集特征: torch.Size([25000, 512])
Embedding 矩阵: torch.Size([101400, 300])
词汇表大小: 101399
示例词向量（'good'）:
tensor([-0.4263,  0.4431, -0.3452, -0.1326, -0.0582,  0.0526,  0.2157, -0.3672,
        -0.0452,  2.2444])
