In [1]:
import pandas as pd
import re
import multiprocessing
from gensim.models import Word2Vec


In [2]:
with open('data/全唐诗2.txt','r',encoding='utf-8') as f:
    txt = f.read()
all_list = re.split(r'卷\d+_?\d*', txt)
volume_nums = [int(num) for num in re.findall(r'卷(\d+)_?\d*', txt)]
cleaned_parts = [re.sub(r'\s+', ' ', part.strip()) for part in all_list if part.strip()]

In [5]:
# 1. 增加训练数据
with open('data/全唐诗2.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# 2. 更好的文本预处理
sentences = []
for line in text.split('\n'):
    # 移除标点符号
    line = re.sub(r'[^\u4e00-\u9fff]', '', line)
    if len(line) > 1:  # 确保句子长度大于1
        sentences.append(list(line))


In [None]:
# 3. 调整Word2Vec参数
model = Word2Vec(sentences,
                vector_size=100,     # 降低维度
                window=10,           # 扩大窗口以捕获更多上下文
                min_count=10,        # 提高词频阈值过滤低频词
                sg=1,                # 使用skip-gram
                negative=10,         # 增加负采样
                epochs=20,           # 增加训练轮数
                workers=multiprocessing.cpu_count())

In [3]:
# 4. 使用余弦相似度而不是点积
def find_similar_words(word, topn=20):
    try:
        similar_words = model.wv.most_similar(
            positive=[word],
            topn=topn,
            restrict_vocab=None
        )
        return similar_words
    except KeyError:
        return f"词语 '{word}' 不在词汇表中"


In [4]:
model = Word2Vec.load('models/word_vectors.model')
# 5. 测试情感词
emotion_dict = {}
results = {}
emotions = ['悲', '喜', '怒', '乐', '忧', '思']
for emotion in emotions:
    similar = find_similar_words(emotion)
    if isinstance(similar, list):
        # 只显示相似度大于0.45的结果
        emotion_dict[emotion] = [w for w, s in similar if s > 0.45]
        results[emotion] = [(w, s) for w, s in similar if s > 0.45]
emotion_dict

{'悲': ['哀',
  '伤',
  '叹',
  '嗟',
  '泣',
  '凄',
  '愁',
  '衰',
  '恨',
  '哭',
  '恸',
  '怨',
  '暮',
  '吁',
  '念',
  '堪',
  '涕',
  '已',
  '酸',
  '汍'],
 '喜': ['贺',
  '逢',
  '幸',
  '忻',
  '频',
  '遇',
  '再',
  '每',
  '预',
  '同',
  '欣',
  '朋',
  '新',
  '吉',
  '初',
  '睹',
  '陪',
  '相'],
 '怒': ['訇',
  '呀',
  '号',
  '掣',
  '吼',
  '嘻',
  '雷',
  '震',
  '猬',
  '捩',
  '电',
  '诛',
  '狞',
  '霆',
  '轰',
  '骇',
  '哮',
  '鲸',
  '暴',
  '毒'],
 '乐': ['酺',
  '雩',
  '享',
  '歌',
  '顺',
  '邠',
  '觱',
  '豫',
  '鼓',
  '篥',
  '献',
  '纾',
  '泰',
  '笙',
  '章',
  '雅',
  '欢',
  '永',
  '娱',
  '庙'],
 '忧': ['患',
  '虑',
  '雠',
  '疹',
  '防',
  '迫',
  '谤',
  '心',
  '蠲',
  '念',
  '岂',
  '免',
  '悰',
  '身',
  '盗',
  '亦',
  '罹',
  '荼',
  '计',
  '虽'],
 '思': ['忆',
  '望',
  '梦',
  '吟',
  '念',
  '情',
  '恨',
  '怀',
  '想',
  '愁',
  '独',
  '续',
  '对',
  '心',
  '逢',
  '伤']}

In [6]:
find_similar_words('春')

[('花', 0.649898886680603),
 ('晴', 0.6018003225326538),
 ('柳', 0.5961396098136902),
 ('暖', 0.5912441611289978),
 ('杏', 0.5886990427970886),
 ('秋', 0.582026481628418),
 ('莺', 0.5739514231681824),
 ('晚', 0.5727213025093079),
 ('梅', 0.5610203146934509),
 ('芳', 0.5552077889442444),
 ('绿', 0.5495471954345703),
 ('冬', 0.5226607918739319),
 ('愁', 0.5206615924835205),
 ('早', 0.5091170072555542),
 ('初', 0.50760418176651),
 ('日', 0.5068395137786865),
 ('东', 0.5067498087882996),
 ('新', 0.5029377341270447),
 ('长', 0.49475616216659546),
 ('浓', 0.491061806678772)]

### 划分数据集

In [10]:
import json
import numpy as np
from collections import Counter

# 1. 加载数据
with open('data/result.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2. 检查情感标签分布
emotions = list(data.values())
emotion_counts = Counter(emotions)
print("情感标签分布:")
for emotion, count in emotion_counts.items():
    print(f"{emotion}: {count}")

# 3. 过滤样本
min_samples = 100
valid_emotions = {emotion for emotion, count in emotion_counts.items() if count >= min_samples}

# 4. 只保留有效类别的数据
filtered_data = {text: emotion for text, emotion in data.items() if emotion in valid_emotions}

# 5. 转换为列表格式
texts = []
labels = []
for text, emotion in filtered_data.items():
    texts.append(text)
    labels.append(emotion)

# 6. 划分数据集
from sklearn.model_selection import train_test_split

# 首先划分出测试集(20%)
X_temp, X_test, y_temp, y_test = train_test_split(
    texts, labels, 
    test_size=0.2, 
    random_state=42, 
    stratify=labels
)

# 再将剩余数据划分为训练集(64%)和验证集(16%)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,
    random_state=42,
    stratify=y_temp
)

# 7. 保存划分后的数据集
train_data = dict(zip(X_train, y_train))
val_data = dict(zip(X_val, y_val))
test_data = dict(zip(X_test, y_test))

datasets = {
    'train': train_data,
    'val': val_data,
    'test': test_data
}

# 8. 保存数据集并打印统计信息
for split, data in datasets.items():
    with open(f'data/{split}.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    
    # 打印每个数据集的大小和类别分布
    print(f"\n{split}集:")
    print(f"样本数量: {len(data)}")
    print("类别分布:", Counter(data.values()))

情感标签分布:
思: 178601
乐: 79509
惧: 11574
悲: 50775
忧: 56423
喜: 39879
怒: 9590
敬: 10
惊: 17
伤: 1
静: 2
悔: 5
哀: 1
恨: 1
羞: 3

train集:
样本数量: 272864
类别分布: Counter({'思': 114305, '乐': 50886, '忧': 36110, '悲': 32496, '喜': 25522, '惧': 7407, '怒': 6138})

val集:
样本数量: 68216
类别分布: Counter({'思': 28576, '乐': 12721, '忧': 9028, '悲': 8124, '喜': 6381, '惧': 1852, '怒': 1534})

test集:
样本数量: 85271
类别分布: Counter({'思': 35720, '乐': 15902, '忧': 11285, '悲': 10155, '喜': 7976, '惧': 2315, '怒': 1918})


### 训练模型

In [23]:
import json
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 1. 加载Word2Vec模型和训练数据
word2vec_model = Word2Vec.load('models/word_vectors.model')
with open('data/train.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# 2. 将诗句转换为向量表示
def text_to_vec(text, model, vector_size=100):
    """
    将文本转换为向量表示（使用词向量的平均值）
    """
    words = list(text)  # 将文本分成单个字
    vectors = []
    for word in words:
        if word in model.wv:
            vectors.append(model.wv[word])
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(vector_size)  # 如果没有任何词的向量，返回零向量

# 3. 准备训练数据
X = []  # 特征向量
y = []  # 标签

for text, emotion in train_data.items():
    vec = text_to_vec(text, word2vec_model)
    X.append(vec)
    y.append(emotion)

X = np.array(X)
y = np.array(y)

In [22]:
import numpy as np
from tensorflow.keras.models import Sequential,load_model, save_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import pickle

# 2. 对标签进行编码
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_onehot = to_categorical(y_encoded)

# 3. 构建LSTM模型
model = Sequential([
    LSTM(128, input_shape=(1, 100), return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# 4. 编译模型
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# 5. 训练模型
history = model.fit(
    X, y_onehot,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    shuffle=True
)

# 6. 预测函数
def predict_emotion(text, model, word2vec_model, label_encoder):
    # 将文本转换为向量
    vec = text_to_vec(text, word2vec_model)
    vec = vec.reshape(1, 1, 100)
    
    # 预测
    pred = model.predict(vec)
    emotion = label_encoder.inverse_transform([np.argmax(pred)])[0]
    return emotion
# 1. 保存整个模型（使用新的.keras格式）
model.save('models/emotion_lstm_model.keras')

# 2. 保存标签编码器
with open('models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# 加载模型时：
def load_emotion_model():
    # 1. 加载模型
    loaded_model = load_model('models/emotion_lstm_model.keras')
    
    # 2. 加载标签编码器
    with open('models/label_encoder.pkl', 'rb') as f:
        loaded_label_encoder = pickle.load(f)
    
    return loaded_model, loaded_label_encoder

# 使用加载的模型进行预测
def predict_with_loaded_model(text, loaded_model, loaded_label_encoder, word2vec_model):
    # 将文本转换为向量
    vec = text_to_vec(text, word2vec_model)
    vec = vec.reshape(1, 1, 100)
    
    # 预测
    pred = loaded_model.predict(vec)
    emotion = loaded_label_encoder.inverse_transform([np.argmax(pred)])[0]
    return emotion
# 示例使用：
"""
# 加载模型示例
loaded_model, loaded_label_encoder = load_emotion_model()

# 预测示例
test_poem = "生计尚如蓬"
emotion = predict_with_loaded_model(test_poem, loaded_model, loaded_label_encoder, word2vec_model)
print(f"预测情感: {emotion}")
"""

Epoch 1/10


  super().__init__(**kwargs)


[1m3411/3411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.4920 - loss: 1.3750 - val_accuracy: 0.5488 - val_loss: 1.1840
Epoch 2/10
[1m3411/3411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5352 - loss: 1.2126 - val_accuracy: 0.5547 - val_loss: 1.1627
Epoch 3/10
[1m3411/3411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5440 - loss: 1.1926 - val_accuracy: 0.5608 - val_loss: 1.1457
Epoch 4/10
[1m3411/3411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5525 - loss: 1.1780 - val_accuracy: 0.5664 - val_loss: 1.1354
Epoch 5/10
[1m3411/3411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5557 - loss: 1.1664 - val_accuracy: 0.5690 - val_loss: 1.1257
Epoch 6/10
[1m3411/3411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.5563 - loss: 1.1621 - val_accuracy: 0.5719 - val_loss: 1.1188
Epoch 7/10
[1m3411/3

In [42]:
# 预测示例
test_poem = "万代千秋仰圣君"
emotion = predict_emotion(test_poem, model, word2vec_model, label_encoder)
print(f"预测情感: {emotion}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
预测情感: 喜


from tensorflow.keras.models import load_model, save_model
import pickle

# 1. 保存整个模型（使用新的.keras格式）
model.save('models/emotion_lstm_model.keras')

# 2. 保存标签编码器
with open('models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

# 加载模型时：
def load_emotion_model():
    # 1. 加载模型
    loaded_model = load_model('models/emotion_lstm_model.keras')
    
    # 2. 加载标签编码器
    with open('models/label_encoder.pkl', 'rb') as f:
        loaded_label_encoder = pickle.load(f)
    
    return loaded_model, loaded_label_encoder

# 使用加载的模型进行预测
def predict_with_loaded_model(text, loaded_model, loaded_label_encoder, word2vec_model):
    # 将文本转换为向量
    vec = text_to_vec(text, word2vec_model)
    vec = vec.reshape(1, 1, 100)
    
    # 预测
    pred = loaded_model.predict(vec)
    emotion = loaded_label_encoder.inverse_transform([np.argmax(pred)])[0]
    return emotion
# 示例使用：
"""
# 加载模型示例
loaded_model, loaded_label_encoder = load_emotion_model()

# 预测示例
test_poem = "生计尚如蓬"
emotion = predict_with_loaded_model(test_poem, loaded_model, loaded_label_encoder, word2vec_model)
print(f"预测情感: {emotion}")
"""

In [11]:
import pandas as pd
from openai import OpenAI
import time

# 初始化 OpenAI 客户端
client = OpenAI(
    api_key="sk-8U3EuqHdqzCTqbuVuTNClwygtiXwgrHwtU6A1CgijYKH4ZCr",
    base_url="https://api.chatanywhere.tech/v1"
)

# 读取CSV文件
df = pd.read_csv('data/legal.csv')

def analyze_sentiment(poem_content):
    try:
        # 添加延时避免API限制
        time.sleep(1)
        # 定义情绪字典
        emotion_dict = {
            '悲': ['愁', '恸', '痛', '寡', '哀', '伤', '嗟'],
            '惧': ['谗', '谤', '患', '罪', '诈', '惧', '诬'],
            '乐': ['悦', '欣', '乐', '怡', '洽', '畅', '愉'],
            '怒': ['怒', '雷', '吼', '霆', '霹', '猛', '轰'],
            '思': ['思', '忆', '怀', '恨', '吟', '逢', '期'],
            '喜': ['喜', '健', '倩', '贺', '好', '良', '善'],
            '忧': ['恤', '忧', '痾', '虑', '艰', '遑', '厄']
        }
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "你是一个诗词分析专家。请只返回以下情感之一：悲、惧、乐、怒、思、喜、忧"},
                {
                    "role": "user",
                    "content": f"这是一个情绪字典{emotion_dict}。分析这首诗的主要情感，返回json格式 诗句:其对应一个字的情感类别（悲、惧、乐、怒、思、喜、忧）：\n\n{poem_content}"
                }
            ]
        )
        
        sentiment = completion.choices[0].message.content
        # 确保返回的是单个情感词
        # if sentiment.strip() in ['悲', '惧', '乐', '怒', '思', '喜', '忧']:
        #     return sentiment.strip()
        return sentiment
        
    except Exception as e:
        print(f"分析诗句时出错: {str(e)}")
        return None

df_sample = df[100:300].copy()
sentiments = []

for idx, row in df_sample.iterrows():
    print(f"正在分析第 {idx+1} 首诗...")
    sentiment = analyze_sentiment(row['content'])
    sentiments.append(sentiment)
    print(f"情感分析结果: {sentiment}")

# # 将结果添加到DataFrame
# df_sample['sentiment'] = sentiments

# # 显示结果
# print("\n分析结果:")
# print(df_sample[['title', 'poets', 'content', 'sentiment']])



ValueError: cannot reshape array of size 27286400 into shape (40,100)

In [4]:
sentiments_copy = sentiments.copy()
sentiments_copy

['```json\n{\n    "洁野凝晨曜": "乐",\n    "装墀带夕晖": "乐",\n    "集条分树玉": "乐",\n    "拂浪影泉玑": "乐",\n    "色洒妆台粉": "乐",\n    "花飘绮席衣": "乐",\n    "入扇萦离匣": "思",\n    "点素皎残机": "忧"\n}\n```',
 '```json\n{\n    "北阙三春晚": "思",\n    "南荣九夏初": "乐",\n    "黄莺弄渐变": "乐",\n    "翠林花落馀": "忧",\n    "瀑流还响谷": "思",\n    "猿啼自应虚": "忧",\n    "早荷向心卷": "喜",\n    "长杨就影舒": "乐",\n    "此时欢不极": "乐",\n    "调轸坐相於": "乐"\n}\n```',
 '```json\n{\n    "红轮不暂驻": "思",\n    "乌飞岂复停": "思",\n    "岑霞渐渐落": "忧",\n    "溪阴寸寸生": "思",\n    "藿叶随光转": "乐",\n    "葵心逐照倾": "乐",\n    "晚烟含树色": "忧",\n    "栖鸟杂流声": "乐"\n}\n```',
 '{\n    "高轩临碧渚": "乐",\n    "飞檐迥架空": "乐",\n    "馀花攒镂槛": "乐",\n    "残柳散雕栊": "忧",\n    "岸菊初含蕊": "乐",\n    "园梨始带红": "喜",\n    "莫虑昆山暗": "思",\n    "还共尽杯中": "乐"\n}',
 '{\n  "结伴戏方塘": "乐",\n  "携手上雕航": "乐",\n  "船移分细浪": "乐",\n  "风散动浮香": "乐",\n  "游莺无定曲": "乐",\n  "惊凫有乱行": "忧",\n  "莲稀钏声断": "悲",\n  "水广棹歌长": "乐",\n  "栖乌还密树": "思",\n  "泛流归建章": "思"\n}',
 '```json\n{\n    "华林满芳景": "乐",\n    "洛阳遍阳春": "乐",\n    "朱颜含远日": "思",\n    "翠色影长津": "思",\n    "乔柯啭娇鸟":

In [8]:
import os
import re
import json
def clean_json_string(json_str):
    # 移除 ```json 和 ``` 标记
    json_str = re.sub(r'```json\n|\n```', '', json_str)
    return json_str
def process_sentiments(sentiments):

    all_verses = {}
    
    for sentiment_json in sentiments:
        # 清理JSON字符串
        clean_json = clean_json_string(sentiment_json)
        
        try:
            # 解析JSON字符串并直接更新到总字典中
            verse_sentiments = json.loads(clean_json)
            all_verses.update(verse_sentiments)
        except json.JSONDecodeError as e:
            print(f"JSON解析错误: {str(e)}")
    return all_verses
def save_to_json(all_verses, output_path):
    # 如果文件已存在，先读取现有内容
    existing_data = {}
    if os.path.exists(output_path):
        try:
            with open(output_path, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        except json.JSONDecodeError:
            print(f"警告：现有文件 {output_path} 不是有效的JSON格式")
    existing_data.update(all_verses)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, ensure_ascii=False, indent=4)
save_to_json(process_sentiments(sentiments_copy),'data/train.json')

JSON解析错误: Extra data: line 4 column 2 (char 44)


### 获取诗的体裁

In [18]:
import time
from openai import OpenAI
import re
import json
import os
import pandas as pd

def analyze_type(contents):
        # 初始化 OpenAI 客户端
    client = OpenAI(
        api_key="sk-5e329eb2ca954936bc587878b2bef459",
        base_url="https://api.deepseek.com/v1"
    )
    all_results = {}
    output_path = 'data/type.json'
    try:
        # 添加延时避免API限制
        time.sleep(1)
        completion = client.chat.completions.create(
            model='deepseek-chat',
            messages=[
                {"role": "system", "content": "你是一个诗词分析专家。请只返回诗词的体裁"},
                {
                    "role": "user",
                    "content": f"这是诗文内容{contents}。分析诗词的体裁，返回json格式 诗词题目:体裁"
                }
            ]
        )
        content = completion.choices[0].message.content
        return content
        
    except Exception as e:
        print(f"分析诗句时出错: {str(e)}")
        return None
def clean_json_string(json_str):
    # 移除 ```json 和 ``` 标记
    json_str = re.sub(r'```json\n|\n```', '', json_str)
    return json_str
def update_df_with_types(content, df):
    # 清理JSON字符串
    clean_json = clean_json_string(content)
    
    try:
        # 解析JSON字符串为Python对象
        poems_info = json.loads(clean_json)
        
        # 创建type列（如果不存在）
        if 'type' not in df.columns:
            df['type'] = '古诗'
            
        # 更新每首诗的体裁
        for i in range(len(poems_info)):
            title = poems_info[i]['诗词题目']
            poem_type = poems_info[i]['体裁']
            df.loc[df['title'] == title, 'type'] = poem_type
    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {str(e)}")
        print(f"问题JSON字符串: {clean_json}")
    except Exception as e:
        print(f"其他错误: {str(e)}")
    return df
def batch_process_types(df, batch_size=2):
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        content = analyze_type(batch)
        if content:
            df = update_df_with_types(content, df)
        time.sleep(1)  # API限制
    return df

df = pd.read_csv('data/legal.csv')
df2 = df.iloc[range(6),:]

In [21]:
def analyze_type(contents):
    # 将内容列表转换为更简洁的格式
    simplified_contents = []
    for _, row in contents.iterrows():
        simplified_contents.append({
            "title": row['title'],
            "content": row['content']
        })
    # 初始化 OpenAI 客户端
    client = OpenAI(
        api_key="sk-5e329eb2ca954936bc587878b2bef459",
        base_url="https://api.deepseek.com/v1"
    )
    try:
        time.sleep(1)
        completion = client.chat.completions.create(
            model='deepseek-chat',
            messages=[
                {"role": "system", "content": "你是一个诗词分析专家。请只返回诗词的体裁"},
                {
                    "role": "user",
                    "content": f"这是诗文内容{simplified_contents}。分析诗词的体裁，返回json格式 诗词题目:体裁"
                }
            ]
        )
        content = completion.choices[0].message.content
        return content
        
    except Exception as e:
        print(f"分析诗句时出错: {str(e)}")
        return None
def update_df_with_types(content, df):
    # 清理JSON字符串
    clean_json = re.sub(r'```json\n|\n```', '', content)
    
    try:
        # 解析JSON字符串为Python对象
        poems_info = json.loads(clean_json)
        
        # 创建type列（如果不存在）
        if 'type' not in df.columns:
            df['type'] = '古诗'
            
        # 更新每首诗的体裁
        for i in range(len(poems_info)):
            title = poems_info[i]['诗词题目']
            poem_type = poems_info[i]['体裁']
            df.loc[df['title'] == title, 'type'] = poem_type
    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {str(e)}")
        print(f"问题JSON字符串: {clean_json}")
    except Exception as e:
        print(f"其他错误: {str(e)}")
    return df

def batch_process_types(df, batch_size=1):
    """
    分批处理数据，每批处理batch_size条记录
    """
    processed_df = pd.DataFrame()  # 存储处理后的结果
    
    # 计算需要处理的批次数
    total_batches = len(df) // batch_size + (1 if len(df) % batch_size != 0 else 0)
    
    for i in range(total_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        print(f"处理批次 {i+1}/{total_batches} (记录 {start_idx}-{end_idx})")
        
        # 获取当前批次的数据
        batch_df = df.iloc[start_idx:end_idx].copy()
        
        # 处理当前批次
        content = analyze_type(batch_df)
        if content:
            batch_df = update_df_with_types(content, batch_df)
            processed_df = pd.concat([processed_df, batch_df])
        
        print(f"完成批次 {i+1}")
    
    return processed_df

df_processed = batch_process_types(df2, batch_size=2)

处理批次 1/3 (记录 0-2)
完成批次 1
处理批次 2/3 (记录 2-4)
完成批次 2
处理批次 3/3 (记录 4-6)
完成批次 3


In [22]:
df_processed

Unnamed: 0,title,poets,content,volumes,type
0,过晋阳宫,李隆基,缅想封唐处，实惟建国初。俯察伊晋野，仰观乃参虚。 井邑龙斯跃，城池凤翔馀。林塘犹沛泽，台榭宛...,3,五言古诗
1,行次成皋途经先圣擒建德之所缅思功业感而赋诗,李隆基,有隋政昏虐，群雄已交争。先圣按剑起，叱咤风云生。 饮马河洛竭，作气嵩华惊。克敌睿图就，擒俘帝...,3,五言古诗
2,校猎义成喜逢大雪率题九韵以示群官,李隆基,弧矢威天下，旌旗游近县。一面施鸟罗，三驱教人战。 暮云成积雪，晓色开行殿。皓然原隰同，不觉林...,3,五言古诗
3,赐诸州刺史以题座右,李隆基,眷言思共理，鉴梦想维良。猗欤此推择，声绩著周行。 贤能既俟进，黎献实伫康。视人当如子，爱人亦...,3,五言古诗
4,送忠州太守康昭远等,李隆基,端拱临中枢，缅怀共予理。不有台阁英，孰振循良美。 分符侯甸内，拜手明庭里。誓节期饮冰，调人方...,3,五言古诗
5,送李邕之任滑台,李隆基,汉家重东郡，宛彼白马津。黎庶既蕃殖，临之劳近臣。 远别初首路，今行方及春。课成应第一，良牧尔当仁。,3,五言古诗


### 将处理好的数据集合转为df

In [57]:
with open('data/全唐诗t.txt','r',encoding='utf-8') as f:
    txt = f.read()


In [58]:
import re 
all_list = re.split(r'卷\d+_?\d*', txt)
volume_nums = [int(num) for num in re.findall(r'卷(\d+)_?\d*', txt)]
cleaned_parts = [re.sub(r'\s+', ' ', part.strip()) for part in all_list if part.strip()]

In [59]:
def clean_to_df(cleaned_parts,volume_nums):
    # 正则表达式：提取题目、诗人和诗文内容
    pattern = r"【(.*?)】([\u4e00-\u9fa5]+)?\s*(.*)"

    # 用于存储提取结果
    poetry_list = []
    for text,volume in zip(cleaned_parts,volume_nums):
        match = re.match(pattern, text)
        if match:
            title = match.group(1)
            poet = match.group(2) if match.group(2) else '佚名'
            content = match.group(3)
            poetry_list.append({"title": title, "poets": poet, "content": content,'volumes':volume})
    df = pd.DataFrame(poetry_list)
    return df

In [60]:
df = clean_to_df(cleaned_parts,volume_nums)
df.to_csv('data/poemsTEST.csv',index=False,encoding='utf-8')
print('成功保存')

成功保存


In [61]:
df

Unnamed: 0,title,poets,content,volumes
0,太子纳妃太平公主出降,李治,龙楼光曙景，鲁馆启朝扉。艳日浓妆影，低星降婺辉。 玉庭浮瑞色，银榜藻祥徽。云转花萦盖，霞飘叶...,2
1,七夕宴悬圃二首,李治,羽盖飞天汉，凤驾越层峦。俱叹三秋阻，共叙一宵欢。 璜亏夜月落，靥碎晓星残。谁能重操杼，纤手濯...,2
2,过温汤,李治,温渚停仙跸，丰郊驻晓旌。路曲回轮影，岩虚传漏声。 暖溜惊湍驶，寒空碧雾轻。林黄疏叶下，野白曙...,2
3,九月九日,李治,端居临玉扆，初律启金商。凤阙澄秋色，龙闱引夕凉。 野净山气敛，林疏风露长。砌兰亏半影，岩桂发...,2
4,谒慈恩寺题奘法师房,李治,停轩观福殿，游目眺皇畿。法轮含日转，花盖接云飞。 翠烟香绮阁，丹霞光宝衣。幡虹遥合彩，定水迥...,2
...,...,...,...,...
382,雩祀乐章·肃和,佚名,朱鸟开辰，苍龙启映。大帝昭飨，群生展敬。 礼备怀柔，功宣舞咏。旬液应序，年祥协庆。,10
383,雩祀乐章·雍和,佚名,绀筵分彩，宝图吐绚。风管晨凝，云歌晓啭。 肃事兰羞，虔申桂奠。百谷斯登，万箱攸荐。,10
384,雩祀乐章·舒和,佚名,凤曲登歌调令序，龙雩集舞泛祥风。 彩旞云回昭睿德，朱干电发表神功。,10
385,雩祀乐章·豫和,佚名,鸟纬迁序，龙星见辰。纯阳在律，明德崇禋。 五方降帝，万宇安人。恭以致享，肃以迎神。,10


In [63]:
def has_special_chars(text):
        """检查是否包含特殊字符"""
        legal_pattern = r'[^\u4e00-\u9fff\【\】\。\，\（\）\-\、\！\？\"\"\：\·\；\《\》\s]'
        special_chars = re.findall(legal_pattern, text)
        return bool(special_chars), ''.join(set(special_chars))
for content in df['content']:
    status, special = has_special_chars(content)
    if status:
        print(f'特殊字符：{special}', content)

特殊字符：□ 今宵冬律尽，来朝丽景新。花馀凝地雪，条含暖吹分。 绶吐芽犹嫩，冰□已镂津。薄红梅色冷，浅绿柳轻春。 送迎交两节，暄寒变一辰。
特殊字符：□ 奇峰嶾嶙箕山北，秀崿岧峣嵩镇南。地首地肺何曾拟， 天目天台倍觉惭。树影蒙茏鄣叠岫，波深汹涌落悬潭。 □愿紫宸居得一，永欣丹扆御通三。
特殊字符：lq 寿丘惟旧迹，酆邑乃前基。粤予承累圣，悬弧亦在兹。 弱龄逢运改，提剑郁匡时。指麾八荒定，怀柔万国夷。 梯山咸入款，驾海亦来思。单于陪武帐，日逐卫文lq。 端扆朝四岳，无为任百司。霜节明秋景，轻冰结水湄。 芸黄遍原隰，禾颖积京畿。共乐还乡宴，欢比大风诗。
特殊字符：z1 醽醁胜兰生，翠涛过玉z1。千日醉不醒，十年味不败。
特殊字符：1 1 风云喜际会，雷雨遂流滋。荐币虚陈礼，动天实精思。 渐侵九夏节，复在三春时。霢霂垂朱阙，飘飖入绿墀。 郊坰既沾足，黍稷有丰期。百辟同康乐，万方伫雍熙。
特殊字符：1 1 良工运精思，巧极似有神。 临窗忽睹繁阴合，再盼真假殊未分。
特殊字符：1 1 辇路生春草，上林花发时。凭高何限意，无复侍臣知。
特殊字符：1 1 注想待元老，识君恨不早。我家柱石衰，忧来学丘祷。
特殊字符：1 1 上元高会集群仙，心斋何事欲祈年。 丹诚傥彻玉帝座，且共吾人庆大田。 蓂生三五叶初齐，上元羽客出桃蹊。 不爱仙家登真诀，愿蒙四海福黔黎。
特殊字符：1 1 人皆苦炎热，我爱夏日长。--李昂1 熏风自南来，殿阁生微凉。 --柳公权
特殊字符：1 1 只解劈牛兼劈树，不能诛恶与诛凶。（咏雷）
特殊字符：2f 逐仙赏，展幽情，逾昆阆，迈蓬瀛。 游鲁馆，陟秦台。污山壁，愧琼瑰。 檀栾竹影，飙f2松声。不烦歌吹，自足娱情。 仰循茅宇，俯眄乔枝。烟霞问讯，风月相知。 枝条郁郁，文质彬彬。山林作伴，松桂为邻。 清波汹涌，碧树冥蒙。莫怪留步，因攀桂丛。 莫论圆峤，休说方壶。何如鲁馆，即是仙都。 玉环腾远创，金埒荷殊荣。弗玩珠玑饰，仍留仁智情。 凿山便作室，凭树即为楹。公输与班尔，从此遂韬声。 登山一长望，正遇九春初。结驷填街术，闾阎满邑居。 斗雪梅先吐，惊风柳未舒。直愁斜日落，不畏酒尊虚。 霁晓气清和，披襟赏薜萝。玳瑁凝春色，琉璃漾水波。 跂石聊长啸，攀松乍短歌。除非物外者，谁就此经过。 暂尔游山第，淹留惜未归。霞窗明月满，涧户白云飞。 书引藤为架，人将薜作衣。此真攀玩所，临睨赏

### 查找诗文体裁

In [21]:
df['title'].value_counts()[:10]

title
句             9
明皇祀圜丘乐章·寿和    3
雩祀乐章·豫和       2
咏雨            2
明皇祀圜丘乐章·太和    2
明皇祀圜丘乐章·豫和    2
唐大飨拜洛乐章·昭和    2
喜雪            2
玄都观           2
唐大飨拜洛乐章·归和    2
Name: count, dtype: int64

In [None]:
df['体裁'] = df['题目'].apply(lambda x: '乐府诗' if '·' in x else '其他诗歌')


In [69]:
# 定义体裁判断函数
def classify_poetry_genre(title):
    # 判断题目中是否包含 "·" 来判定是否为乐府诗
    if '·' in title:
        # 判断 "·" 后面是否包含 "第" 字
        if '第' in title.split('·')[1]:
            return '古体诗'  # 包含 "第" 字，属于古体诗
        else:
            return '乐府诗'  # 否则为乐府诗
    else:
        return '其他诗歌'


            题目   诗人 诗文内容    体裁
0   明皇祀圜丘乐章·寿和   李白  内容1   乐府诗
1  咸亨殿宴近臣诸亲柏梁体   李治  内容2  其他诗歌
2           登高  王之涣  内容3  其他诗歌


### 字向量加聚类

In [42]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def optimize_clusters(vectors, max_clusters=10):
    """使用轮廓系数优化聚类数量"""
    silhouette_scores = []
    
    for n_clusters in range(2, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        labels = kmeans.fit_predict(vectors)
        score = silhouette_score(vectors, labels)
        silhouette_scores.append(score)
        
    optimal_clusters = silhouette_scores.index(max(silhouette_scores)) + 2
    return optimal_clusters

def cluster_words(model, word_list):
    """对词向量进行聚类"""
    # 获取词向量
    vectors = []
    valid_words = []
    
    for word in word_list:
        if word in model.wv:
            vectors.append(model.wv[word])
            valid_words.append(word)
    
    vectors = np.array(vectors)
    
    # 优化聚类数量
    n_clusters = optimize_clusters(vectors)
    
    # 执行聚类
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(vectors)
    
    # 整理聚类结果
    clusters = {}
    for word, label in zip(valid_words, labels):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(word)
        
    return clusters

In [43]:
# 评估模型性能
def evaluate_model(model, test_words, topn=20):
    """评估模型在相似词任务上的表现"""
    for word in test_words:
        if word in model.wv:
            similar_words = model.wv.most_similar(
                        positive=[word],
                        topn=topn,
                        restrict_vocab=None
                    )
            print(f"\n{word} 的相似词:")
            for similar_word, score in similar_words[:5]:
                print(f"{similar_word}: {score:.4f}")

In [48]:
import os 
def main():
    with open('data/全唐诗.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    # 2. 更好的文本预处理   
    sentences = []
    for line in text.split('\n'):
        # 移除标点符号
        line = re.sub(r'[^\u4e00-\u9fff]', '', line)
        if len(line) > 1:  # 确保句子长度大于1
            sentences.append(list(line))
    vector_file = "models/word_vectors.model"
    if os.path.exists(vector_file):
        model = Word2Vec.load(vector_file)
    else:
        model = Word2Vec(sentences,
                vector_size=100,     # 降低维度
                window=10,           # 扩大窗口以捕获更多上下文
                min_count=10,        # 提高词频阈值过滤低频词
                sg=1,                # 使用skip-gram
                negative=10,         # 增加负采样
                epochs=20,           # 增加训练轮数
                workers=multiprocessing.cpu_count())
        model.save(vector_file)
    # 5. 测试情感词
    emotion_dict = {}
    results = {}
    emotions = ['悲', '喜', '怒', '乐', '忧', '思']
    for emotion in emotions:
        similar = evaluate_model(model, emotion)
        if isinstance(similar, list):
            # 只显示相似度大于0.45的结果
            emotion_dict[emotion] = [w for w, s in similar if s > 0.45]
            results[emotion] = [(w, s) for w, s in similar if s > 0.45]
    print("\n模型训练结果:")
    print(emotion_dict)

if __name__ == "__main__":
    main()

Model loaded succeed

悲 的相似词:
哀: 0.7226
伤: 0.6705
叹: 0.6043
嗟: 0.6031
泣: 0.5967

喜 的相似词:
贺: 0.5433
逢: 0.5379
幸: 0.5123
忻: 0.5070
频: 0.5007

怒 的相似词:
訇: 0.6412
呀: 0.5794
号: 0.5788
掣: 0.5545
吼: 0.5482

乐 的相似词:
酺: 0.5513
雩: 0.5343
享: 0.5297
歌: 0.5261
顺: 0.5153

忧 的相似词:
患: 0.5945
虑: 0.5510
雠: 0.5358
疹: 0.5254
防: 0.5199

思 的相似词:
忆: 0.6287
望: 0.5865
梦: 0.5665
吟: 0.5279
念: 0.5202

模型评估结果:
{}
