In [3]:
import pandas as pd # 修复了错误的导入语句
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Concatenate, Dropout 
from sklearn.model_selection import KFold # 导入 KFold 用于交叉验证
from sklearn.metrics import f1_score # 导入 f1_score 用于最终评估
import re # We need the regex library for data cleaning
# 修正: 导入 to_categorical
from tensorflow.keras.utils import to_categorical 

# 注意: 要达到 f1-score > 0.81, 必须安装并使用中文分词库 (如 jieba)。
# 请确保在您的环境中已安装 'jieba': pip install jieba

# --- 1. CONFIGURATION AND HYPERPARAMETERS ---

# Model hyperparameters. These module-level constants are read by the
# tokenization/padding step and by build_text_cnn_model().
VOCAB_SIZE = 20000  # Vocabulary size (Tokenizer num_words / Embedding input_dim)
EMBEDDING_DIM = 100 # Dimension of each word-embedding vector
MAX_LEN = 300       # Maximum input sequence length (sequences are padded/truncated to this)
FILTERS = 128       # Number of convolution filters per kernel size
KERNEL_SIZES = [3, 4, 5] # Convolution kernel sizes (n-gram widths)
NUM_CLASSES = 3     # Number of sentiment classes (positive, neutral, negative)
BATCH_SIZE = 32     # Mini-batch size used for training
EPOCHS = 10         # Kept low so that each cross-validation fold does not train for too long
DROPOUT_RATE = 0.5  # Dropout rate applied to the concatenated pooled features

# Cross-validation configuration
N_SPLITS = 5        # 5-fold cross-validation

# --- 2. DATA LOADING AND MERGING ---

print("Step 2: Loading and Merging Data...")

try:
    # Load the datasets
    df_train_data = pd.read_csv("Train_DataSet.csv")
    df_train_labels = pd.read_csv("Train_DataSet_Label.csv")
    df_test = pd.read_csv("Test_DataSet.csv")

    LABEL_COL = 'label'

    # The first column of the training data is treated as the row identifier;
    # the label values are read from the LAST column of the label file so that
    # an (id, label) file never has its id column mistaken for the label.
    id_col = df_train_data.columns[0]
    label_src_col = df_train_labels.columns[-1]

    if id_col in df_train_labels.columns and label_src_col != id_col:
        # Preferred path: join on the shared identifier, which stays correct
        # even when the two files differ in row count or row order.
        label_frame = df_train_labels[[id_col, label_src_col]].rename(
            columns={label_src_col: LABEL_COL}
        )
        df_train = pd.merge(df_train_data, label_frame, on=id_col)
        df_train_data = df_train
    elif df_train_data.shape[0] == df_train_labels.shape[0]:
        # No shared identifier column: fall back to positional alignment.
        df_train_data[LABEL_COL] = df_train_labels[label_src_col].to_numpy()
    else:
        raise ValueError(
            "Cannot align labels with training data: no shared ID column "
            f"'{id_col}' and row counts differ "
            f"({df_train_data.shape[0]} vs {df_train_labels.shape[0]})."
        )

    # Pick the text column: the second column, unless that is the label/ID
    # column, in which case the third column is used.
    TEXT_COL = df_train_data.columns[1] if df_train_data.columns[1] not in ['label', 'ID'] else df_train_data.columns[2]

    # Missing text is mapped to an empty string; astype(str) alone would turn
    # NaN into the literal token "nan".
    X_train_text = df_train_data[TEXT_COL].fillna('').astype(str).tolist()
    y_train_labels = df_train_data[LABEL_COL].values

    # Read the test text from the same column used for training when the test
    # file has it; otherwise keep the positional (second column) fallback.
    test_text_series = df_test[TEXT_COL] if TEXT_COL in df_test.columns else df_test.iloc[:, 1]
    X_test_text = test_text_series.fillna('').astype(str).tolist()

except Exception as e:
    # Top-level boundary of the script: report the problem and stop with a
    # non-zero exit status (the site-provided exit() would report success).
    print("Error during data loading/merging. Please check your file paths and column names.")
    print(f"Error details: {e}")
    raise SystemExit(1) from e

# --- 3. DATA PREPROCESSING (CHINESE WORD SEGMENTATION, CLEANING, TOKENIZATION) ---

# STEP 3.0: Data cleaning and segmentation function

# Patterns are compiled once instead of being looked up on every call.
_DIGITS_RE = re.compile(r'\d+')
_NON_WORD_RE = re.compile(r'[^\w\s\u4e00-\u9fff]')
_WHITESPACE_RE = re.compile(r'\s+')

_JIEBA_CUT = None        # cached jieba.cut callable (None when jieba is unavailable)
_JIEBA_CHECKED = False   # True once the jieba import has been attempted


def _get_jieba_cut():
    """Return ``jieba.cut``, or None if jieba is not installed (warns only once)."""
    global _JIEBA_CUT, _JIEBA_CHECKED
    if not _JIEBA_CHECKED:
        _JIEBA_CHECKED = True
        try:
            import jieba
            _JIEBA_CUT = jieba.cut
        except ImportError:
            # Without jieba the cleaned text is returned unsegmented.
            print("Warning: 'jieba' not installed or imported. Proceeding without Chinese segmentation.")
    return _JIEBA_CUT


def clean_and_segment_text(text):
    """Clean a raw document and segment it into space-separated tokens.

    Cleaning removes digit runs, replaces every character that is neither a
    word character, whitespace nor a CJK ideograph with a space (so that words
    on either side of punctuation are not glued together), and collapses
    whitespace. If jieba is importable the cleaned text is segmented with
    ``jieba.cut`` and the non-blank tokens are joined with single spaces;
    otherwise the cleaned text is returned as is.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned (and, when jieba is available, segmented) string.
    """
    # 1. Simple cleaning
    text = _DIGITS_RE.sub('', text)            # remove numbers
    text = _NON_WORD_RE.sub(' ', text)         # punctuation/symbols -> separator
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # 2. Chinese word segmentation
    cut = _get_jieba_cut()
    if cut is None:
        return text
    # jieba.cut also yields whitespace tokens; drop them so the result has
    # exactly one space between tokens.
    return " ".join(token for token in cut(text) if token.strip())

print("Step 3: Applying Chinese Segmentation and Cleaning...")

# Run every training and test document through the cleaning/segmentation step.
X_train_segmented = list(map(clean_and_segment_text, X_train_text))
X_test_segmented = list(map(clean_and_segment_text, X_test_text))


print("Step 3.1: Tokenizing and Padding Text (Based on full dataset)...")

# 3.1 Tokenization: the vocabulary is fitted on the full training corpus only;
# words outside the vocabulary map to the "<unk>" token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<unk>")
tokenizer.fit_on_texts(X_train_segmented)

# Turn each corpus into lists of integer word ids.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (X_train_segmented, X_test_segmented)
)

# 3.2 Padding: both corpora share the same length and pad/truncate policy.
_PAD_OPTIONS = dict(maxlen=MAX_LEN, padding='post', truncating='post')
X_padded = pad_sequences(train_sequences, **_PAD_OPTIONS)
X_test_padded = pad_sequences(test_sequences, **_PAD_OPTIONS)

# 3.3 Label encoding: one-hot encode the full label vector.
y_onehot = to_categorical(y_train_labels, num_classes=NUM_CLASSES)

# --- 4. MODEL DEFINITION FUNCTION ---

def build_text_cnn_model():
    """Build and compile a TextCNN classifier.

    The network embeds integer token ids, applies one Conv1D + global max
    pooling branch per entry of ``KERNEL_SIZES``, concatenates the pooled
    features, and passes them through dropout, a 128-unit ReLU layer and a
    softmax output over ``NUM_CLASSES`` classes. Architecture sizes come from
    the module-level hyperparameter constants.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    # 4.1 Input layer: one row of MAX_LEN token ids per sample
    input_layer = Input(shape=(MAX_LEN,))

    # 4.2 Embedding layer
    # The sequence length is already fixed by the Input layer; the
    # `input_length` argument is deprecated in Keras 3 and is not passed.
    embedding_layer = Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=EMBEDDING_DIM,
    )(input_layer)

    # 4.3 Convolutional block: one branch per kernel size
    conv_blocks = []
    for k_size in KERNEL_SIZES:
        conv = Conv1D(
            filters=FILTERS,
            kernel_size=k_size,
            activation='relu',
        )(embedding_layer)

        # Global max pooling keeps the strongest response of each filter
        pool = GlobalMaxPooling1D()(conv)
        conv_blocks.append(pool)

    # 4.4 Concatenation, dropout, and output layer
    merged_feature_vector = Concatenate()(conv_blocks)
    dropout_layer = Dropout(DROPOUT_RATE)(merged_feature_vector)
    dense_layer = Dense(128, activation='relu')(dropout_layer)
    output_layer = Dense(NUM_CLASSES, activation='softmax')(dense_layer)

    model = Model(inputs=input_layer, outputs=output_layer)

    # 4.5 Compilation: the labels are one-hot encoded, hence categorical
    # cross-entropy.
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

# --- 5. K-FOLD CROSS-VALIDATION AND TRAINING ---

print(f"\nStep 5: Starting {N_SPLITS}-Fold Cross-Validation...")

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
fold_accuracies = []
fold_f1_scores = []

for fold, (train_index, val_index) in enumerate(kf.split(X_padded, y_onehot)):
    print(f"\n--- Fold {fold+1}/{N_SPLITS} ---")

    # A fresh model is built for every fold. Reset Keras' global state first so
    # that graphs/layers from earlier folds do not pile up in memory.
    tf.keras.backend.clear_session()

    # Split into this fold's training and validation subsets
    X_train, X_val = X_padded[train_index], X_padded[val_index]
    y_train, y_val = y_onehot[train_index], y_onehot[val_index]

    # Build and train the model
    model = build_text_cnn_model()

    # Printing the summary for every fold would be too verbose; print it once.
    if fold == 0:
        print("Model Summary (First Fold):")
        model.summary()

    history = model.fit(
        X_train, y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(X_val, y_val),
        verbose=1 # 1 = show the progress bar
    )

    # Validation-set evaluation
    # Predicted class probabilities
    val_probabilities = model.predict(X_val)
    # Convert to predicted class indices (0, 1, 2)
    val_predictions = np.argmax(val_probabilities, axis=1)
    # Convert the one-hot targets back to class indices (0, 1, 2)
    val_true_labels = np.argmax(y_val, axis=1)

    # Macro-averaged F1 weights every class equally
    f1 = f1_score(val_true_labels, val_predictions, average='macro')

    # Record the results (accuracy is the last epoch's validation accuracy)
    fold_accuracies.append(history.history['val_accuracy'][-1])
    fold_f1_scores.append(f1)
    print(f"Fold {fold+1} Validation F1-score: {f1:.4f}")

# --- 6. FINAL ANALYSIS AND PREDICTION ---

# Summarize the cross-validation results.
avg_f1 = np.mean(fold_f1_scores)
mean_val_accuracy = np.mean(fold_accuracies)
print("\n--- Cross-Validation Complete ---")
print(f"Average Validation Accuracy across {N_SPLITS} folds: {mean_val_accuracy:.4f}")
print(f"Average Validation F1-score across {N_SPLITS} folds: {avg_f1:.4f}")

# Final prediction: a fresh model is trained on the complete training set
# (none of the per-fold models is reused) and applied to the test data.
print("\nStep 6: Predicting on the Test Data using the final model...")

final_model = build_text_cnn_model()
final_model.fit(
    X_padded,
    y_onehot,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=0,  # silent training
)

# 6.1 Predict class probabilities for the unseen test data and take the
# most probable class for each row.
test_probabilities = final_model.predict(X_test_padded)
test_predictions = np.argmax(test_probabilities, axis=1)

# 6.2 Build the submission frame: a copy of the test data plus the predictions.
df_results = df_test.assign(Predicted_Sentiment_Label=test_predictions)

# Write the results to a new CSV file.
output_filename = "Test_DataSet_Predictions.csv"
df_results.to_csv(output_filename, index=False)

print(f"\nTraining Complete! Results saved to {output_filename}")
print("Sample of predictions:")
print(df_results.head())

# --- 7. GUIDANCE ---
# Choose the feedback line matching the achieved average F1-score.
if avg_f1 > 0.81:
    guidance_message = f"\nCongratulations! Your average F1-score of {avg_f1:.4f} meets the target of 0.81!"
elif avg_f1 > 0.70:
    guidance_message = f"\nYour average F1-score of {avg_f1:.4f} meets the Baseline requirement, but not the higher target of 0.81."
else:
    guidance_message = f"\nYour average F1-score of {avg_f1:.4f} is currently below the Baseline requirement of 0.70."
print(guidance_message)

print(
    "\nTo improve the score to > 0.81, ensure 'jieba' is installed and used for Chinese segmentation. You may also experiment with:\n"
    "1. Increasing the Embedding Dimension (EMBEDDING_DIM).\n"
    "2. Using a pre-trained Word Embedding model (e.g., Word2Vec, BERT).\n"
    "3. Adjusting the Dropout Rate (DROPOUT_RATE) or Epochs (EPOCHS)."
)

Step 2: Loading and Merging Data...
Step 3: Applying Chinese Segmentation and Cleaning...
Step 3.1: Tokenizing and Padding Text (Based on full dataset)...

Step 5: Starting 5-Fold Cross-Validation...

--- Fold 1/5 ---
Model Summary (First Fold):




Epoch 1/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 71ms/step - accuracy: 0.6456 - loss: 0.8114 - val_accuracy: 0.7568 - val_loss: 0.6423
Epoch 2/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 75ms/step - accuracy: 0.8460 - loss: 0.4018 - val_accuracy: 0.7677 - val_loss: 0.6221
Epoch 3/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 75ms/step - accuracy: 0.9666 - loss: 0.1052 - val_accuracy: 0.7629 - val_loss: 0.8679
Epoch 4/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 79ms/step - accuracy: 0.9908 - loss: 0.0343 - val_accuracy: 0.7643 - val_loss: 1.0036
Epoch 5/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 78ms/step - accuracy: 0.9957 - loss: 0.0184 - val_accuracy: 0.7486 - val_loss: 1.0722
Epoch 6/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 73ms/step - accuracy: 0.9966 - loss: 0.0133 - val_accuracy: 0.7541 - val_loss: 1.0540
Epoch 7/10
[1m1



[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 76ms/step - accuracy: 0.6545 - loss: 0.7955 - val_accuracy: 0.7371 - val_loss: 0.6699
Epoch 2/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 78ms/step - accuracy: 0.8495 - loss: 0.3869 - val_accuracy: 0.7364 - val_loss: 0.6619
Epoch 3/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - accuracy: 0.9637 - loss: 0.1086 - val_accuracy: 0.7275 - val_loss: 0.8556
Epoch 4/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 75ms/step - accuracy: 0.9920 - loss: 0.0328 - val_accuracy: 0.7343 - val_loss: 1.0017
Epoch 5/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 77ms/step - accuracy: 0.9956 - loss: 0.0215 - val_accuracy: 0.7173 - val_loss: 1.0151
Epoch 6/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 77ms/step - accuracy: 0.9959 - loss: 0.0155 - val_accuracy: 0.7159 - val_loss: 1.0812
Epoch 7/10
[1m184/184[0m 



[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.6488 - loss: 0.8115 - val_accuracy: 0.7643 - val_loss: 0.6256
Epoch 2/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 73ms/step - accuracy: 0.8489 - loss: 0.3905 - val_accuracy: 0.7691 - val_loss: 0.6130
Epoch 3/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 72ms/step - accuracy: 0.9699 - loss: 0.0946 - val_accuracy: 0.7446 - val_loss: 0.7969
Epoch 4/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 80ms/step - accuracy: 0.9922 - loss: 0.0311 - val_accuracy: 0.7493 - val_loss: 0.9789
Epoch 5/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 87ms/step - accuracy: 0.9944 - loss: 0.0253 - val_accuracy: 0.7548 - val_loss: 1.0639
Epoch 6/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - accuracy: 0.9956 - loss: 0.0194 - val_accuracy: 0.7568 - val_loss: 1.0361
Epoch 7/10
[1m184/184[0m 



[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.6393 - loss: 0.8102 - val_accuracy: 0.7418 - val_loss: 0.6550
Epoch 2/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 75ms/step - accuracy: 0.8496 - loss: 0.3981 - val_accuracy: 0.7623 - val_loss: 0.5966
Epoch 3/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 50ms/step - accuracy: 0.9687 - loss: 0.0992 - val_accuracy: 0.7575 - val_loss: 0.8025
Epoch 4/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 47ms/step - accuracy: 0.9927 - loss: 0.0326 - val_accuracy: 0.7561 - val_loss: 0.9083
Epoch 5/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 1s/step - accuracy: 0.9963 - loss: 0.0169 - val_accuracy: 0.7602 - val_loss: 0.9629
Epoch 6/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 51ms/step - accuracy: 0.9966 - loss: 0.0135 - val_accuracy: 0.7554 - val_loss: 1.0279
Epoch 7/10
[1m184/184[0m [32



[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 73ms/step - accuracy: 0.6478 - loss: 0.8051 - val_accuracy: 0.7616 - val_loss: 0.6292
Epoch 2/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - accuracy: 0.8435 - loss: 0.4036 - val_accuracy: 0.7711 - val_loss: 0.6033
Epoch 3/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - accuracy: 0.9661 - loss: 0.1048 - val_accuracy: 0.7473 - val_loss: 0.7739
Epoch 4/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 74ms/step - accuracy: 0.9906 - loss: 0.0380 - val_accuracy: 0.7493 - val_loss: 0.8968
Epoch 5/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 76ms/step - accuracy: 0.9940 - loss: 0.0229 - val_accuracy: 0.7452 - val_loss: 0.9422
Epoch 6/10
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 72ms/step - accuracy: 0.9963 - loss: 0.0150 - val_accuracy: 0.7575 - val_loss: 0.9904
Epoch 7/10
[1m184/184[0m 



[1m230/230[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step

Training Complete! Results saved to Test_DataSet_Predictions.csv
Sample of predictions:
                                 id                          title  \
0  00005a3efe934a19adc0b69b05faeae7                     九江办好人民满意教育   
1  0009dc82c41341d794837271d4dbff5f  中央第三生态环境保护督察组转办我市第三十一批信访件办理情况   
2  000f3763b6074588817c9ab90a22a814               大雨天车被淹，保险公司该不该赔？   
3  00117934dbe240068c95b6d04b08eea7         英特尔新cpu微架构ocean cove曝光   
4  0014cef5ccfa43b3a6b20162a03763fa     公安部侦破一批重大网络赌博案件 德州约局平台成重灾区   

                                             content  \
0  近3年来，九江市紧紧围绕“人本教育、公平教育、优质教育、幸福教育”的目标，努力办好人民满意教...   
1  中央第三生态环境保护督察组转办我市的第三十一批信访件共计1件，截至12月11日，已全部办结，...   
2  核心提示：近日，连续的降雨天气造成中心城区部分路段积水严重，一些市民驾车出行涉水时造成车辆被...   
3  intel在主流平台上确认，今年将推出基于14nm的whiskeylake，明年开始大规模出...   
4  2018年4月，公安部指挥河南、北京、广西等地公安机关联合行动，成功侦破北京联众公司棋牌事业...   

   Predicted_Sentiment_Label  
0                          0  
1              