# Word2Vec

示例

In [1]:
from gensim.models import Word2Vec
from utils import *

train_data = read_train_data()
test_data = read_test_data()

model = Word2Vec(sentences=[['1', '2', '3'], ['2', '3', '4']], vector_size=100, window=5, min_count=1, workers=4)
model.wv.most_similar('1', topn=10)


[('2', 0.06797593086957932),
 ('4', -0.013514931313693523),
 ('3', -0.1116705983877182)]

使用真实数据训练Word2Vec模型

In [2]:
model = Word2Vec(sentences=list(train_data['content']), vector_size=100, window=5, min_count=1, workers=12)

# 分类模型

In [3]:
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
import numpy as np

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_size, output_size, embedding_matrix, kernel_sizes=[3, 4, 5], num_filters=64):
        super(TextCNN, self).__init__()

        # 嵌入层
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False

        # 卷积层
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filters, (k, embed_size)) for k in kernel_sizes])

        # 全连接层
        self.fc = nn.Linear(num_filters * len(kernel_sizes), output_size)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1) # [batch, 1, seq_len, embed_size]
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] # [batch, num_filters, seq_len-k+1]
        x = [F.max_pool1d(line, line.size(2)).squeeze(2) for line in x] # [batch, num_filters]
        x = torch.cat(x, 1) # [batch, num_filters * len(kernel_sizes)]
        logits = self.fc(x)
        return logits

class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_classes, embedding_matrix):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False # 使用预训练的词向量
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        avg_pool = torch.mean(lstm_out, 1)
        output = self.fc(avg_pool)
        return output



# 准备数据，创建dataloader

In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import functional as F
import numpy as np

# 创建词汇表
vocab = {word: index for index, word in enumerate(model.wv.index_to_key)}
vocab_size = len(vocab)

# 创建嵌入矩阵
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in vocab.items():
    embedding_matrix[index] = model.wv[word]

# 文本转换为索引
def text_to_index(texts):
    indices = []
    for text in texts:
        index_list = [vocab[word] for word in text if word in vocab]
        indices.append(index_list)
    return indices

# 对文本和标签进行处理
x_train = text_to_index(train_data['content'])
y_train = train_data['label']

# pad sequences
def pad_sequences(sequences, maxlen=None, padding='pre', truncating='pre', value=0.):

    lengths = [len(s) for s in sequences]
    nb_samples = len(sequences)

    if maxlen is None:
        maxlen = np.max(lengths)

    x = (np.ones((nb_samples, maxlen)) * value).astype(np.int32)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty list was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError("Truncating type '%s' not understood" % padding)

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError("Padding type '%s' not understood" % padding)

    return x

x_train = pad_sequences(x_train, maxlen=214, padding='post', truncating='post')


x_train_tensor = torch.tensor(x_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

# 在训练集上进行五折交叉验证

In [6]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm


output_size = len(set(y_train))
embed_size = 100
patience = 5
# for BiLSTM
hidden_size = 256

# Use GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

kf = KFold(n_splits=5, shuffle=True, random_state=42)

acc_list = []
for fold, (train_index, val_index) in enumerate(kf.split(x_train_tensor)):
    print(f"Fold {fold + 1}")
    x_train_fold, x_val_fold = x_train_tensor[train_index], x_train_tensor[val_index]
    y_train_fold, y_val_fold = y_train_tensor[train_index], y_train_tensor[val_index]

    train_dataset = TensorDataset(x_train_fold, y_train_fold)
    val_dataset = TensorDataset(x_val_fold, y_val_fold)

    train_loader = DataLoader(train_dataset, batch_size=128)
    val_loader = DataLoader(val_dataset, batch_size=128)

    # 定义和训练模型
    # 1.TextCNN
    # 2.BiLSTM

    # model = TextCNN(vocab_size, embed_size, output_size, embedding_matrix).to(device)


    model = BiLSTMClassifier(vocab_size, embed_size, hidden_size, output_size, embedding_matrix).to(device)
    
    loss_function = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    best_val_loss = float('inf')
    early_stopping_counter = 0

    for epoch in tqdm(range(100)): # 使用tqdm库进行进度展示
        model.train()
        for batch_text, batch_labels in train_loader:
            # 将数据放到GPU上
            batch_text, batch_labels = batch_text.to(device), batch_labels.to(device) 
            optimizer.zero_grad()
            logits = model(batch_text)
            loss = loss_function(logits, batch_labels)
            loss.backward()
            optimizer.step()
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_text, batch_labels in val_loader:
                batch_text, batch_labels = batch_text.to(device), batch_labels.to(device)
                logits = model(batch_text)
                loss = loss_function(logits, batch_labels)
                val_loss += loss.item()
        val_loss /= len(val_loader)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1}")
                break

    # 验证模型
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_text, batch_labels in val_loader:
            batch_text, batch_labels = batch_text.to(device), batch_labels.to(device)
            logits = model(batch_text)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    # 使用classification_report输出结果, 小数点后四位
    print(classification_report(all_labels, all_preds, digits=4))
    acc = accuracy_score(all_labels, all_preds)
    acc_list.append(acc)
    
# print mean accuracy
print(f"Mean accuracy: {np.mean(acc_list)}")

Fold 1


  0%|          | 0/100 [00:00<?, ?it/s]

 17%|█▋        | 17/100 [00:54<04:27,  3.23s/it]

Early stopping at epoch 18





              precision    recall  f1-score   support

           0     0.9912    0.9979    0.9945      2357
           1     0.9883    0.9526    0.9701       443

    accuracy                         0.9907      2800
   macro avg     0.9897    0.9752    0.9823      2800
weighted avg     0.9907    0.9907    0.9906      2800

Fold 2


 10%|█         | 10/100 [00:33<05:04,  3.39s/it]

Early stopping at epoch 11





              precision    recall  f1-score   support

           0     0.9840    1.0000    0.9919      2396
           1     1.0000    0.9035    0.9493       404

    accuracy                         0.9861      2800
   macro avg     0.9920    0.9517    0.9706      2800
weighted avg     0.9863    0.9861    0.9858      2800

Fold 3


 10%|█         | 10/100 [00:33<05:04,  3.39s/it]

Early stopping at epoch 11





              precision    recall  f1-score   support

           0     0.9886    0.9983    0.9934      2348
           1     0.9907    0.9403    0.9648       452

    accuracy                         0.9889      2800
   macro avg     0.9896    0.9693    0.9791      2800
weighted avg     0.9889    0.9889    0.9888      2800

Fold 4


 18%|█▊        | 18/100 [00:58<04:24,  3.23s/it]

Early stopping at epoch 19





              precision    recall  f1-score   support

           0     0.9949    0.9966    0.9958      2362
           1     0.9816    0.9726    0.9771       438

    accuracy                         0.9929      2800
   macro avg     0.9882    0.9846    0.9864      2800
weighted avg     0.9928    0.9929    0.9928      2800

Fold 5


 15%|█▌        | 15/100 [00:48<04:36,  3.26s/it]

Early stopping at epoch 16





              precision    recall  f1-score   support

           0     0.9887    0.9992    0.9939      2373
           1     0.9950    0.9368    0.9650       427

    accuracy                         0.9896      2800
   macro avg     0.9919    0.9680    0.9795      2800
weighted avg     0.9897    0.9896    0.9895      2800

Mean accuracy: 0.9896428571428573


# 使用全部训练集训练模型

In [9]:
# 设定超参数
output_size = len(set(y_train))
embed_size = 100
patience = 5

hidden_size = 256

# Use GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 创建数据加载器

train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128)

# 实例化模型
# model = TextCNN(vocab_size, embed_size, output_size).to(device)
model = BiLSTMClassifier(vocab_size, embed_size, hidden_size, output_size, embedding_matrix).to(device)

# 损失函数和优化器
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 训练模型
for epoch in range(12):

    for batch_text, batch_labels in train_loader:
        optimizer.zero_grad()
        logits = model(batch_text.to(device))
        loss = loss_function(logits, batch_labels.to(device))
        loss.backward()
        optimizer.step()
    

    print(f'Epoch {epoch} | Loss: {loss.item()}')


Epoch 0 | Loss: 0.19075505435466766
Epoch 1 | Loss: 0.08851882815361023
Epoch 2 | Loss: 0.06531248986721039
Epoch 3 | Loss: 0.033402927219867706
Epoch 4 | Loss: 0.0113508440554142
Epoch 5 | Loss: 0.03233465924859047
Epoch 6 | Loss: 0.0040865130722522736
Epoch 7 | Loss: 0.007329711690545082
Epoch 8 | Loss: 0.008489049039781094
Epoch 9 | Loss: 0.007022665347903967
Epoch 10 | Loss: 0.0017358050681650639
Epoch 11 | Loss: 0.0006290804012678564


# 推理

In [10]:
x_test = text_to_index(test_data['content'])
x_test = pad_sequences(x_test, maxlen=214, padding='post', truncating='post')
x_test_tensor = torch.tensor(x_test, dtype=torch.long).to(device)

test_dataset = TensorDataset(x_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

model.eval()
test_pred = []
with torch.no_grad():
    for x_batch in test_loader:
        x_batch = x_batch[0].to(device)
        logits = model(x_batch)
        y_pred = torch.argmax(logits, dim=1)
        test_pred.extend(y_pred.cpu().numpy())

test_data['label'] = test_pred
test_data[['name', 'label']].to_csv('bilstm.csv', index=False)