In [None]:
!nvidia-smi

Sun Dec 18 00:27:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    25W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import gensim
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import Counter
from torch.utils.data import TensorDataset,DataLoader
from torch.optim.lr_scheduler import *

In [None]:
from tqdm import tqdm
import warnings
# Suppress every Python warning for the rest of the session. This also hides
# deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

# Colab only: mount Google Drive so the dataset paths under
# /content/drive/MyDrive resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pretrained vectors, loaded by build_word2vec via gensim's word2vec-format reader.
Word_Vector_path = '/content/drive/MyDrive/Dataset/data.vector'
# Labelled corpora passed to load_corpus for training and validation.
Train_Ro_path = '/content/drive/MyDrive/Dataset/train_ro.txt'
Valid_Ro_path = '/content/drive/MyDrive/Dataset/valid_ro.txt'

learning_rate = 0.001  # initial learning rate handed to the Adam optimizer
BATCH_SIZE = 64  # batch size of both DataLoaders; also used when averaging the epoch loss
EPOCHS = 5  # number of train + validation passes in the main loop
model_path = None  # path of pretrain model; when set, its state_dict is loaded before training

In [None]:
def build_word2id(file, save_to_path=None, corpus_paths=None):
    """
    Build the word -> integer id vocabulary from whitespace-tokenised corpora.

    Every line is split on whitespace. The first token is skipped (load_corpus
    treats that column as the label) and each remaining token that has not
    been seen before receives the next free id. Id 0 is reserved for the
    padding token '_PAD_'.

    :param file: path of the "word<TAB>id" text file that is written when
        ``save_to_path`` is truthy.
    :param save_to_path: acts as a flag: the vocabulary is written to ``file``
        only when this is truthy. With the default (None) nothing is written.
    :param corpus_paths: iterable of corpus files to scan. Defaults to the
        train/validation files on the mounted Drive.
    :return: dict mapping each word to its id.
    """
    if corpus_paths is None:
        # NOTE(review): the training cell loads train_ro.txt / valid_ro.txt,
        # while the vocabulary defaults to train.txt / validation.txt.
        # Confirm that both pairs contain the same tokens.
        corpus_paths = ('/content/drive/MyDrive/Dataset/train.txt',
                        '/content/drive/MyDrive/Dataset/validation.txt')

    word2id = {'_PAD_': 0}
    for corpus_path in corpus_paths:
        with open(corpus_path, encoding='utf-8') as f:
            # Stream the file instead of materialising every line in memory.
            for line in f:
                for word in line.split()[1:]:
                    if word not in word2id:
                        word2id[word] = len(word2id)

    if save_to_path:
        with open(file, 'w', encoding='utf-8') as f:
            f.writelines('{}\t{}\n'.format(word, idx) for word, idx in word2id.items())

    return word2id

In [None]:
def build_word2vec(fname, word2id, save_to_path=None):
    """
    Assemble the embedding matrix for the vocabulary from pretrained vectors.

    Every row starts as uniform random values drawn from [-1, 1). Rows whose
    word exists in the pretrained model are then overwritten with that word's
    vector; all other rows keep their random initialisation.

    :param fname: word2vec-format file holding the pretrained vectors.
    :param word2id: vocabulary mapping word -> row index.
    :param save_to_path: optional text file to dump the matrix to, one
        space-separated row per line.
    :return: numpy array with (max id + 1) rows and one column per vector
        component of the pretrained model.
    """
    row_count = 1 + max(word2id.values())
    pretrained = gensim.models.KeyedVectors.load_word2vec_format(fname)
    embedding_matrix = np.random.uniform(low=-1., high=1.,
                                         size=(row_count, pretrained.vector_size))

    for token, row in word2id.items():
        try:
            embedding_matrix[row] = pretrained[token]
        except KeyError:
            # Token unknown to the pretrained model: keep its random row.
            continue

    if save_to_path:
        with open(save_to_path, 'w', encoding='utf-8') as out:
            for row_values in embedding_matrix:
                out.write(' '.join(map(str, row_values)))
                out.write('\n')

    return embedding_matrix

def cat_to_id(classes=None):
    """
    Map class labels to consecutive integer ids.

    :param classes: label names in id order; a falsy value (None or an empty
        sequence) selects the default labels '0', '1', '2'.
    :return: tuple (classes, cat2id) where cat2id maps label -> position.
    """
    labels = classes or ['0', '1', '2']
    cat2id = {}
    for position, label in enumerate(labels):
        cat2id[label] = position
    return labels, cat2id

def load_corpus(path, word2id, max_sen_len=50):
    """
    Load a labelled corpus and encode it as fixed-length id sequences.

    Each non-blank line is read as "<label> <token> <token> ...". Tokens are
    mapped through ``word2id`` (unknown tokens become id 0), then the sequence
    is truncated or right-padded with the '_PAD_' id to exactly
    ``max_sen_len`` entries. Blank lines are skipped. The total sample count
    and the per-label counts are printed.

    :param path: data path
    :param word2id: vocabulary mapping word -> id; must contain '_PAD_' when
        any sentence needs padding.
    :param max_sen_len: number of ids kept per sentence.
    :return: (contents, labels). contents is an integer array with one row of
        ``max_sen_len`` ids per sample. labels is a 1-D array of integer class
        ids taken from cat_to_id() (class indices, not one-hot vectors).
    :raises KeyError: if a label is not one of the classes known to cat_to_id().
    """
    _, cat2id = cat_to_id()
    contents, labels = [], []
    with open(path, encoding='utf-8') as f:
        # Iterate lazily: the corpora are large, so avoid readlines().
        for line in f:
            sp = line.split()
            if not sp:
                # Blank line (e.g. a trailing newline): there is no label to read.
                continue
            content = [word2id.get(w, 0) for w in sp[1:]][:max_sen_len]
            shortfall = max_sen_len - len(content)
            if shortfall > 0:
                content += [word2id['_PAD_']] * shortfall
            labels.append(sp[0])
            contents.append(content)

    counter = Counter(labels)
    print('Total sample num：%d' % (len(labels)))
    print('class num：')
    for w in counter:
        print(w, counter[w])

    contents = np.asarray(contents)
    labels = np.array([cat2id[l] for l in labels])

    return contents, labels

In [None]:
# Build the vocabulary. The path argument is only the destination used when
# build_word2id's save_to_path flag is set; it is left at its default here,
# so nothing is written to disk.
word2id = build_word2id('./Dataset/word2id.txt')
# print(word2id)
# Embedding matrix with one row per vocabulary id, filled from the pretrained
# vectors where available and random values otherwise.
word2vec = build_word2vec(Word_Vector_path, word2id)
print(word2vec.shape)

(449913, 100)


In [None]:
class CONFIG():
    """Hyper-parameters read by TextCNN.__init__; values come from the loaded embedding matrix."""
    update_w2v = True  # fine-tune the embedding weights during training (sets requires_grad)
    vocab_size = word2vec.shape[0]  # number of embedding rows, one per word2id entry
    n_class = 3  # pos neg neu (labels '0', '1', '2' in cat_to_id)
    # Derived from the loaded vectors rather than hard-coded, so the embedding
    # layer always matches the pretrained matrix it is initialised from.
    embedding_dim = word2vec.shape[1]
    # NOTE: despite the name, TextCNN passes this straight to nn.Dropout as p,
    # i.e. the probability of zeroing an activation, not of keeping it.
    drop_keep_prob = 0.5
    kernel_num = 64  # number of filters per convolution layer
    kernel_size = [3, 4, 5]  # window heights (in tokens) of the three convolutions
    pretrained_embed = word2vec  # numpy matrix copied into the embedding layer

In [None]:
class TextCNN(nn.Module):
    """
    Convolutional sentence classifier.

    Pipeline: embedding lookup -> three parallel convolutions over token
    windows -> ReLU -> max pooling over the whole sequence -> dropout ->
    linear layer -> log-softmax over the classes.
    """

    def __init__(self, config):
        """
        :param config: object exposing update_w2v, vocab_size, n_class,
            embedding_dim, kernel_num, kernel_size (at least three window
            heights), drop_keep_prob and pretrained_embed (numpy matrix).
        """
        super().__init__()
        dim = config.embedding_dim
        filters = config.kernel_num
        heights = config.kernel_size

        # Embedding table seeded with the pretrained matrix; it is only
        # trained further when config.update_w2v is true.
        self.embedding = nn.Embedding(config.vocab_size, dim)
        self.embedding.weight.data.copy_(torch.from_numpy(config.pretrained_embed))
        self.embedding.weight.requires_grad = config.update_w2v

        # One convolution per window height; every kernel spans the full
        # embedding width, so the last output dimension collapses to 1.
        self.conv1 = nn.Conv2d(1, filters, (heights[0], dim))
        self.conv2 = nn.Conv2d(1, filters, (heights[1], dim))
        self.conv3 = nn.Conv2d(1, filters, (heights[2], dim))

        # config.drop_keep_prob is handed to nn.Dropout unchanged, as its p.
        self.dropout = nn.Dropout(config.drop_keep_prob)
        self.fc = nn.Linear(len(heights) * filters, config.n_class)

    @staticmethod
    def conv_and_pool(x, conv):
        """
        Apply ``conv`` and ReLU, then max-pool over the full remaining length.

        :param x: tensor of shape (batch, 1, sentence_length, embedding_dim).
        :param conv: convolution whose kernel covers the whole embedding width.
        :return: tensor of shape (batch, kernel_num).
        """
        activated = F.relu(conv(x).squeeze(3))  # (batch, kernel_num, H_out)
        pooled = F.max_pool1d(activated, activated.size(2))  # (batch, kernel_num, 1)
        return pooled.squeeze(2)

    def forward(self, x):
        """
        :param x: (batch, sentence_length) tensor of token ids, any numeric dtype.
        :return: (batch, n_class) log-probabilities.
        """
        token_ids = x.to(torch.int64)
        embedded = self.embedding(token_ids).unsqueeze(1)  # add the channel axis
        pooled = [self.conv_and_pool(embedded, conv)
                  for conv in (self.conv1, self.conv2, self.conv3)]
        features = torch.cat(pooled, 1)  # (batch, 3 * kernel_num)
        scores = self.fc(self.dropout(features))
        return F.log_softmax(scores, dim=1)

In [None]:
def train(dataloader, epoch):
    """
    Run one training epoch over ``dataloader`` and step the LR scheduler once.

    Relies on the notebook-level globals ``model``, ``optimizer``,
    ``criterion``, ``scheduler`` and ``DEVICE``.

    :param dataloader: yields (inputs, labels) batches and exposes ``.dataset``
        (its length is used in the progress message).
    :param epoch: epoch number, used only in log messages.
    :return: (average loss per sample, accuracy as a fraction in [0, 1]).
    """
    # validation() switches the model to eval mode. Without this call every
    # epoch after the first would train with dropout disabled.
    model.train()

    loss_sum, correct, count = 0.0, 0, 0
    for batch_idx, (x, y) in enumerate(dataloader):
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the real batch
        # size. The final batch is usually smaller than the others.
        loss_sum += loss.item() * len(x)
        correct += (output.argmax(1) == y).sum().item()
        count += len(x)

        if (batch_idx + 1) % 100 == 0:
            print('train epoch: {} [{}/{} ({:.0f}%)]\tloss: {:.6f}'.format(
                epoch, batch_idx * len(x), len(dataloader.dataset),
                       100. * batch_idx / len(dataloader), loss.item()))

    # Guard against an empty dataloader instead of dividing by zero.
    train_loss = loss_sum / count if count else 0.0
    train_acc = correct / count if count else 0.0
    print('\ntrain epoch: {}\taverage loss: {:.6f}\taccuracy:{:.4f}%\n'.format(epoch, train_loss, 100. * train_acc))
    scheduler.step()

    return train_loss, train_acc

def validation(dataloader, epoch):
    """
    Evaluate the global ``model`` on ``dataloader`` without updating it.

    Relies on the notebook-level globals ``model``, ``criterion`` and
    ``DEVICE``. The model is put in eval mode for the pass and returned to
    whatever mode it was in on entry, so a following training epoch keeps
    dropout active.

    :param dataloader: yields (inputs, labels) batches.
    :param epoch: epoch number, used only in the log message.
    :return: (average loss per sample, accuracy as a fraction in [0, 1]).
    """
    was_training = model.training
    model.eval()

    loss_sum, correct, count = 0.0, 0, 0
    try:
        # No gradients are needed for evaluation; skipping graph construction
        # saves memory and time.
        with torch.no_grad():
            for x, y in dataloader:
                x, y = x.to(DEVICE), y.to(DEVICE)
                output = model(x)
                loss = criterion(output, y)
                # Weight the batch-mean loss by the real batch size, because
                # the final batch is usually smaller than the others.
                loss_sum += loss.item() * len(x)
                correct += (output.argmax(1) == y).sum().item()
                count += len(x)
    finally:
        model.train(was_training)

    # Guard against an empty dataloader instead of dividing by zero.
    val_loss = loss_sum / count if count else 0.0
    val_acc = correct / count if count else 0.0
    # print acc
    print(
        'validation:train epoch: {}\taverage loss: {:.6f}\t accuracy:{:.2f}%\n'.format(epoch, val_loss, 100 * val_acc))

    return val_loss, val_acc

In [None]:
# Encode both corpora as fixed-length (100 ids) sequences plus integer labels.
print('train set: ')
train_contents, train_labels = load_corpus(Train_Ro_path, word2id, max_sen_len=100)
print('\nvalidation set: ')
val_contents, val_labels = load_corpus(Valid_Ro_path, word2id, max_sen_len=100)

config = CONFIG()

DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Token ids are categorical indices, so keep them as int64. The previous
# float32 round-trip made an extra copy of the data and would silently
# corrupt ids above 2**24. TextCNN.forward casts its input to int64 anyway.
train_dataset = TensorDataset(torch.from_numpy(train_contents).type(torch.long),
                              torch.from_numpy(train_labels).type(torch.long))
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, num_workers=2)

val_dataset = TensorDataset(torch.from_numpy(val_contents).type(torch.long),
                            torch.from_numpy(val_labels).type(torch.long))
# Evaluation does not depend on sample order, so the validation set is not shuffled.
val_dataloader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE,
                            shuffle=False, num_workers=2)

train set: 
Total sample num：900000
class num：
2 450556
0 449369
1 75

validation set: 
Total sample num：140323
class num：
2 69880
0 70434
1 9


In [None]:
# model, continue last training
model = TextCNN(config)
if model_path:
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
    # runtime (and the other way round).
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
model.to(DEVICE)

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# loss func
# NOTE(review): TextCNN.forward already returns log-probabilities and
# CrossEntropyLoss applies log-softmax again. The result is unchanged because
# log-softmax is idempotent, but nn.NLLLoss would be the direct pairing.
criterion = nn.CrossEntropyLoss()
# The scheduler is stepped once per epoch in train(). With step_size=5 the
# first decay only takes effect after the fifth epoch.
scheduler = StepLR(optimizer, step_size=5)

In [None]:
# Per-epoch history of the metrics returned by train() and validation().
train_losses = []
train_acces = []
val_losses = []
val_acces = []

# Epochs are numbered from 1 so the log messages read naturally.
for epoch in range(1, EPOCHS + 1):
    tr_loss, tr_acc = train(train_dataloader, epoch)
    val_loss, val_acc = validation(val_dataloader, epoch)
    train_losses.append(tr_loss)
    train_acces.append(tr_acc)
    val_losses.append(val_loss)
    val_acces.append(val_acc)

# Save the weights of the final epoch only, under a timestamped name in the
# current working directory.
# NOTE(review): in the logged run validation loss is lowest at epoch 2, so the
# last checkpoint is not the best one. Consider saving on improvement instead.
model_pth = 'model_' + str(time.time()) + '.pth'
torch.save(model.state_dict(), model_pth)


train epoch: 1	average loss: 0.455859	accuracy:78.9667%

validation:train epoch: 1	average loss: 0.416297	 accuracy:81.40%


train epoch: 2	average loss: 0.383501	accuracy:82.9224%

validation:train epoch: 2	average loss: 0.404445	 accuracy:81.90%


train epoch: 3	average loss: 0.344254	accuracy:85.0419%

validation:train epoch: 3	average loss: 0.413093	 accuracy:81.49%


train epoch: 4	average loss: 0.304907	accuracy:87.0583%

validation:train epoch: 4	average loss: 0.424910	 accuracy:81.67%


train epoch: 5	average loss: 0.264465	accuracy:88.9927%

validation:train epoch: 5	average loss: 0.460834	 accuracy:81.07%

