# text classification step 1: CNN
## 论文：https://arxiv.org/pdf/1408.5882.pdf
## 原文代码：https://github.com/dennybritz/cnn-text-classification-tf

In [1]:
!pip install -U tqdm
!pip install -U nltk
!pip install -U spacy
!pip install -U numpy
!pip install -U pandas
!pip install -U sklearn
!pip install -U torch
!pip install -U torchtext
!python -m spacy download en_core_web_md

Requirement already up-to-date: tqdm in /opt/anaconda3/lib/python3.7/site-packages (4.48.2)
Requirement already up-to-date: nltk in /opt/anaconda3/lib/python3.7/site-packages (3.5)
Requirement already up-to-date: spacy in /opt/anaconda3/lib/python3.7/site-packages (2.3.2)
Requirement already up-to-date: numpy in /opt/anaconda3/lib/python3.7/site-packages (1.19.1)
Requirement already up-to-date: pandas in /opt/anaconda3/lib/python3.7/site-packages (1.1.0)
Requirement already up-to-date: sklearn in /opt/anaconda3/lib/python3.7/site-packages (0.0)
Requirement already up-to-date: torch in /opt/anaconda3/lib/python3.7/site-packages (1.6.0)
Requirement already up-to-date: torchtext in /opt/anaconda3/lib/python3.7/site-packages (0.7.0)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


## 预处理

In [2]:
import pandas as pd

data = pd.read_csv('./IMDB Dataset.csv')
print(len(data))
print(data.head())

50000
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
from tqdm import tqdm
import nltk

from nltk.stem import WordNetLemmatizer
# nltk.download()
lemmatizer = WordNetLemmatizer()

import re, spacy
nlp = spacy.load('en_core_web_md')

In [4]:
processed_review = []
sentiment = []
word2id = {'<PAD>':0}
# id2word = {0:'<PAD>'}
vocab = set(['<PAD>'])
count = 1
SEQ_LEN = 100

for i in tqdm(range(len(data))):
    text = data.review[i].lower()
    text = re.sub('<.+?>', '', text)
    text = re.sub('[<>]', '', text)
    text = [lemmatizer.lemmatize(token.text) for token in nlp.tokenizer(text)][:SEQ_LEN]

    tmp = [0] * (SEQ_LEN - len(text)) if len(text) < SEQ_LEN else []
        
    for word in text:
        if word not in vocab:
            vocab.add(word)
            word2id[word] = count
            tmp.append(count)
            count += 1
        else:
            tmp.append(word2id[word])

    processed_review.append(tmp)
    
    if data.sentiment[i] == 'positive':
        sentiment.append(1)
    elif data.sentiment[i] == 'negative':
        sentiment.append(0)
    
# id2word = {val:key for key, val in word2id.items()}
print(processed_review[0], sentiment[0])

100%|██████████| 50000/50000 [01:30<00:00, 552.77it/s]

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 8, 34, 35, 36, 13, 37, 38, 39, 40, 41, 42, 2, 43, 23, 44, 45, 46, 22, 47, 3, 48, 49, 19, 50, 35, 23, 25, 26, 51, 24, 52, 53, 3, 54, 55, 56, 57, 19, 25, 52, 58, 59, 60, 30, 61, 62, 63, 23, 64, 56, 43, 19, 38, 26, 65, 23, 46, 3, 66, 67, 2, 3, 68, 26, 69, 13, 24] 1





In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_review, sentiment, train_size=0.5, random_state=1988)
# X_train, X_val, y_train, y_val = train_test_split(train, label, train_size=0.9, random_state=1988)

# train = pd.DataFrame({'review':X_train, 'sentiment': y_train})
# val = pd.DataFrame({'review':X_val, 'sentiment':y_val})
# test = pd.DataFrame({'review':X_test, 'sentiment':y_test})

# train.to_csv('/Users/wenjiazhai/Google Drive/Colab Notebooks/datasets/IMBD train.csv')
# val.to_csv('/Users/wenjiazhai/Google Drive/Colab Notebooks/datasets/IMDB val.csv')
# test.to_csv('/Users/wenjiazhai/Google Drive/Colab Notebooks/datasets/IMDB test.csv')

In [6]:
from torch.utils.data import TensorDataset, DataLoader
import torch

BATCH_SIZE = 64

train_ds = TensorDataset(torch.as_tensor(X_train), torch.as_tensor(y_train))
test_ds = TensorDataset(torch.as_tensor(X_test), torch.as_tensor(y_test))

train_iter = DataLoader(train_ds, batch_size=BATCH_SIZE, drop_last=True) # (BATCH_SIZE, SEQ_LEN)
test_iter = DataLoader(test_ds, batch_size=BATCH_SIZE, drop_last=True)

## 建模

In [7]:
from torch import nn, optim
from torch.nn import functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embed_size, dropout, batch_size):
        super(CNN, self).__init__()
        self.batch_size = batch_size
        
        self.embedding = nn.Embedding(vocab_size, embed_size) # (BATCH_SIZE, SEQ_LEN, embed_size)
        
        self.conv1 = nn.Conv2d(1, 1, 3)
        self.conv2 = nn.Conv2d(1, 1, 4)
        self.conv3 = nn.Conv2d(1, 1, 5)

        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(2232, 1)

    def forward(self, x):
        x = self.embedding(x)
        x.unsqueeze_(1) # (BATCH_SIZE, 1, SEQ_LEN, embed_size)
        output1 = self.conv1(x)
        output1 = F.max_pool2d(F.relu(output1), 4)
        
        output2 = self.conv2(x)
        output2 = F.max_pool2d(F.relu(output2), 4)
        
        output3 = self.conv3(x)
        output3 = F.max_pool2d(F.relu(output3), 4)
        output = torch.cat([output1, output2, output3], axis=1)
        output = self.dropout(output)

        return self.fc(output.view(self.batch_size, -1))

In [8]:
EMBED_SIZE = 128
DROPOUT = 0.5
model = CNN(len(vocab), EMBED_SIZE, DROPOUT, BATCH_SIZE)

In [9]:
model

CNN(
  (embedding): Embedding(90171, 128)
  (conv1): Conv2d(1, 1, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(1, 1, kernel_size=(4, 4), stride=(1, 1))
  (conv3): Conv2d(1, 1, kernel_size=(5, 5), stride=(1, 1))
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=2232, out_features=1, bias=True)
)

## 训练

In [10]:
from torch import optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [11]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [12]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for text, label in iterator:
        optimizer.zero_grad()
        preds = model(text)
        loss = criterion(preds.squeeze(), label.float())
        acc = binary_accuracy(preds.squeeze(), label)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [13]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for text, label in iterator:
            preds = model(text)
            loss = criterion(preds.squeeze(), label.float())
            acc = binary_accuracy(preds.squeeze(), label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [14]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - elapsed_mins * 60)
    return elapsed_mins, elapsed_secs

In [15]:
N_EPOCHS = 10

best_test_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iter, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), 'CNN-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {test_loss:.3f} |  Val. Acc: {test_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 2m 42s
	Train Loss: 0.716 | Train Acc: 50.22%
	 Val. Loss: 0.690 |  Val. Acc: 54.13%
Epoch: 02 | Epoch Time: 2m 38s
	Train Loss: 0.686 | Train Acc: 55.16%
	 Val. Loss: 0.680 |  Val. Acc: 58.40%
Epoch: 03 | Epoch Time: 2m 41s
	Train Loss: 0.652 | Train Acc: 61.60%
	 Val. Loss: 0.624 |  Val. Acc: 65.94%
Epoch: 04 | Epoch Time: 2m 45s
	Train Loss: 0.562 | Train Acc: 71.01%
	 Val. Loss: 0.535 |  Val. Acc: 73.87%
Epoch: 05 | Epoch Time: 2m 45s
	Train Loss: 0.471 | Train Acc: 77.22%
	 Val. Loss: 0.482 |  Val. Acc: 77.02%
Epoch: 06 | Epoch Time: 2m 46s
	Train Loss: 0.392 | Train Acc: 82.14%
	 Val. Loss: 0.458 |  Val. Acc: 78.33%
Epoch: 07 | Epoch Time: 2m 48s
	Train Loss: 0.336 | Train Acc: 85.34%
	 Val. Loss: 0.448 |  Val. Acc: 79.14%
Epoch: 08 | Epoch Time: 2m 57s
	Train Loss: 0.294 | Train Acc: 87.58%
	 Val. Loss: 0.444 |  Val. Acc: 79.80%
Epoch: 09 | Epoch Time: 2m 49s
	Train Loss: 0.248 | Train Acc: 89.98%
	 Val. Loss: 0.448 |  Val. Acc: 80.29%
Epoch: 10 | Epoch T