In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random

In [2]:
# Load a sample csv file test.csv, and use class TabularDataset to generate a torchtext.data.Dataset
file = "E:\Text_Classification\data\Yelp"
TEXT = data.Field(tokenize='spacy')
LABEL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)
# LABEL = data.LabelField(dtype=torch.int8)
tv_dataFields = [("label", LABEL), ("text", TEXT)]
trn = data.TabularDataset.splits(path=file, train='yelp.test.cleaned.csv',\
                                     format='csv',\
                                     skip_header=True, fields=tv_dataFields)

In [3]:
# Split the dataset into train and valid part, no need to do it manually
trn, val = trn[0].split(random_state=random.seed(1234))

In [5]:
# See the format of our dataset, truncate and show the first 10 tokens
# val[0].__dict__
print(val[0].__dict__['label'])
print(val[0].__dict__['text'][:10])

1
['before', 'getting', 'started', ',', 'i', "'d", 'like', 'to', 'point', 'out']


In [6]:
# TODO: word2vec, load glove
TEXT.build_vocab(trn, max_size=25000, vectors="glove.6B.100d")

In [7]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# generate iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (trn, val), 
    batch_size=BATCH_SIZE, 
    device=device,
    sort_key=lambda x: len(x.text), 
    # BucketIterator 依据什么对数据分组
    sort_within_batch=True)

In [8]:
# check the item in iterator
# for item in valid_iterator:
#     print(item)
#     print(item.__dict__.keys())
#     print(item.text)
#     print(item.label)
#     break

In [9]:
# define the CNN network for text classification, same to the tutorial
# https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/4%20-%20Convolutional%20Sentiment%20Analysis.ipynb
import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs,embedding_dim)) for fs in filter_sizes])
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        x = x.permute(1, 0)
                
        #x = [batch size, sent len]
        
        embedded = self.embedding(x)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim=1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [10]:
# Set the hyper parameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

In [11]:
pretrained_embeddings = TEXT.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3794,  0.2277,  0.2154,  ...,  0.1869,  0.4014,  0.2115],
        [-1.3586,  1.0965,  0.9138,  ..., -0.9487,  0.8147,  0.4231]])

In [12]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = rounded_preds == y #convert into float for division 
    acc = correct.sum().float()/len(correct)
    return acc

In [13]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
#         print(predictions, type(predictions))
#         print(batch.label, type(batch.label))
#         break
        
        loss = criterion(predictions, batch.label)
        # loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [14]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
N_EPOCHS = 10
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

# Train and save model
for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% |')
    filepath = 'Yelp-{}.ckpt'.format(epoch + 1)
    torch.save(model.state_dict(), filepath)

| Epoch: 01 | Train Loss: 0.370 | Train Acc: 82.73% | Val. Loss: 0.220 | Val. Acc: 91.00% |
| Epoch: 02 | Train Loss: 0.199 | Train Acc: 92.13% | Val. Loss: 0.185 | Val. Acc: 92.74% |
| Epoch: 03 | Train Loss: 0.145 | Train Acc: 94.41% | Val. Loss: 0.190 | Val. Acc: 92.40% |
| Epoch: 04 | Train Loss: 0.106 | Train Acc: 96.18% | Val. Loss: 0.191 | Val. Acc: 92.67% |
| Epoch: 05 | Train Loss: 0.069 | Train Acc: 97.72% | Val. Loss: 0.244 | Val. Acc: 91.71% |
| Epoch: 06 | Train Loss: 0.048 | Train Acc: 98.34% | Val. Loss: 0.222 | Val. Acc: 92.57% |
| Epoch: 07 | Train Loss: 0.036 | Train Acc: 98.87% | Val. Loss: 0.267 | Val. Acc: 91.99% |
| Epoch: 08 | Train Loss: 0.027 | Train Acc: 99.19% | Val. Loss: 0.293 | Val. Acc: 91.62% |
| Epoch: 09 | Train Loss: 0.022 | Train Acc: 99.32% | Val. Loss: 0.283 | Val. Acc: 92.14% |
| Epoch: 10 | Train Loss: 0.017 | Train Acc: 99.50% | Val. Loss: 0.350 | Val. Acc: 91.83% |


In [17]:
import spacy
nlp = spacy.load('en')

# load the best model and evaluate on the input sentence
model.load_state_dict(torch.load('Yelp-9.ckpt'))
model = model.to(device)
def predict_sentiment(sentence, min_len=5):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    # Note: here we call model(tensor) successfully!
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [20]:
# change it to {0, 1} ? 
predict_sentiment("This film is really great")

0.9750530123710632

In [21]:
# remove the fc layer from original model
modules = list(model.children())[:-2] 
modules.append(list(model.children())[-1])
new_model = nn.Sequential(*modules)
print(new_model)

Sequential(
  (0): Embedding(25002, 100)
  (1): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (2): Dropout(p=0.5)
)


In [22]:
# or wrap the new_model into a class and define the forward 
class PretrainedCNN(nn.Module):
    def __init__(self, seq):
        super().__init__()  
        self.seq = seq
        
    def forward(self, x):  
        #x = [sent len, batch size]        
        x = x.permute(1, 0)            
        return self.seq(x)
new_module = PretrainedCNN(new_model)
print(new_module)

PretrainedCNN(
  (seq): Sequential(
    (0): Embedding(25002, 100)
    (1): ModuleList(
      (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
      (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
      (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
    )
    (2): Dropout(p=0.5)
  )
)


In [23]:
# We hope the get the output vector in the fc layer
def get_vec(sentence, min_len=5):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    # But failed here because of NotImplementedError
    return new_model(tensor)

get_vec("This film is really great")
# print(list(new_model.seq[1][0].parameters()))

NotImplementedError: 

In [24]:
# Then we try the module class (wrapper for nn.sequential)
def get_vec(sentence, min_len=5):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    # Failed agian
    return new_module(tensor)

get_vec("This film is really great")

NotImplementedError: 