## CS310 Natural Language Processing
## Assignment 1. Neural Text Classification

**Total points**: 50

You should roughly follow the structure of the notebook. Add additional cells if you feel they are needed.

You can (and you should) re-use the code from Lab 2. 

Make sure your code is readable and well-structured.

### 0. Import Necessary Libraries

In [1]:
import json
import torch
from datasets import load_dataset

from torch.utils.data import DataLoader
from torch.utils.data import dataset

  from .autonotebook import tqdm as notebook_tqdm


### 1. Data Processing

In [2]:
# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load each JSON-lines file as a single split and drop every column other than
# 'sentence' and 'label', which are the two fields read by the cells below.
train_dataset = load_dataset('json', data_files='train.jsonl',split='train')
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in ['sentence', 'label']])
test_dataset = load_dataset('json', data_files={'test': 'test.jsonl'}, split='test')
test_dataset = test_dataset.remove_columns([col for col in test_dataset.column_names if col not in ['sentence', 'label']])


def convert_label(example):
    """Replace the list-valued ``label`` of ``example`` with its first element.

    The mapping is modified in place and the same object is returned, which
    is the shape ``Dataset.map`` expects from its callback.
    """
    # Keep only the first element of the label list.
    first_label = example['label'][0]
    example['label'] = first_label
    return example

# Flatten the list-valued labels of both splits and keep one iterator per split.
train_dataset = train_dataset.map(convert_label)
train_iter = iter(train_dataset)

test_dataset = test_dataset.map(convert_label)
test_iter = iter(test_dataset)

# Preview the first eight training examples.
count = 0
for count, item in enumerate(train_iter, start=1):
    print(item)
    if count > 7:
        break

{'sentence': '卖油条小刘说：我说', 'label': 0}
{'sentence': '保姆小张说：干啥子嘛？', 'label': 0}
{'sentence': '卖油条小刘说：你看你往星空看月朦胧，鸟朦胧', 'label': 1}
{'sentence': '卖油条小刘说：咱是不是歇一下这双，疲惫的双腿？', 'label': 0}
{'sentence': '卖油条小刘说：快把我累死了', 'label': 0}
{'sentence': '卖油条小刘说：我说亲爱的大姐你贵姓啊？', 'label': 1}
{'sentence': '保姆小张说：我免贵姓张我叫张凤姑', 'label': 0}
{'sentence': '卖油条小刘说：凤姑', 'label': 0}


In [3]:
import re
import jieba

def basic_chinese_tokenizer(text):
    """Return every character of ``text`` in the range U+4E00..U+9FFF, in order.

    Each matching character becomes its own token; everything else
    (digits, Latin letters, punctuation, whitespace) is discarded.
    """
    return [match.group() for match in re.finditer(r'[\u4e00-\u9fff]', text)]

def improved_chinese_tokenizer(text):
    """Split ``text`` into Chinese characters, digit runs, ASCII words and punctuation runs.

    Text that matches none of the four classes (whitespace, symbols such as
    '@') is dropped. Tokens are returned in their order of appearance.
    """
    alternatives = (
        r'[\u4e00-\u9fff]',  # one character in U+4E00..U+9FFF per token
        r'\d+',  # a run of digits
        r'[a-zA-Z]+',  # a run of ASCII letters
        r'[，。！？、：；（）【】《》“”‘’.,;:!?()\[\]{}]+',  # a run of punctuation marks
    )
    # A single capturing group around the alternation: group 1 is the whole token.
    combined_pattern = "(" + "|".join(alternatives) + ")"
    return [match.group(1) for match in re.finditer(combined_pattern, text)]

def jieba_chinese_tokenizer(text):
    """Segment ``text`` with ``jieba.lcut`` and return the resulting token list."""
    return jieba.lcut(text)

# Active tokenizer for this part of the notebook; the alternatives are kept
# commented out so that they can be swapped in.
# tokenizer = basic_chinese_tokenizer
tokenizer = improved_chinese_tokenizer
# tokenizer = jieba_chinese_tokenizer

def yield_tokens(data_iter):
    """Yield the token list of each example's 'sentence' field.

    ``data_iter`` is iterated one example (a mapping with a 'sentence' key)
    at a time; the module-level ``tokenizer`` is looked up on every item.
    """
    for example in data_iter:
        sentence = example['sentence']
        yield tokenizer(sentence)

In [4]:
# Show the tokenization of the first four training sentences.
count = 0
for count, tokens in enumerate(yield_tokens(iter(train_dataset)), start=1):  # Use a new iterator
    print(tokens)
    if count > 3:
        break

['卖', '油', '条', '小', '刘', '说', '：', '我', '说']
['保', '姆', '小', '张', '说', '：', '干', '啥', '子', '嘛', '？']
['卖', '油', '条', '小', '刘', '说', '：', '你', '看', '你', '往', '星', '空', '看', '月', '朦', '胧', '，', '鸟', '朦', '胧']
['卖', '油', '条', '小', '刘', '说', '：', '咱', '是', '不', '是', '歇', '一', '下', '这', '双', '，', '疲', '惫', '的', '双', '腿', '？']


In [5]:
# build vocabulary
from torchtext.vocab import build_vocab_from_iterator

# Build the token -> index mapping from the training sentences only, with
# "<unk>" registered as a special token. Out-of-vocabulary lookups fall back
# to the "<unk>" index instead of raising.
vocab = build_vocab_from_iterator(yield_tokens(iter(train_dataset)), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])



In [6]:
# Check the vocab
# Single characters that occur in the training sentences.
print(vocab(['刘', '说', '你', '看']))
print(vocab(['刘', '说', '咱', '是', '不', '是', '歇', '一', '下']))
# English words do not occur in the corpus, so they map to the "<unk>" index.
print(vocab(['of', 'saucy']))

print("Vocabulary size:", len(vocab))
# Mixed lookups: known characters, punctuation, and tokens expected to be unknown ('@', '123').
print(vocab(['你', '好', '世','界', '！','。','@']))
print(vocab(['卖', '油', '条','鸡', '！','。','@','123']))

[423, 1, 4, 55]
[423, 1, 70, 7, 5, 7, 908, 18, 75]
[0, 0]
Vocabulary size: 2840
[4, 34, 407, 501, 71, 301, 0]
[473, 460, 282, 895, 71, 301, 0, 0]


In [7]:
def text_pipeline(x):
    """Tokenize the raw string ``x`` and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Coerce the raw label ``x`` to ``int``."""
    return int(x)


# Test text_pipeline()
tokens = text_pipeline('这是一个示例')
print(tokens)

# Test label_pipeline()
lbl = label_pipeline('1')
print(lbl)

[10, 7, 18, 26, 914, 1453]
1


In [8]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Collate a list of examples into the flat layout used with ``nn.EmbeddingBag``.

    Args:
        batch: sequence of mappings with 'label' and 'sentence' keys.

    Returns:
        ``(labels, token_ids, offsets)``, all moved to ``device``:
        ``labels`` holds one int64 label per example, ``token_ids`` is the
        concatenation of every example's token indices, and ``offsets[i]`` is
        the position in ``token_ids`` where example ``i`` starts.
    """
    # One pass over the batch: (label, 1-D tensor of token indices) per example.
    encoded = [
        (
            label_pipeline(example['label']),
            torch.tensor(text_pipeline(example['sentence']), dtype=torch.int64),
        )
        for example in batch
    ]
    label_list = [label for label, _ in encoded]
    token_ids_list = [ids for _, ids in encoded]

    # Start positions are the cumulative sum of the lengths, shifted by one:
    # prepend 0 and drop the last length before accumulating.
    lengths = [0] + [ids.size(0) for ids in token_ids_list]

    labels = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)
    token_ids = torch.cat(token_ids_list)

    return labels.to(device), token_ids.to(device), offsets.to(device)

In [9]:
# Use collate_batch to generate the dataloader.
# Unshuffled batches of 8, so that the inspection cells see the first 8 examples.
dataloader = DataLoader(
    train_dataset, batch_size=8, shuffle=False, collate_fn=collate_batch
)

In [10]:
# Test the dataloader: print the first batch only.
for i, (labels, token_ids, offsets) in enumerate(dataloader):
    print(f"batch {i} label: {labels}")
    print(f"batch {i} text: {token_ids}")
    print(f"batch {i} offsets: {offsets}")
    if i == 0:
        break

# What does offsets mean?
# offsets[k] is the index in the flat token_ids tensor where example k starts,
# so example k spans token_ids[offsets[k]:offsets[k + 1]] and the last example
# runs to the end. The hard-coded index 7 relies on the batch size of 8.
print('Number of tokens in this batch: ', token_ids.size(0))
print('Number of examples in one batch: ', labels.size(0))
print('Example 0: ', token_ids[offsets[0]:offsets[1]])
print('Example 7: ', token_ids[offsets[7]:])

# You are expected to see the following output:
# NOTE(review): these reference values do not match the output recorded for
# this dataset; they look like leftovers from the lab template.
# Number of tokens in this batch:  82
# Number of examples in one batch:  8
# Example 0:  tensor([ 4579,    92, 13266,    38,     1,  7742, 10000])
# Example 7:  tensor([   5, 7100])

batch 0 label: tensor([0, 0, 1, 0, 0, 1, 0, 0])
batch 0 text: tensor([ 473,  460,  282,   23,  423,    1,    2,    3,    1,   73,   83,   23,
         113,    1,    2,   98,  483,   46,   59,   33,  473,  460,  282,   23,
         423,    1,    2,    4,   55,    4,  305,  761,  870,   55,  494, 2151,
        2230,    6, 1217, 2151, 2230,  473,  460,  282,   23,  423,    1,    2,
          70,    7,    5,    7,  908,   18,   75,   10,  876,    6, 2201, 2511,
           8,  876, 1133,   33,  473,  460,  282,   23,  423,    1,    2,  187,
          86,    3,  609,  183,    9,  473,  460,  282,   23,  423,    1,    2,
           3,    1,  308,  164,    8,   32,  141,    4,  688,  453,   21,   33,
          73,   83,   23,  113,    1,    2,    3,  820,  688,  453,  113,    3,
         149,  113, 1186,  221,  473,  460,  282,   23,  423,    1,    2, 1186,
         221])
batch 0 offsets: tensor([  0,   9,  20,  41,  64,  77,  96, 112])
Number of tokens in this batch:  121
Number of examples i

### 2. Build the Model

In [11]:
from torch import nn

class TextClassificationModel(nn.Module):
    """Bag-of-embeddings classifier.

    Token indices are pooled per example by an ``nn.EmbeddingBag``, passed
    through two ReLU-activated linear layers, and projected to ``num_class``
    logits by a final linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding dimension.
        num_class: number of output logits.
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
    """

    def __init__(self, vocab_size, embed_dim, num_class, hidden_dim1, hidden_dim2):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.hidden_layers = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim1),
            nn.ReLU(),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(),
        )
        self.fc = nn.Linear(hidden_dim2, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw all weights from U(-0.5, 0.5) and zero every linear bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        # Hidden linear layers first, then the output layer.
        linear_layers = [m for m in self.hidden_layers if isinstance(m, nn.Linear)]
        linear_layers.append(self.fc)
        for linear in linear_layers:
            nn.init.uniform_(linear.weight, -bound, bound)
            nn.init.zeros_(linear.bias)

    def forward(self, token_ids, offsets):
        """Return the logits for a flat ``token_ids`` tensor split into bags by ``offsets``."""
        pooled = self.embedding(token_ids, offsets)
        hidden = self.hidden_layers(pooled)
        return self.fc(hidden)

In [12]:

# Build the model
# The output layer is sized by the number of distinct labels in the training data.
train_iter = iter(train_dataset)
num_class = len({example['label'] for example in train_iter})
vocab_size = len(vocab)
emsize = 64  # embedding size
hidden_dim1, hidden_dim2 = 16, 8
model = TextClassificationModel(vocab_size, emsize, num_class, hidden_dim1, hidden_dim2).to(device)

In [13]:
# Test the model: run the untrained model on the first batch to check shapes.
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(dataloader):
        output = model(token_ids, offsets)
        # print(f"batch {i} output: {output}")
        if i == 0:
            break

# One row of logits per example, one column per class.
print('output size:', output.size())

# You are expected to see the following output:
# output size: torch.Size([8, 2])

print(model)

output size: torch.Size([8, 2])
TextClassificationModel(
  (embedding): EmbeddingBag(2840, 64, mode='mean')
  (hidden_layers): Sequential(
    (0): Linear(in_features=64, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=8, bias=True)
    (3): ReLU()
  )
  (fc): Linear(in_features=8, out_features=2, bias=True)
)


In [14]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 10  # number of passes over the training split
LR = 0.01  # initial learning rate for SGD
BATCH_SIZE = 4  # batch size for the train/valid/test dataloaders

# Cross-entropy over the raw logits returned by the model.
criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Each scheduler.step() call multiplies the learning rate by 0.1 (step_size=1);
# the training loop decides when to call it.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)



In [15]:
# First, obtain some output and labels (first batch only, no gradients).
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(dataloader):
        output = model(token_ids, offsets)
        # print(f"batch {i} output: {output}")
        if i == 0:
            break
print('output shape:', output.shape)

# Reference value: CrossEntropyLoss averages the per-example losses.
loss = criterion(output, labels)
print('loss:', loss)

# Manually calculate the loss: for each example, the negative log of the
# softmax probability assigned to its true label.
loss_manual = []
for i in range(output.shape[0]):
    ### START YOUR CODE ###
    probs = torch.softmax(output[i], dim=0)
    l = -torch.log(probs[labels[i]])
    ### END YOUR CODE ###
    loss_manual.append(l)
loss_manual = torch.stack(loss_manual)
# The mean of the manual losses should equal the criterion's value above.
print('loss_manual mean:', loss_manual.mean())

# You are expected to see the following output:
# NOTE(review): the reference loss below does not match the value recorded for
# this model; what matters is that the two printed losses agree.
# output shape: torch.Size([8, 2])
# loss: tensor(0.0115)
# loss_manual mean: tensor(0.0115)

output shape: torch.Size([8, 2])
loss: tensor(0.7348)
loss_manual mean: tensor(0.7348)


### 3. Train and Evaluate

In [16]:
import time
from sklearn.metrics import precision_score, recall_score, f1_score

def train(model, dataloader, optimizer, criterion, epoch: int):
    """Train ``model`` for one pass over ``dataloader``.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    at 0.5, and one optimizer step. Every 500 batches the accuracy over the
    batches seen since the previous report is printed and the counters reset.

    Args:
        model: module called as ``model(token_ids, offsets)`` returning logits.
        dataloader: sized iterable yielding ``(labels, token_ids, offsets)``.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss called as ``criterion(output, labels)``.
        epoch: epoch number, used only in the progress line.

    Returns:
        None. The model and optimizer are updated in place.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (labels, token_ids, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        ### START YOUR CODE ###
        # Forward pass
        output = model(token_ids, offsets)
        ### END YOUR CODE ###
        try:
            ### START YOUR CODE ###
            # Compute loss
            loss = criterion(output, labels)
            ### END YOUR CODE ###
        except Exception:
            # Dump the offending batch before re-raising so that shape
            # mismatches are easy to diagnose.
            print('Error in loss calculation')
            print('output: ', output.size())
            print('labels: ', labels.size())
            print('token_ids: ', token_ids)
            print('offsets: ', offsets)
            raise
        ### START YOUR CODE ###
        # Backward propagation, grad clipping, and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        ### END YOUR CODE ###

        ### START YOUR CODE ###
        # Calculate correct prediction in current batch
        total_acc += (output.argmax(1) == labels).sum().item()
        ### END YOUR CODE ###

        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            # Start a fresh accuracy window for the next report.
            total_acc, total_count = 0, 0

@torch.no_grad()
def evaluate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: module called as ``model(token_ids, offsets)`` returning logits.
        dataloader: iterable yielding ``(labels, token_ids, offsets)``.
        criterion: accepted so that existing call sites keep working; it is
            not needed to compute the returned metrics and is not called.

    Returns:
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 are the
        support-weighted averages over classes computed by scikit-learn.
    """
    model.eval()
    total_acc, total_count = 0, 0
    all_labels = []
    all_preds = []

    for label, text, offsets in dataloader:
        ### START YOUR CODE ###
        # Similar to the code in train function, but without backpropagation
        output = model(text, offsets)
        preds = output.argmax(1)  # computed once and reused below
        total_acc += (preds == label).sum().item()
        ### END YOUR CODE ###
        total_count += label.size(0)
        all_labels.extend(label.cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

    accuracy = total_acc / total_count
    # zero_division=0 yields the same values as the default ("warn" also scores
    # an undefined metric as 0) but without emitting an UndefinedMetricWarning
    # whenever the model never predicts one of the classes.
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return accuracy, precision, recall, f1

In [17]:
# Prepare train, valid, and test data
train_iter = iter(train_dataset)
test_iter = iter(test_dataset)
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Hold out 5% of the training examples for validation. The split is seeded so
# that re-running the notebook reproduces the same train/validation partition.
SPLIT_SEED = 42
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader needs shuffling; the evaluation metrics do not
# depend on batch order.
train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [18]:
# Run the training loop
# total_accu tracks the best validation accuracy seen so far (None before the
# first epoch).
total_accu = None
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader, optimizer, criterion, epoch)
    # evaluate() returns (accuracy, precision, recall, f1); only the accuracy is used here.
    accu_val, _, _, _ = evaluate(model, valid_dataloader, criterion)

    # Decay the learning rate when validation accuracy falls below the best
    # value so far; otherwise record the new best.
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

| epoch   1 |   500/ 3011 batches | accuracy    0.675
| epoch   1 |  1000/ 3011 batches | accuracy    0.701
| epoch   1 |  1500/ 3011 batches | accuracy    0.707
| epoch   1 |  2000/ 3011 batches | accuracy    0.737
| epoch   1 |  2500/ 3011 batches | accuracy    0.713
| epoch   1 |  3000/ 3011 batches | accuracy    0.723
-----------------------------------------------------------
| end of epoch   1 | time:  2.67s | valid accuracy    0.694 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   2 |   500/ 3011 batches | accuracy    0.734
| epoch   2 |  1000/ 3011 batches | accuracy    0.727
| epoch   2 |  1500/ 3011 batches | accuracy    0.697
| epoch   2 |  2000/ 3011 batches | accuracy    0.704
| epoch   2 |  2500/ 3011 batches | accuracy    0.717
| epoch   2 |  3000/ 3011 batches | accuracy    0.705
-----------------------------------------------------------
| end of epoch   2 | time:  2.72s | valid accuracy    0.694 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   3 |   500/ 3011 batches | accuracy    0.709
| epoch   3 |  1000/ 3011 batches | accuracy    0.718
| epoch   3 |  1500/ 3011 batches | accuracy    0.725
| epoch   3 |  2000/ 3011 batches | accuracy    0.718
| epoch   3 |  2500/ 3011 batches | accuracy    0.706
| epoch   3 |  3000/ 3011 batches | accuracy    0.705
-----------------------------------------------------------
| end of epoch   3 | time:  2.71s | valid accuracy    0.694 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   4 |   500/ 3011 batches | accuracy    0.714
| epoch   4 |  1000/ 3011 batches | accuracy    0.709
| epoch   4 |  1500/ 3011 batches | accuracy    0.719
| epoch   4 |  2000/ 3011 batches | accuracy    0.717
| epoch   4 |  2500/ 3011 batches | accuracy    0.712
| epoch   4 |  3000/ 3011 batches | accuracy    0.708
-----------------------------------------------------------
| end of epoch   4 | time:  2.70s | valid accuracy    0.696 
-----------------------------------------------------------
| epoch   5 |   500/ 3011 batches | accuracy    0.712
| epoch   5 |  1000/ 3011 batches | accuracy    0.713
| epoch   5 |  1500/ 3011 batches | accuracy    0.707
| epoch   5 |  2000/ 3011 batches | accuracy    0.724
| epoch   5 |  2500/ 3011 batches | accuracy    0.703
| epoch   5 |  3000/ 3011 batches | accuracy    0.724
-----------------------------------------------------------
| end of epoch   5 | time:  2.68s | valid accuracy    0.696 
--------------------------------------------------

In [19]:
# Save the model
# Only the parameters (state_dict) are written; reloading requires rebuilding
# TextClassificationModel with the same constructor arguments first.
torch.save(model.state_dict(), "text_classification_model_improved.pth")

In [20]:
# Evaluate on test data
# Precision, recall and F1 are the weighted averages returned by evaluate().
accu_test, precision_test, recall_test, f1_test = evaluate(model, test_dataloader, criterion)
print("Test accuracy: {:8.3f}".format(accu_test))
print("Test precision: {:8.3f}".format(precision_test))
print("Test recall: {:8.3f}".format(recall_test))
print("Test F1 score: {:8.3f}".format(f1_test))

Test accuracy:    0.733
Test precision:    0.646
Test recall:    0.733
Test F1 score:    0.641


In [21]:
sentiment_labels = ['negative', 'positive']

def predict(text, model, vocab, tokenizer, labels):
    """Classify a single piece of text and return its label name.

    Args:
        text: raw input string.
        model: module called as ``model(token_ids, offsets)`` returning logits
            of shape ``(1, num_class)``.
        vocab: callable mapping a list of tokens to a list of indices.
        tokenizer: callable mapping ``text`` to a list of tokens.
        labels: sequence of label names indexed by class id.

    Returns:
        The entry of ``labels`` at the arg-max class.
    """
    model.eval()
    # Place the inputs on the model's own device instead of relying on a
    # notebook-level global; a parameterless model falls back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    with torch.no_grad():
        # An explicit integer dtype matters when tokenization yields no tokens:
        # torch.tensor([]) would otherwise default to float32, which is not a
        # valid index dtype for an embedding lookup.
        token_ids = torch.tensor(
            vocab(tokenizer(text)), dtype=torch.int64, device=target_device
        )
        # A single bag that starts at position 0.
        offsets = torch.tensor([0], dtype=torch.int64, device=target_device)
        output = model(token_ids, offsets)
        return labels[output.argmax(1).item()]

# Classify one example sentence with the trained model and the active tokenizer.
ex_text_str = "圆圆说：爷爷"
print("This is a %s sentiment." % (predict(ex_text_str, model, vocab, tokenizer, sentiment_labels)))

This is a negative sentiment.


## 4. Explore Word Segmentation

In [22]:
import re
import jieba

def basic_chinese_tokenizer(text):
    """Return every character of ``text`` in the range U+4E00..U+9FFF, in order.

    Each matching character becomes its own token; everything else
    (digits, Latin letters, punctuation, whitespace) is discarded.
    """
    return [match.group() for match in re.finditer(r'[\u4e00-\u9fff]', text)]

def improved_chinese_tokenizer(text):
    """Split ``text`` into Chinese characters, digit runs, ASCII words and punctuation runs.

    Text that matches none of the four classes (whitespace, symbols such as
    '@') is dropped. Tokens are returned in their order of appearance.
    """
    alternatives = (
        r'[\u4e00-\u9fff]',  # one character in U+4E00..U+9FFF per token
        r'\d+',  # a run of digits
        r'[a-zA-Z]+',  # a run of ASCII letters
        r'[，。！？、：；（）【】《》“”‘’.,;:!?()\[\]{}]+',  # a run of punctuation marks
    )
    # A single capturing group around the alternation: group 1 is the whole token.
    combined_pattern = "(" + "|".join(alternatives) + ")"
    return [match.group(1) for match in re.finditer(combined_pattern, text)]

def jieba_chinese_tokenizer(text):
    """Segment ``text`` with ``jieba.lcut`` and return the resulting token list."""
    return jieba.lcut(text)

# Active tokenizer for the word-segmentation experiment; the alternatives are
# kept commented out so that they can be swapped in.
# tokenizer = basic_chinese_tokenizer
# tokenizer = improved_chinese_tokenizer
tokenizer = jieba_chinese_tokenizer

def yield_tokens(data_iter):
    """Yield the token list of each example's 'sentence' field.

    ``data_iter`` is iterated one example (a mapping with a 'sentence' key)
    at a time; the module-level ``tokenizer`` is looked up on every item.
    """
    for example in data_iter:
        sentence = example['sentence']
        yield tokenizer(sentence)

In [23]:
# Show the tokenization of the first four training sentences.
count = 0
for count, tokens in enumerate(yield_tokens(iter(train_dataset)), start=1):  # Use a new iterator
    print(tokens)
    if count > 3:
        break

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\tyk13\AppData\Local\Temp\jieba.cache
Loading model cost 0.399 seconds.
Prefix dict has been built successfully.


['卖', '油条', '小', '刘说', '：', '我', '说']
['保姆', '小张', '说', '：', '干', '啥子', '嘛', '？']
['卖', '油条', '小', '刘说', '：', '你', '看', '你', '往', '星空', '看', '月', '朦胧', '，', '鸟', '朦胧']
['卖', '油条', '小', '刘说', '：', '咱', '是不是', '歇', '一下', '这', '双', '，', '疲惫', '的', '双腿', '？']


In [24]:
# build vocabulary
from torchtext.vocab import build_vocab_from_iterator

# Rebuild the token -> index mapping with the tokenizer that is active now.
# "<unk>" is registered as a special token and out-of-vocabulary lookups fall
# back to its index instead of raising.
vocab = build_vocab_from_iterator(yield_tokens(iter(train_dataset)), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [25]:
# Check the vocab
# The queries below are single characters; with a word-level tokenizer a
# character only has an index if it also occurs as a standalone token, so
# some of them resolve to the "<unk>" index.
print(vocab(['刘', '说', '你', '看']))
print(vocab(['刘', '说', '咱', '是', '不', '是', '歇', '一', '下']))
# English words do not occur in the corpus, so they map to the "<unk>" index.
print(vocab(['of', 'saucy']))

print("Vocabulary size:", len(vocab))
print(vocab(['你', '好', '世','界', '！','。','@']))
print(vocab(['卖', '油', '条','鸡', '！','。','@','123']))

[435, 2, 5, 61]
[435, 2, 146, 12, 20, 12, 1120, 73, 320]
[0, 0]
Vocabulary size: 13847
[5, 48, 0, 0, 43, 153, 0]
[385, 3516, 2129, 6008, 43, 153, 0, 0]


In [26]:
def text_pipeline(x):
    """Tokenize the raw string ``x`` and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Coerce the raw label ``x`` to ``int``."""
    return int(x)


# Test text_pipeline()
tokens = text_pipeline('这是一个示例')
print(tokens)

# Test label_pipeline()
lbl = label_pipeline('1')
print(lbl)

[240, 68, 0]
1


In [27]:
from torch.utils.data import DataLoader
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Collate a list of examples into the flat layout used with ``nn.EmbeddingBag``.

    Args:
        batch: sequence of mappings with 'label' and 'sentence' keys.

    Returns:
        ``(labels, token_ids, offsets)``, all moved to ``device``:
        ``labels`` holds one int64 label per example, ``token_ids`` is the
        concatenation of every example's token indices, and ``offsets[i]`` is
        the position in ``token_ids`` where example ``i`` starts.
    """
    # One pass over the batch: (label, 1-D tensor of token indices) per example.
    encoded = [
        (
            label_pipeline(example['label']),
            torch.tensor(text_pipeline(example['sentence']), dtype=torch.int64),
        )
        for example in batch
    ]
    label_list = [label for label, _ in encoded]
    token_ids_list = [ids for _, ids in encoded]

    # Start positions are the cumulative sum of the lengths, shifted by one:
    # prepend 0 and drop the last length before accumulating.
    lengths = [0] + [ids.size(0) for ids in token_ids_list]

    labels = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)
    token_ids = torch.cat(token_ids_list)

    return labels.to(device), token_ids.to(device), offsets.to(device)

In [28]:
# Use collate_batch to generate the dataloader.
# Unshuffled batches of 8, so that the inspection cells see the first 8 examples.
dataloader = DataLoader(
    train_dataset, batch_size=8, shuffle=False, collate_fn=collate_batch
)

In [29]:
# Test the dataloader: print the first batch only.
for i, (labels, token_ids, offsets) in enumerate(dataloader):
    print(f"batch {i} label: {labels}")
    print(f"batch {i} text: {token_ids}")
    print(f"batch {i} offsets: {offsets}")
    if i == 0:
        break

# What does offsets mean?
# offsets[k] is the index in the flat token_ids tensor where example k starts,
# so example k spans token_ids[offsets[k]:offsets[k + 1]] and the last example
# runs to the end. The hard-coded index 7 relies on the batch size of 8.
print('Number of tokens in this batch: ', token_ids.size(0))
print('Number of examples in one batch: ', labels.size(0))
print('Example 0: ', token_ids[offsets[0]:offsets[1]])
print('Example 7: ', token_ids[offsets[7]:])

# You are expected to see the following output:
# NOTE(review): these reference values do not match the output recorded for
# this dataset; they look like leftovers from the lab template.
# Number of tokens in this batch:  82
# Number of examples in one batch:  8
# Example 0:  tensor([ 4579,    92, 13266,    38,     1,  7742, 10000])
# Example 7:  tensor([   5, 7100])

batch 0 label: tensor([0, 0, 1, 0, 0, 1, 0, 0])
batch 0 text: tensor([  385,   536,    79,   671,     1,     3,     2,    51,    84,     2,
            1,   185,  1735,    37,    18,   385,   536,    79,   671,     1,
            5,    61,     5,   396, 10492,    61,   626,  5161,     4,  1344,
         5161,   385,   536,    79,   671,     1,   146,   104,  1120,   172,
           16,  4410,     4, 11491,     6,  4414,    18,   385,   536,    79,
          671,     1,   178,    55,     3,  2708,     7,   385,   536,    79,
          671,     1,     3,     2,  1694,     6,   238,     5,  1890,    13,
           18,    51,    84,     2,     1,  4915,  1890,   715,     3,    97,
         3298,   385,   536,    79,   671,     1,  1987])
batch 0 offsets: tensor([ 0,  7, 15, 31, 47, 57, 71, 81])
Number of tokens in this batch:  87
Number of examples in one batch:  8
Example 0:  tensor([385, 536,  79, 671,   1,   3,   2])
Example 7:  tensor([ 385,  536,   79,  671,    1, 1987])


### 2. Build the Model

In [30]:
from torch import nn

class TextClassificationModel(nn.Module):
    """Bag-of-embeddings classifier.

    Token indices are pooled per example by an ``nn.EmbeddingBag``, passed
    through two ReLU-activated linear layers, and projected to ``num_class``
    logits by a final linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding dimension.
        num_class: number of output logits.
        hidden_dim1: width of the first hidden layer.
        hidden_dim2: width of the second hidden layer.
    """

    def __init__(self, vocab_size, embed_dim, num_class, hidden_dim1, hidden_dim2):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.hidden_layers = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim1),
            nn.ReLU(),
            nn.Linear(hidden_dim1, hidden_dim2),
            nn.ReLU(),
        )
        self.fc = nn.Linear(hidden_dim2, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw all weights from U(-0.5, 0.5) and zero every linear bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        # Hidden linear layers first, then the output layer.
        linear_layers = [m for m in self.hidden_layers if isinstance(m, nn.Linear)]
        linear_layers.append(self.fc)
        for linear in linear_layers:
            nn.init.uniform_(linear.weight, -bound, bound)
            nn.init.zeros_(linear.bias)

    def forward(self, token_ids, offsets):
        """Return the logits for a flat ``token_ids`` tensor split into bags by ``offsets``."""
        pooled = self.embedding(token_ids, offsets)
        hidden = self.hidden_layers(pooled)
        return self.fc(hidden)

In [31]:

# Build the model
# A fresh model sized for the vocabulary that is active now; the output layer
# is sized by the number of distinct labels in the training data.
train_iter = iter(train_dataset)
num_class = len({example['label'] for example in train_iter})
vocab_size = len(vocab)
emsize = 64  # embedding size
hidden_dim1, hidden_dim2 = 16, 8
model = TextClassificationModel(vocab_size, emsize, num_class, hidden_dim1, hidden_dim2).to(device)

In [32]:
# Test the model: run the untrained model on the first batch to check shapes.
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(dataloader):
        output = model(token_ids, offsets)
        # print(f"batch {i} output: {output}")
        if i == 0:
            break

# One row of logits per example, one column per class.
print('output size:', output.size())

# You are expected to see the following output:
# output size: torch.Size([8, 2])

print(model)

output size: torch.Size([8, 2])
TextClassificationModel(
  (embedding): EmbeddingBag(13847, 64, mode='mean')
  (hidden_layers): Sequential(
    (0): Linear(in_features=64, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=8, bias=True)
    (3): ReLU()
  )
  (fc): Linear(in_features=8, out_features=2, bias=True)
)


In [33]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 10  # number of passes over the training split
LR = 0.01  # initial learning rate for SGD
BATCH_SIZE = 4  # batch size for the train/valid/test dataloaders

# Cross-entropy over the raw logits returned by the model.
criterion = torch.nn.CrossEntropyLoss()

# The optimizer and scheduler are recreated here so that they are bound to the
# parameters of the model instance built for this experiment.
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# Each scheduler.step() call multiplies the learning rate by 0.1 (step_size=1);
# the training loop decides when to call it.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

In [34]:
# First, obtain some output and labels (first batch only, no gradients).
model.eval()
with torch.no_grad():
    for i, (labels, token_ids, offsets) in enumerate(dataloader):
        output = model(token_ids, offsets)
        # print(f"batch {i} output: {output}")
        if i == 0:
            break
print('output shape:', output.shape)

# Reference value: CrossEntropyLoss averages the per-example losses.
loss = criterion(output, labels)
print('loss:', loss)

# Manually calculate the loss: for each example, the negative log of the
# softmax probability assigned to its true label.
loss_manual = []
for i in range(output.shape[0]):
    ### START YOUR CODE ###
    probs = torch.softmax(output[i], dim=0)
    l = -torch.log(probs[labels[i]])
    ### END YOUR CODE ###
    loss_manual.append(l)
loss_manual = torch.stack(loss_manual)
# The mean of the manual losses should equal the criterion's value above.
print('loss_manual mean:', loss_manual.mean())

# You are expected to see the following output:
# NOTE(review): the reference loss below does not match the value recorded for
# this model; what matters is that the two printed losses agree.
# output shape: torch.Size([8, 2])
# loss: tensor(0.0115)
# loss_manual mean: tensor(0.0115)

output shape: torch.Size([8, 2])
loss: tensor(0.6165)
loss_manual mean: tensor(0.6165)


### 3. Train and Evaluate

In [35]:
import time
from sklearn.metrics import precision_score, recall_score, f1_score

def train(model, dataloader, optimizer, criterion, epoch: int):
    """Train ``model`` for one pass over ``dataloader``.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    at 0.5, and one optimizer step. Every 500 batches the accuracy over the
    batches seen since the previous report is printed and the counters reset.

    Args:
        model: module called as ``model(token_ids, offsets)`` returning logits.
        dataloader: sized iterable yielding ``(labels, token_ids, offsets)``.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss called as ``criterion(output, labels)``.
        epoch: epoch number, used only in the progress line.

    Returns:
        None. The model and optimizer are updated in place.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (labels, token_ids, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        ### START YOUR CODE ###
        # Forward pass
        output = model(token_ids, offsets)
        ### END YOUR CODE ###
        try:
            ### START YOUR CODE ###
            # Compute loss
            loss = criterion(output, labels)
            ### END YOUR CODE ###
        except Exception:
            # Dump the offending batch before re-raising so that shape
            # mismatches are easy to diagnose.
            print('Error in loss calculation')
            print('output: ', output.size())
            print('labels: ', labels.size())
            print('token_ids: ', token_ids)
            print('offsets: ', offsets)
            raise
        ### START YOUR CODE ###
        # Backward propagation, grad clipping, and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        ### END YOUR CODE ###

        ### START YOUR CODE ###
        # Calculate correct prediction in current batch
        total_acc += (output.argmax(1) == labels).sum().item()
        ### END YOUR CODE ###

        total_count += labels.size(0)
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            # Start a fresh accuracy window for the next report.
            total_acc, total_count = 0, 0

@torch.no_grad()
def evaluate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: module called as ``model(token_ids, offsets)`` returning logits.
        dataloader: iterable yielding ``(labels, token_ids, offsets)``.
        criterion: accepted so that existing call sites keep working; it is
            not needed to compute the returned metrics and is not called.

    Returns:
        ``(accuracy, precision, recall, f1)``. Precision, recall and F1 are the
        support-weighted averages over classes computed by scikit-learn.
    """
    model.eval()
    total_acc, total_count = 0, 0
    all_labels = []
    all_preds = []

    for label, text, offsets in dataloader:
        ### START YOUR CODE ###
        # Similar to the code in train function, but without backpropagation
        output = model(text, offsets)
        preds = output.argmax(1)  # computed once and reused below
        total_acc += (preds == label).sum().item()
        ### END YOUR CODE ###
        total_count += label.size(0)
        all_labels.extend(label.cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

    accuracy = total_acc / total_count
    # zero_division=0 yields the same values as the default ("warn" also scores
    # an undefined metric as 0) but without emitting an UndefinedMetricWarning
    # whenever the model never predicts one of the classes.
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

    return accuracy, precision, recall, f1

In [36]:
# Prepare train, valid, and test data
SPLIT_SEED = 42  # fixed seed so the train/validation split is the same on every run

train_iter = iter(train_dataset)
test_iter = iter(test_dataset)
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Hold out 5% of the training data for validation.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training loader is shuffled; the metrics computed on the
# validation and test sets do not depend on batch order.
train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

In [37]:
# Run the training loop
# total_accu tracks the highest validation accuracy seen so far (None before the first epoch).
total_accu = None
separator = "-" * 59
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    train(model, train_dataloader, optimizer, criterion, epoch)
    # evaluate() returns (accuracy, precision, recall, f1); only accuracy is used here.
    accu_val, _, _, _ = evaluate(model, valid_dataloader, criterion)

    if total_accu is None or accu_val >= total_accu:
        # New best (or first) validation accuracy: remember it.
        total_accu = accu_val
    else:
        # Validation accuracy fell below the best so far: advance the scheduler.
        scheduler.step()

    print(separator)
    summary = (
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} "
    ).format(epoch, time.time() - epoch_start_time, accu_val)
    print(summary)
    print(separator)

| epoch   1 |   500/ 3011 batches | accuracy    0.709
| epoch   1 |  1000/ 3011 batches | accuracy    0.702
| epoch   1 |  1500/ 3011 batches | accuracy    0.718
| epoch   1 |  2000/ 3011 batches | accuracy    0.716
| epoch   1 |  2500/ 3011 batches | accuracy    0.716
| epoch   1 |  3000/ 3011 batches | accuracy    0.710
-----------------------------------------------------------
| end of epoch   1 | time:  4.34s | valid accuracy    0.711 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   2 |   500/ 3011 batches | accuracy    0.720
| epoch   2 |  1000/ 3011 batches | accuracy    0.701
| epoch   2 |  1500/ 3011 batches | accuracy    0.692
| epoch   2 |  2000/ 3011 batches | accuracy    0.717
| epoch   2 |  2500/ 3011 batches | accuracy    0.723
| epoch   2 |  3000/ 3011 batches | accuracy    0.723
-----------------------------------------------------------
| end of epoch   2 | time:  4.59s | valid accuracy    0.711 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   3 |   500/ 3011 batches | accuracy    0.702
| epoch   3 |  1000/ 3011 batches | accuracy    0.716
| epoch   3 |  1500/ 3011 batches | accuracy    0.713
| epoch   3 |  2000/ 3011 batches | accuracy    0.717
| epoch   3 |  2500/ 3011 batches | accuracy    0.705
| epoch   3 |  3000/ 3011 batches | accuracy    0.722
-----------------------------------------------------------
| end of epoch   3 | time:  4.70s | valid accuracy    0.711 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   4 |   500/ 3011 batches | accuracy    0.707
| epoch   4 |  1000/ 3011 batches | accuracy    0.704
| epoch   4 |  1500/ 3011 batches | accuracy    0.712
| epoch   4 |  2000/ 3011 batches | accuracy    0.723
| epoch   4 |  2500/ 3011 batches | accuracy    0.706
| epoch   4 |  3000/ 3011 batches | accuracy    0.726
-----------------------------------------------------------
| end of epoch   4 | time:  4.67s | valid accuracy    0.711 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   5 |   500/ 3011 batches | accuracy    0.727
| epoch   5 |  1000/ 3011 batches | accuracy    0.725
| epoch   5 |  1500/ 3011 batches | accuracy    0.715
| epoch   5 |  2000/ 3011 batches | accuracy    0.701
| epoch   5 |  2500/ 3011 batches | accuracy    0.701
| epoch   5 |  3000/ 3011 batches | accuracy    0.710
-----------------------------------------------------------
| end of epoch   5 | time:  4.55s | valid accuracy    0.711 
-----------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   6 |   500/ 3011 batches | accuracy    0.708
| epoch   6 |  1000/ 3011 batches | accuracy    0.704
| epoch   6 |  1500/ 3011 batches | accuracy    0.703
| epoch   6 |  2000/ 3011 batches | accuracy    0.724
| epoch   6 |  2500/ 3011 batches | accuracy    0.716
| epoch   6 |  3000/ 3011 batches | accuracy    0.725
-----------------------------------------------------------
| end of epoch   6 | time:  4.40s | valid accuracy    0.713 
-----------------------------------------------------------
| epoch   7 |   500/ 3011 batches | accuracy    0.706
| epoch   7 |  1000/ 3011 batches | accuracy    0.711
| epoch   7 |  1500/ 3011 batches | accuracy    0.715
| epoch   7 |  2000/ 3011 batches | accuracy    0.721
| epoch   7 |  2500/ 3011 batches | accuracy    0.726
| epoch   7 |  3000/ 3011 batches | accuracy    0.709
-----------------------------------------------------------
| end of epoch   7 | time:  4.60s | valid accuracy    0.715 
--------------------------------------------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


| epoch   9 |   500/ 3011 batches | accuracy    0.691
| epoch   9 |  1000/ 3011 batches | accuracy    0.741
| epoch   9 |  1500/ 3011 batches | accuracy    0.706
| epoch   9 |  2000/ 3011 batches | accuracy    0.719
| epoch   9 |  2500/ 3011 batches | accuracy    0.705
| epoch   9 |  3000/ 3011 batches | accuracy    0.722
-----------------------------------------------------------
| end of epoch   9 | time:  4.51s | valid accuracy    0.711 
-----------------------------------------------------------
| epoch  10 |   500/ 3011 batches | accuracy    0.703
| epoch  10 |  1000/ 3011 batches | accuracy    0.721
| epoch  10 |  1500/ 3011 batches | accuracy    0.724
| epoch  10 |  2000/ 3011 batches | accuracy    0.702
| epoch  10 |  2500/ 3011 batches | accuracy    0.706
| epoch  10 |  3000/ 3011 batches | accuracy    0.731
-----------------------------------------------------------
| end of epoch  10 | time:  4.60s | valid accuracy    0.710 
--------------------------------------------------

In [38]:
# Save the model: only the learned parameters (state_dict) are written to disk.
model_weights = model.state_dict()
torch.save(model_weights, "text_classification_model_improved.pth")

In [39]:
# Evaluate on test data
accu_test, precision_test, recall_test, f1_test = evaluate(model, test_dataloader, criterion)

# One line per metric, each with its own label and the shared 8.3f formatting.
test_report = (
    ("Test accuracy: {:8.3f}", accu_test),
    ("Test precision: {:8.3f}", precision_test),
    ("Test recall: {:8.3f}", recall_test),
    ("Test F1 score: {:8.3f}", f1_test),
)
for template, value in test_report:
    print(template.format(value))

Test accuracy:    0.739
Test precision:    0.677
Test recall:    0.739
Test F1 score:    0.631


In [40]:
# Class names indexed by the model's predicted class id.
sentiment_labels = ['negative', 'positive']

def predict(text, model, vocab, tokenizer, labels):
    """Classify a single string and return the matching entry of ``labels``.

    Args:
        text: Raw input string.
        model: Module called as ``model(token_ids, offsets)``; ``argmax(1)`` of
            its output selects the label. It is switched to eval mode.
        vocab: Callable mapping a list of tokens to a list of integer ids.
        tokenizer: Callable mapping ``text`` to a list of tokens.
        labels: Sequence of label names indexed by predicted class id.

    Returns:
        The element of ``labels`` at the predicted class index.
    """
    model.eval()
    # Build the inputs on the model's own device rather than relying on a
    # notebook-level `device` variable; fall back to CPU for a parameterless model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    with torch.no_grad():
        # Explicit int64: torch.tensor([]) would otherwise infer float32 when the
        # tokenizer yields no tokens, which is not a valid index dtype.
        token_ids = torch.tensor(
            vocab(tokenizer(text)), dtype=torch.int64, device=target_device
        )
        # A single bag starting at position 0 (batch of one sentence).
        offsets = torch.tensor([0], dtype=torch.int64, device=target_device)
        output = model(token_ids, offsets)
        return labels[output.argmax(1).item()]

# Quick sanity check of predict() on one example sentence.
ex_text_str = "圆圆说：爷爷"
predicted_sentiment = predict(ex_text_str, model, vocab, tokenizer, sentiment_labels)
print("This is a %s sentiment." % predicted_sentiment)

This is a negative sentiment.
