In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

for module in [pd, np, torch]:
    print(module.__name__, module.__version__)

pandas 1.5.3
numpy 1.24.3
torch 2.0.0+cu117


In [5]:
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator
ch = spacy.load('zh_core_web_sm')

In [6]:
def charTokenize(text):
    return [char for char in text]

print(charTokenize('我爱北京天安门'))

def cnTokenize(text):
    return [tok.text for tok in ch(text)]

print(cnTokenize('我爱北京天安门'))

['我', '爱', '北', '京', '天', '安', '门']
['我', '爱', '北京', '天安门']


In [10]:
train_data_path = './data/train.txt'
valid_data_path= './data/valid.txt'
data_pipe = dp.iter.IterableWrapper([train_data_path, valid_data_path])
data_pipe = dp.iter.FileOpener(data_pipe, mode='r')
data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)

def extract_attributes(row):
    return row[1], row[2]
data_pipe = data_pipe.map(extract_attributes)

def get_tokens(data_iter, tokenizer):
    for _, text in data_iter:
            yield tokenizer(text)

def build_vocab(data_iter, tokenizer):
    vocab = build_vocab_from_iterator(
        get_tokens(data_iter, tokenizer),
        min_freq=2,
        specials=["<pad>", "<sos>", "<eos>", "<unk>"],
        special_first=True
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab


vocab = build_vocab(data_pipe, charTokenize)
print('Built vocab...')
print(type(vocab), len(vocab))


Built vocab...
<class 'torchtext.vocab.vocab.Vocab'> 7929


In [12]:
# build train pipe and valid pipe
def getTransform(vocab):
    text_transform = T.Sequential(
        T.VocabTransform(vocab),
        T.AddToken(1, begin=True),
        T.AddToken(2, begin=False)
    )
    return text_transform

def apply_transform(sample):
    text_transformer = getTransform(vocab)
    tokenized_text = charTokenize(sample[1])
    return text_transformer(tokenized_text), [1.0 if float(sample[0]) >= 30 else 0., 0. if float(sample[0]) < 30 else 1.]

print(apply_transform(('30',"我爱北京天安门")))
print(apply_transform(('29', "我想买个大西瓜")))

def sortBucket(bucket):
    return sorted(bucket, key=lambda x: len(x[0]))

def separate_batch(batch):
    '''
    Inputs: [(text1, label1), (text2, label2), ...]
    Outputs: ([text1, text2, ...], [label1, label2, ...])
    '''
    texts, labels = zip(*batch)
    return texts, labels


def apply_padding(sample):
    return (T.ToTensor(0)(list(sample[0])), torch.tensor(list(sample[1])))

def build_data_pipe(data_path, batch_size=8):
    data_pipe = dp.iter.IterableWrapper([data_path])
    data_pipe = dp.iter.FileOpener(data_pipe, mode='r')
    data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)
    data_pipe = data_pipe.map(extract_attributes)
    
    data_pipe = data_pipe.map(apply_transform)
    print('Applied transform...')
    #for sample in data_pipe:
    #    print(sample)
    #    break

    data_pipe = data_pipe.bucketbatch(
        batch_size = batch_size, 
        batch_num=5,  # batch_num is the number of batches to keep in a bucket
        bucket_num=1, # bucket_num is the number of buckets
        use_in_batch_shuffle=False, 
        sort_key=sortBucket
    )

    print('Afte batch ...')
    #for batch in data_pipe:
    #    print(batch)
    #    print(len(batch))
    #    break

    data_pipe = data_pipe.map(separate_batch)
    print('After seperate batch ...')
    #for texts, labels in data_pipe:
    #    print(len(texts), texts)
    #    print(len(labels), labels)
    #    break


    data_pipe = data_pipe.map(apply_padding)
    print('After apply padding ...')
    return data_pipe

train_pipe = build_data_pipe(train_data_path)
for texts, labels in train_pipe:
    print(texts)
    print(type(texts), texts.shape)
    print(type(labels), labels.shape)
    break

valid_pipe = build_data_pipe(valid_data_path)
for texts, labels in valid_pipe:
    print(texts)
    print(type(texts), texts.shape)
    print(type(labels), labels.shape)
    break

([1, 25, 278, 349, 598, 97, 225, 335, 2], [1.0, 1.0])
([1, 25, 202, 640, 26, 21, 263, 1443, 2], [0.0, 0.0])
Applied transform...
Afte batch ...
After seperate batch ...
After apply padding ...
tensor([[   1,  864, 1916,  ...,    0,    0,    0],
        [   1,  349,  598,  ...,    0,    0,    0],
        [   1, 1451, 1625,  ...,    0,    0,    0],
        ...,
        [   1, 1625,  145,  ...,    0,    0,    0],
        [   1, 1828,  322,  ...,    0,    0,    0],
        [   1,   93,  135,  ...,   24,    7,    2]])
<class 'torch.Tensor'> torch.Size([8, 285])
<class 'torch.Tensor'> torch.Size([8, 2])
Applied transform...
Afte batch ...
After seperate batch ...
After apply padding ...
tensor([[   1,  136,  701,  ...,    0,    0,    0],
        [   1,  928, 1516,  ...,    0,    0,    0],
        [   1,  136,  249,  ...,    0,    0,    0],
        ...,
        [   1,  551,  604,  ...,    0,    0,    0],
        [   1,  215,   20,  ...,    0,    0,    0],
        [   1,  271, 1363,  ...,   37

In [18]:
class LSTMRegressionModel(nn.Module):
    def __init__(self, embedding_size, hidden_size, vocab_size, embedding_pretrained=None) -> None:
        super(LSTMRegressionModel, self).__init__()
        if embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=vocab_size-2)
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, batch_first=True, bidirectional=False)
        self.linear = nn.Linear(hidden_size, 2)

    def forward(self, inputs):
        # [batch_size, seq_len] -> [batch_size, seq_len, embedding_size]
        x = self.embedding(inputs)
        # [batch_size, seq_len, embed_dim] -> [num_layers, batch_size, hidden_size]
        _, (x, _) = self.lstm(x)
        x = x[-1, :, :]
        # [batch_size, hidden_size] -> [batch_size, 1]
        logits  = self.linear(x)
        return logits



def train(dataset, model, loss_fn, optimizer, device='cpu'):
    model.train()
    for batch, (X, y) in enumerate(dataset, 1):
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 200 == 0:
            loss, current = loss.item(), (batch) * len(X)
            print(f"| batch: [{batch:>5d}], loss: {loss:>7f}")

def evaluate(dataset, model, loss_fn, device='cpu'):
    model.eval()
    test_loss, correct = 0., 0.
    num_batches, size = 0, 0
    with torch.no_grad():
        for batch, (X, y) in enumerate(dataset, 1):
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y.argmax(1)).type(torch.float).sum().item()

            num_batches, size = batch, size + len(X) 
        test_loss /= num_batches
        correct /= size
    print(f"\n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")   

In [19]:
import torch.optim as optim
import torch.nn.functional as F
import tqdm
import time

epochs = 3
learning_rate = 1e-3
embedding_dim = 100
hidden_dim = 128
vocab_size = len(vocab)
model = LSTMRegressionModel(embedding_dim, hidden_dim, vocab_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#for batch, (X, y) in enumerate(data_pipe):
#    print(batch, X.size(), y.size())
#    pred = model(X)
#    print(pred.size())
#   break

for epoch in range(epochs):
    epoch_start_time = time.time()
    print(f"Epoch: {epoch}")
    train(train_pipe, model, loss_fn, optimizer)
    acc_val = evaluate(valid_pipe, model, loss_fn)
    print("-" * 59)
    print(f"| end of epoch {epoch:3d} | time: {time.time()-epoch_start_time:5.2f}s | valid accuracy {acc_val:8.3f} ")
    print("-" * 59)

Epoch: 0
| batch: [  200], loss: 0.866439
| batch: [  400], loss: 0.693152
| batch: [  600], loss: 0.693147
| batch: [  800], loss: 0.173287
| batch: [ 1000], loss: 0.519879
| batch: [ 1200], loss: 0.519860
| batch: [ 1400], loss: 0.866434
| batch: [ 1600], loss: 0.519860
| batch: [ 1800], loss: 0.519861


UnboundLocalError: local variable 'size' referenced before assignment