In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

for module in [pd, np, torch]:
    print(module.__name__, module.__version__)

pandas 1.5.3
numpy 1.24.3
torch 2.0.0+cu117


In [2]:
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator
ch = spacy.load('zh_core_web_sm')

In [3]:
def charTokenize(text):
    return [char for char in text]

print(charTokenize('我爱北京天安门'))

def cnTokenize(text):
    return [tok.text for tok in ch(text)]

print(cnTokenize('我爱北京天安门'))

['我', '爱', '北', '京', '天', '安', '门']
['我', '爱', '北京', '天安门']


In [10]:
train_data_path = './data/train.txt'
data_pipe = dp.iter.IterableWrapper([train_data_path])
data_pipe = dp.iter.FileOpener(data_pipe, mode='r')
data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)

def extract_attributes(row):
    return row[1], row[2]
data_pipe = data_pipe.map(extract_attributes)

def get_tokens(data_iter, tokenizer):
    for _, text in data_iter:
            yield tokenizer(text)

def build_vocab(data_iter, tokenizer):
    vocab = build_vocab_from_iterator(
        get_tokens(data_iter, tokenizer),
        min_freq=2,
        specials=["<pad>", "<sos>", "<eos>", "<unk>"],
        special_first=True
    )
    vocab.set_default_index(vocab["<unk>"])
    return vocab


vocab = build_vocab(data_pipe, charTokenize)
print('Built vocab...')
print(type(vocab), len(vocab))

def getTransform(vocab):
    text_transform = T.Sequential(
        T.VocabTransform(vocab),
        T.AddToken(1, begin=True),
        T.AddToken(2, begin=False)
    )
    return text_transform

def apply_transform(sample):
    text_transformer = getTransform(vocab)
    tokenized_text = charTokenize(sample[1])
    return text_transformer(tokenized_text), [float(sample[0])]

print(apply_transform(('2',"我爱北京天安门")))

data_pipe = data_pipe.map(apply_transform)
print('Applied transform...')
#for sample in data_pipe:
#    print(sample)
#    break

def sortBucket(bucket):
    return sorted(bucket, key=lambda x: len(x[0]))

data_pipe = data_pipe.bucketbatch(
    batch_size = 8, 
    batch_num=5,  # batch_num is the number of batches to keep in a bucket
    bucket_num=1, # bucket_num is the number of buckets
    use_in_batch_shuffle=False, 
    sort_key=sortBucket
)

print('Afte batch ...')
#for batch in data_pipe:
#    print(batch)
#    print(len(batch))
#    break


def separate_batch(batch):
    '''
    Inputs: [(text1, label1), (text2, label2), ...]
    Outputs: ([text1, text2, ...], [label1, label2, ...])
    '''
    texts, labels = zip(*batch)
    return texts, labels

data_pipe = data_pipe.map(separate_batch)
print('After seperate batch ...')
#for texts, labels in data_pipe:
#    print(len(texts), texts)
#    print(len(labels), labels)
#    break


def apply_padding(sample):
    return (T.ToTensor(0)(list(sample[0])), torch.tensor(list(sample[1])))

data_pipe = data_pipe.map(apply_padding)
print('After apply padding ...')
for texts, labels in data_pipe:
    print(type(texts), texts.shape)
    print(type(labels), labels.shape)
    break

Built vocab...
<class 'torchtext.vocab.vocab.Vocab'> 7929
([1, 25, 278, 349, 598, 97, 225, 335, 2], [2.0])
Applied transform...
Afte batch ...
After seperate batch ...
After apply padding ...
<class 'torch.Tensor'> torch.Size([8, 579])
<class 'torch.Tensor'> torch.Size([8, 1])


In [15]:
class LSTMRegressionModel(nn.Module):
    def __init__(self, embedding_size, hidden_size, vocab_size, embedding_pretrained=None) -> None:
        super(LSTMRegressionModel, self).__init__()
        if embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_pretrained, freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=vocab_size-2)
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, batch_first=True, bidirectional=False)
        self.linear = nn.Linear(hidden_size, 1)

    def forward(self, inputs):
        # [batch_size, seq_len] -> [batch_size, seq_len, embedding_size]
        x = self.embedding(inputs)
        # [batch_size, seq_len, embed_dim] -> [num_layers, batch_size, hidden_size]
        _, (x, _) = self.lstm(x)
        x = x[-1, :, :]
        # [batch_size, hidden_size] -> [batch_size, 1]
        x = self.linear(x)
        return x



def train(dataset, model, loss_fn, optimizer):
    model.train()
    for batch, (X, y) in enumerate(dataset, 1):
        pred = model(X)
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


        if batch * len(X) % 1000 == 0:
            loss, current = loss.item(), (batch) * len(X)
            print(f"\tbatch: [{current:>5d}], loss: {loss:>7f}")


In [16]:
import torch.optim as optim
import torch.nn.functional as F
import tqdm
import time

epochs = 3
learning_rate = 1e-3
embedding_dim = 100
hidden_dim = 128
vocab_size = len(vocab)
model = LSTMRegressionModel(embedding_dim, hidden_dim, vocab_size)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#for batch, (X, y) in enumerate(data_pipe):
#    print(batch, X.size(), y.size())
#    pred = model(X)
#    print(pred.size())
#   break

for epoch in range(epochs):
    epoch_start_time = time.time()
    print(f"Epoch: {epoch}")
    train(data_pipe, model, loss_fn, optimizer)
    print(f"Epoch: {epoch}, escaplse {time.time() - epoch_start_time:10.2f} seconds")


Epoch: 0
	batch: [  800], loss: 775.336182
	batch: [ 1600], loss: 914.114441
	batch: [ 2400], loss: 805.815674
	batch: [ 3200], loss: 878.371582
	batch: [ 4000], loss: 1815.596802
	batch: [ 4800], loss: 1793.880005
	batch: [ 5600], loss: 1074.542969
	batch: [ 6400], loss: 596.021606
	batch: [ 7200], loss: 1499.280884
	batch: [ 8000], loss: 1283.532471
	batch: [ 8800], loss: 1356.803101
	batch: [ 9600], loss: 633.639221
	batch: [10400], loss: 1184.123169
	batch: [11200], loss: 1086.621826
	batch: [12000], loss: 1517.476074
	batch: [12800], loss: 877.793884
	batch: [13600], loss: 1541.178223
	batch: [14400], loss: 954.177002
	batch: [15200], loss: 890.035583
	batch: [16000], loss: 1330.868408
	batch: [16800], loss: 960.231079
	batch: [17600], loss: 1212.364868
	batch: [18400], loss: 1075.360962
	batch: [19200], loss: 1364.616943
	batch: [20000], loss: 994.610840
	batch: [20800], loss: 1004.119995
	batch: [21600], loss: 1377.440918
	batch: [22400], loss: 1106.871826
Epoch: 0, escaplse    