In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

# Report the installed version of each core library so the run can be reproduced.
for module in (pd, np, torch):
    print(f"{module.__name__} {module.__version__}")

pandas 1.5.3
numpy 1.24.3
torch 2.0.0+cu117


In [3]:
import torchdata.datapipes as dp
import torchtext.transforms as T
import spacy
from torchtext.vocab import build_vocab_from_iterator
# Load the spaCy pipeline 'zh_core_web_sm'; `ch` is the callable cnTokenize uses to segment text.
ch = spacy.load('zh_core_web_sm')

In [4]:
def charTokenize(text):
    """Split ``text`` into a list holding one element per character.

    No word segmentation is performed; an empty string yields an empty list.
    """
    return list(text)

# Demo: character-level tokenization of a sample sentence.
print(charTokenize('我爱北京天安门'))

def cnTokenize(text):
    """Segment ``text`` with the spaCy pipeline ``ch`` and return the token strings.

    Relies on the module-level ``ch`` object having been loaded beforehand.
    """
    doc = ch(text)
    return [token.text for token in doc]

# Demo: spaCy word-level tokenization of the same sample sentence.
print(cnTokenize('我爱北京天安门'))

['我', '爱', '北', '京', '天', '安', '门']
['我', '爱', '北京', '天安门']


In [11]:
# Location of the tab-delimited training file.
train_data_path = './data/train.txt'
# Wrap the path in an iterable datapipe so the following pipes can consume it.
data_pipe = dp.iter.IterableWrapper([train_data_path])
# Open the file in text mode.
data_pipe = dp.iter.FileOpener(data_pipe, mode='r')
# Parse tab-delimited rows without skipping any leading lines; each row is emitted as a tuple.
data_pipe = data_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)

def extract_attributes(row):
    """Keep only the second and third fields of a parsed row.

    Returns the pair ``(row[1], row[2])``; the first field is discarded.
    """
    second_field, third_field = row[1], row[2]
    return second_field, third_field
# Reduce every parsed row to the (row[1], row[2]) pair.
data_pipe = data_pipe.map(extract_attributes)

def get_tokens(data_iter, tokenizer):
    """Lazily yield ``tokenizer(text)`` for every 2-item ``(_, text)`` sample in ``data_iter``."""
    yield from (tokenizer(text) for _ignored, text in data_iter)

def build_vocab(data_iter, tokenizer):
    """Build a torchtext vocabulary from the tokenized texts of ``data_iter``.

    Tokens seen fewer than twice are dropped. The special tokens
    ``<pad>``, ``<sos>``, ``<eos>``, ``<unk>`` are placed first (in that
    order), and unknown tokens resolve to the index of ``<unk>``.
    """
    special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]
    token_stream = get_tokens(data_iter, tokenizer)
    vocab = build_vocab_from_iterator(
        token_stream,
        min_freq=2,
        specials=special_tokens,
        special_first=True,
    )
    # Out-of-vocabulary lookups fall back to <unk> instead of raising.
    unk_index = vocab["<unk>"]
    vocab.set_default_index(unk_index)
    return vocab


# Build the character-level vocabulary from the training pipe.
vocab = build_vocab(data_pipe, charTokenize)
print('Built vocab...')
# Show the vocabulary's type and its number of entries.
print(type(vocab), len(vocab))

def getTransform(vocab):
    """Return a transform that maps tokens to ids and wraps them with marker ids.

    The pipeline looks tokens up in ``vocab``, then prepends id 1 and
    appends id 2 (the slots of ``<sos>`` and ``<eos>`` in the specials list
    used by ``build_vocab``).
    """
    sos_index, eos_index = 1, 2
    steps = [
        T.VocabTransform(vocab),
        T.AddToken(sos_index, begin=True),
        T.AddToken(eos_index, begin=False),
    ]
    return T.Sequential(*steps)

def apply_transform(sample):
    """Turn one ``(label, text)`` sample into ``(token_ids, float_label)``.

    The text is split into characters, numericalized with the module-level
    ``vocab`` and wrapped with the start/end ids; the label is cast to float.
    """
    label, text = sample[0], sample[1]
    token_ids = getTransform(vocab)(charTokenize(text))
    return token_ids, float(label)

# Smoke-test the transform on a hand-written (label, text) sample.
print(apply_transform(('2',"我爱北京天安门")))

# Numericalize every sample flowing through the pipe.
data_pipe = data_pipe.map(apply_transform)
print('Applied transform...')
# Debug aid (disabled): uncomment to inspect the first transformed sample.
#for sample in data_pipe:
#    print(sample)
#    break

def sortBucket(bucket):
    """Return the samples of ``bucket`` as a new list, ordered by the length of each sample's first item.

    The sort is ascending and stable; ``bucket`` itself is left untouched.
    """
    ordered = list(bucket)
    ordered.sort(key=lambda sample: len(sample[0]))
    return ordered

# Batch the samples two at a time; sortBucket orders each bucket by token-sequence length
# before batching (presumably so that similarly sized sequences share a batch and padding is reduced).
data_pipe = data_pipe.bucketbatch(
    batch_size = 2, 
    batch_num=5,  # batch_num is the number of batches to keep in a bucket
    bucket_num=1, # bucket_num is the number of buckets
    use_in_batch_shuffle=False, 
    sort_key=sortBucket
)

print('Afte batch ...')
# Debug aid (disabled): uncomment to inspect the first batch and its size.
#for batch in data_pipe:
#    print(batch)
#    print(len(batch))
#    break


def separate_batch(batch):
    '''
    Unzip a batch of (text, label) pairs into parallel tuples.

    Inputs: [(text1, label1), (text2, label2), ...]
    Outputs: ((text1, text2, ...), (label1, label2, ...))
    '''
    unzipped = tuple(zip(*batch))
    texts, labels = unzipped
    return texts, labels

# Split every batch of (text, label) pairs into a (texts, labels) pair.
data_pipe = data_pipe.map(separate_batch)
print('After seperate batch ...')
# Debug aid (disabled): uncomment to inspect the first separated batch.
#for texts, labels in data_pipe:
#    print(len(texts), texts)
#    print(len(labels), labels)
#    break


def apply_padding(sample):
    """Convert a ``(texts, labels)`` batch into a pair of tensors.

    The token-id sequences go through ``T.ToTensor(0)`` (0 is passed as the
    padding value); the labels are wrapped in a plain ``torch.tensor``.
    """
    texts, labels = sample[0], sample[1]
    padded_texts = T.ToTensor(0)(list(texts))
    label_tensor = torch.tensor(list(labels))
    return padded_texts, label_tensor

# Pad every batch and convert it to tensors.
data_pipe = data_pipe.map(apply_padding)
print('After apply padding ...')
# Peek at the first padded batch only, then stop iterating.
for texts, labels in data_pipe:
    print(len(texts), texts)
    print(len(labels), labels)
    break

Built vocab...
<class 'torchtext.vocab.vocab.Vocab'> 7929
([1, 25, 278, 349, 598, 97, 225, 335, 2], 2.0)
Applied transform...
Afte batch ...
After seperate batch ...
tensor([[   1, 2009, 1457,   71,  111,    7, 2009, 1457,   71,  111,    6,   10,
           26,  211,  866,  464,    5,    5,  299,   29,    4,  527,   86, 1365,
          477,  123,  462,   52,   49,   10, 1794,   21,   50,    7,    6, 1001,
          844, 1649,  363,  994,   16,    4,  849,  152,   97,   72,    6,  778,
           21,  156,    4, 1547,  393, 1343,    4,  725,   72,   10,  770,   14,
         3551,  240, 2728,    6,  523,  152,  136, 3246,  387,  393,  156,    2,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    

In [5]:
class LSTMRegressionModel(nn.Module):
    """Single-layer, unidirectional LSTM that predicts one scalar per sequence.

    Data flow: token ids -> embedding -> LSTM -> final hidden state -> linear head.

    Args:
        embedding_dim: Width of a freshly created embedding table. When
            ``embedding_pretrained`` is given, the width of that matrix is
            used for the LSTM input instead.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows of a freshly created embedding table
            (unused when ``embedding_pretrained`` is given).
        embedding_pretrained: Optional 2-D float tensor used to initialise
            the embedding; it stays trainable (``freeze=False``).
        padding_idx: Id of the padding token. Defaults to 0, the value the
            batches in this notebook are padded with. The matching embedding
            row receives no gradient.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, embedding_pretrained=None, padding_idx=0) -> None:
        super().__init__()
        if embedding_pretrained is not None:
            self.embedding = nn.Embedding.from_pretrained(
                embedding_pretrained, freeze=False, padding_idx=padding_idx
            )
        else:
            # The previous `vocab_size - 2` pointed at an ordinary token, not at the pad id.
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        # Take the input width from the embedding itself so that a pretrained
        # matrix whose width differs from `embedding_dim` still fits.
        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_dim, batch_first=True, bidirectional=False)
        self.linear = nn.Linear(hidden_dim, 1)

    def forward(self, inputs):
        """Map a ``[batch_size, seq_len]`` tensor of token ids to ``[batch_size]`` predictions."""
        # [batch_size, seq_len] -> [batch_size, seq_len, embed_dim]
        embedded = self.embedding(inputs)
        # h_n: [num_layers, batch_size, hidden_dim], the hidden state after the last time step.
        # NOTE(review): for right-padded batches that last step may be a pad position;
        # consider pack_padded_sequence if this matters.
        _, (h_n, _) = self.lstm(embedded)
        # Keep the last layer only: [batch_size, hidden_dim] -> [batch_size, 1] -> [batch_size].
        # Returning [batch_size] lets the output line up with 1-D regression targets
        # instead of being silently broadcast inside the loss.
        return self.linear(h_n[-1]).squeeze(-1)



def train(dataset, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataset``.

    Args:
        dataset: Iterable yielding ``(X, y)`` tensor batches.
        model: Module called as ``model(X)`` to produce predictions.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer holding the parameters of ``model``.

    The loss of every 100th batch (starting with the first) is printed along
    with ``(batch_index + 1) * len(X)``.
    """
    model.train()
    for batch, (X, y) in enumerate(dataset):
        pred = model(X)
        # When prediction and target hold the same number of elements but differ in
        # shape (e.g. [1, B, 1] vs [B]), align them explicitly. Otherwise element-wise
        # losses such as MSELoss broadcast the two and optimise a wrong quantity.
        if pred.shape != y.shape and pred.numel() == y.numel():
            pred = pred.reshape(y.shape)
        loss = loss_fn(pred, y)

        # Clear gradients before the backward pass so that leftovers from earlier
        # work (e.g. a previously executed cell) cannot leak into this update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f} [{current:>5d}]")


import torch.optim as optim
import torch.nn.functional as F
import tqdm

# Training hyperparameters.
epochs = 5
learning_rate = 1e-3
embedding_dim = 100
hidden_dim = 128
# One embedding row per vocabulary entry.
vocab_size = len(vocab)
# Model, mean-squared-error loss and Adam optimizer used for the regression task.
model = LSTMRegressionModel(embedding_dim, hidden_dim, vocab_size)
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)