In [76]:
# 参考：bairui
# 改进：增加了使用word_embedding方式初始化torch.nn.Embedding代码

In [27]:
import pandas as pd
import numpy as np
import string
import torch
import os
from torch import nn, optim
from torch.autograd import Variable
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [4]:
# 常量
DEBUG = True

# 1 使用CBOW方法训练word_embedding

In [5]:
# 读入数据
train = pd.read_csv('data/train.tsv.zip', sep='\t')
test = pd.read_csv('data/test.tsv.zip', sep='\t')

In [6]:
train.head(5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [7]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
# 去除标点符号，并全部小写
train['Phrase1'] = train.Phrase.apply(lambda x: x.translate(str.maketrans('','', string.punctuation)).lower())
#str.maketrans('','', string.punctuation))使用了三个参数，前两个参数是转换，这里没有使用
#第三个参数中出现的字符将被转为None
test['Phrase1'] = test.Phrase.apply(lambda x: x.translate(str.maketrans('','', string.punctuation)).lower())

In [9]:
# 构建预训练语料
train_text = "".join(i for i in train['Phrase1'])
train_text[:300]

'a series of escapades demonstrating the adage that what is good for the goose is also good for the gander  some of which occasionally amuses but none of which amounts to much of a story a series of escapades demonstrating the adage that what is good for the goosea seriesaseriesof escapades demonstra'

In [10]:
test_text = "".join(i for i in test['Phrase1'])
test_text[:100]

'an intermittently pleasing but mostly routine effort an intermittently pleasing but mostly routine e'

In [52]:
raw_text = (" ".join((train_text, test_text))).split(' ')
if DEBUG:
    raw_text = raw_text[:300]

In [53]:
# 构建CBOW训练数据集
CONTEXT_SIZE = 2

vocab = set(raw_text)
word_to_idx = {word: i for i, word in enumerate(vocab)}

data = []
for i in range(CONTEXT_SIZE, len(raw_text)-CONTEXT_SIZE):
    context = [raw_text[i-2], raw_text[i-1], raw_text[i+1], raw_text[i+2]]
    target = raw_text[i]
    data.append((context, target))

In [55]:
# vocab

In [56]:
# 构建CBOW训练模型
class CBOW(nn.Module):
    def __init__(self, n_word, n_dim, context_size):
        super(CBOW, self).__init__()
        self.embedding = nn.Embedding(n_word, n_dim)
        self.linear1 = nn.Linear(2*context_size*n_dim, 128)
        self.linear2 = nn.Linear(128, n_word)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(1, -1)
        x = self.linear1(x)
        x = F.relu(x, inplace=True)
        x = self.linear2(x)
        x = F.log_softmax(x)
        return x
        
model = CBOW(len(word_to_idx), 50, CONTEXT_SIZE)
if torch.cuda.is_available():
    model = model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [57]:
# CBOW训练
for epoch in range(40):
    print('epoch {}'.format(epoch))
    print('*'*10)
    running_loss = 0
    for word in data:
        context, target = word
        context = Variable(torch.LongTensor([word_to_idx[i] for i in context]))
        target = Variable(torch.LongTensor([word_to_idx[target]]))
        if torch.cuda.is_available():
            context = context.cuda()
            target = target.cuda()

        out = model(context)
        loss = criterion(out, target)
        running_loss += loss.data

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('loss: {:.6f}'.format(running_loss / len(data)))

epoch 0
**********


  from ipykernel import kernelapp as app


loss: 4.072994
epoch 1
**********
loss: 3.711773
epoch 2
**********
loss: 3.334427
epoch 3
**********
loss: 2.937793
epoch 4
**********
loss: 2.558218
epoch 5
**********
loss: 2.233326
epoch 6
**********
loss: 1.969870
epoch 7
**********
loss: 1.760421
epoch 8
**********
loss: 1.594467
epoch 9
**********
loss: 1.462327
epoch 10
**********
loss: 1.355362
epoch 11
**********
loss: 1.266763
epoch 12
**********
loss: 1.191616
epoch 13
**********
loss: 1.126652
epoch 14
**********
loss: 1.069730
epoch 15
**********
loss: 1.019395
epoch 16
**********
loss: 0.974600
epoch 17
**********
loss: 0.934440
epoch 18
**********
loss: 0.898068
epoch 19
**********
loss: 0.864874
epoch 20
**********
loss: 0.834359
epoch 21
**********
loss: 0.806063
epoch 22
**********
loss: 0.779833
epoch 23
**********
loss: 0.755207
epoch 24
**********
loss: 0.732053
epoch 25
**********
loss: 0.710120
epoch 26
**********
loss: 0.689293
epoch 27
**********
loss: 0.669369
epoch 28
**********
loss: 0.650322
epoch 29
*****

In [58]:
model.embedding.weight.cpu().detach().numpy().shape

(66, 50)

In [59]:
weights_cbow = model.embedding.weight.cpu().detach().numpy()

# 2 模型构建

In [25]:
class TextRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_of_class, weights=None, rnn_type="RNN"):
        super(TextRNN, self).__init__()

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_of_class = num_of_class
        self.embedding_dim = embedding_dim
        self.rnn_type = rnn_type

        if weights is not None:
            self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=weights)
        else:
            self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        if rnn_type == "RNN":
            self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
            self.hidden2label = nn.Linear(hidden_size, num_of_class)
        elif rnn_type == "LSTM":
            self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True, bidirectional=True)
            self.hidden2label = nn.Linear(hidden_size*2, num_of_class)

    def forward(self, input_sents):
        batch_size, seq_len = input_sents.shape
        embed_out = self.embed(input_sents)

        if self.rnn_type == "RNN":
            h0 = torch.randn(1, batch_size, self.hidden_size)
            _, hn = self.rnn(embed_out, h0)
        elif self.rnn_type == "LSTM":
            h0, c0 = torch.randn(2, batch_size, self.hidden_size), torch.randn(2, batch_size, self.hidden_size)
            output, (hn, _) = self.lstm(embed_out, (h0, c0))

        logits = self.hidden2label(hn).squeeze(0)

        return logits

In [26]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_of_class, embedding_vectors=None, kernel_num=100, kerner_size=[3, 4, 5], dropout=0.5):
        super(TextCNN, self).__init__()
        if embedding_vectors is None:
            self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        else:
            self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, _weight=embedding_vectors)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embedding_dim)) for K in kerner_size])
        self.dropout = nn.Dropout(dropout)
        self.feature2label = nn.Linear(3*kernel_num, num_of_class)

    def forward(self, x):
        embed_out = self.embed(x).unsqueeze(1)
        conv_out = [F.relu(conv(embed_out)).squeeze(3) for conv in self.convs]

        pool_out = [F.max_pool1d(block, block.size(2)).squeeze(2) for block in conv_out]

        pool_out = torch.cat(pool_out, 1)

        logits = self.feature2label(pool_out)

        return logits

# 3 训练集构建

In [71]:
def prepare_data(dataset_path, sent_col_name, label_col_name):
    file_path = os.path.join(dataset_path, "train.tsv.zip")
    data = pd.read_csv(file_path, sep="\t")
    data['Phrase1'] = data.Phrase.apply(lambda x: x.translate(str.maketrans('','', string.punctuation)).lower())
    X = data[sent_col_name].values
    y = data[label_col_name].values
    return X, y

In [29]:
# word2id or id2word
class Language:
    def __init__(self):
        self.word2id = {}
        self.id2word = {}

    def fit(self, sent_list):
        vocab = set()
        for sent in sent_list:
            vocab.update(sent.split(" "))
        word_list = ["<pad>", "<unk>"] + list(vocab)
        self.word2id = {word: i for i, word in enumerate(word_list)}
        self.id2word = {i: word for i, word in enumerate(word_list)}

    def transform(self, sent_list, reverse=False):
        sent_list_id = []
        word_mapper = self.word2id if not reverse else self.id2word
        unk = self.word2id["<unk>"] if not reverse else None
        for sent in sent_list:
            sent_id = list(map(lambda x: word_mapper.get(x, unk), sent.split(" ") if not reverse else sent))
            sent_list_id.append(sent_id)
        return sent_list_id

In [30]:
class ClsDataset(Dataset):
    def __init__(self, sents, labels):
        self.sents = sents
        self.labels = labels

    def __getitem__(self, item):
        return self.sents[item], self.labels[item]

    def __len__(self):
        return len(self.sents)

In [31]:
# 语句对齐
def collate_fn(batch_data):
    batch_data.sort(key=lambda data_pair: len(data_pair[0]), reverse=True)

    sents, labels = zip(*batch_data)
    sents_len = [len(sent) for sent in sents]
    sents = [torch.LongTensor(sent) for sent in sents]
    padded_sents = pad_sequence(sents, batch_first=True, padding_value=0)

    return torch.LongTensor(padded_sents), torch.LongTensor(labels),  torch.FloatTensor(sents_len)

In [64]:
# 寻找id对应的词向量
def get_wordvec(word2id, vec_file_path, vec_dim=50, embedding_type="glove"):
    print("开始加载词向量")
    word_vectors = torch.nn.init.xavier_uniform_(torch.empty(len(word2id), vec_dim))
    word_vectors[0, :] = 0  # <pad>
    found = 0
    if embedding_type == "glove":
        with open(vec_file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
            for line in lines:
                splited = line.split(" ")
                if splited[0] in word2id:
                    found += 1
                    word_vectors[word2id[splited[0]]] = torch.tensor(list(map(lambda x: float(x), splited[1:])))
                if found == len(word2id) - 1:  # 允许<unk>找不到
                    break
        print("总共 %d个词，其中%d个找到了对应的词向量" % (len(word2id), found))
        return word_vectors.float()
    elif embedding_type == "CBOW":
        for key, idx in word_to_idx.items():
            if key in word2id:
                found += 1
                word_vectors[word2id[key]] = torch.tensor(weights_cbow[idx])
            if found == len(word2id) - 1:  # 允许<unk>找不到
                    break
        print("总共 %d个词，其中%d个找到了对应的词向量" % (len(word2id), found))
        return word_vectors.float()
    else:
        return word_vectors.float()            

In [73]:
def make_dataloader(dataset_path="data", sent_col_name="Phrase1", label_col_name="Sentiment", batch_size=32, vec_file_path="glove.6B.50d.txt", debug=False, embedding_type="CBOW"):
    X, y = prepare_data(dataset_path=dataset_path, sent_col_name=sent_col_name, label_col_name=label_col_name)

    if debug:
        X, y = X[:100], y[:100]

    X_language = Language()
    X_language.fit(X)
    X = X_language.transform(X)

    word_vectors = get_wordvec(X_language.word2id, vec_file_path=vec_file_path, vec_dim=50, embedding_type=embedding_type)


    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    cls_train_dataset, cls_val_dataset = ClsDataset(X_train, y_train), ClsDataset(X_val, y_val)
    cls_train_dataloader = DataLoader(cls_train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    cls_val_dataloader = DataLoader(cls_val_dataset, batch_size=batch_size, collate_fn=collate_fn)

    return cls_train_dataloader, cls_val_dataloader, word_vectors, X_language

# 4 模型训练

In [75]:
model_names = ["LSTM", "RNN", "CNN"]  
learning_rate = 0.001
epoch_num = 500
num_of_class = 5

train_iter, val_iter, word_vectors, X_lang = make_dataloader(batch_size=100, debug=True)

for model_name in model_names[-1:]:
    if model_name == "RNN":
        # 共有3种可以选择的词嵌入类型："CBOW"/"glove"/"random"
        model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50, hidden_size=128, num_of_class=num_of_class, weights=word_vectors, embedding_type="CBOW")
    elif model_name == "CNN":
        model = TextCNN(vocab_size=len(word_vectors), embedding_dim=50, num_of_class=num_of_class, embedding_vectors=word_vectors)
    elif model_name == "LSTM":
        model = TextRNN(vocab_size=len(word_vectors), embedding_dim=50, hidden_size=128, num_of_class=num_of_class, weights=word_vectors, rnn_type="LSTM")
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fun = torch.nn.CrossEntropyLoss()

    for epoch in range(epoch_num):
        model.train()
        for i, batch in enumerate(train_iter):
            x, y, lens = batch
            logits = model(x)
            optimizer.zero_grad()
            loss = loss_fun(logits, y)
            loss.backward()
            optimizer.step()

        model.eval()
        train_accs = []
        for i, batch in enumerate(train_iter):
            x, y, lens = batch
            _, y_pre = torch.max(logits, -1)
            acc = torch.mean((torch.tensor(y_pre == y, dtype=torch.float)))
            train_accs.append(acc)
        train_acc = np.array(train_accs).mean()

        val_accs = []
        for i, batch in enumerate(val_iter):
            x, y, lens = batch
            logits = model(x)
            _, y_pre = torch.max(logits, -1)
            acc = torch.mean((torch.tensor(y_pre == y, dtype=torch.float)))
            val_accs.append(acc)
        val_acc = np.array(val_accs).mean()
        print("epoch %d train acc:%.2f, val acc:%.2f" % (epoch, train_acc, val_acc))
        if train_acc >= 0.99:
            break

开始加载词向量
总共 51个词，其中34个找到了对应的词向量
epoch 0 train acc:0.08, val acc:0.40
epoch 1 train acc:0.38, val acc:0.55
epoch 2 train acc:0.70, val acc:0.70
epoch 3 train acc:0.80, val acc:0.70
epoch 4 train acc:0.84, val acc:0.70
epoch 5 train acc:0.80, val acc:0.70
epoch 6 train acc:0.77, val acc:0.75




epoch 7 train acc:0.77, val acc:0.75
epoch 8 train acc:0.84, val acc:0.75
epoch 9 train acc:0.80, val acc:0.75
epoch 10 train acc:0.77, val acc:0.70
epoch 11 train acc:0.77, val acc:0.70
epoch 12 train acc:0.82, val acc:0.70
epoch 13 train acc:0.76, val acc:0.70
epoch 14 train acc:0.76, val acc:0.70
epoch 15 train acc:0.76, val acc:0.70
epoch 16 train acc:0.82, val acc:0.70
epoch 17 train acc:0.79, val acc:0.70
epoch 18 train acc:0.81, val acc:0.70
epoch 19 train acc:0.75, val acc:0.70
epoch 20 train acc:0.77, val acc:0.75
epoch 21 train acc:0.82, val acc:0.75
epoch 22 train acc:0.80, val acc:0.75
epoch 23 train acc:0.80, val acc:0.75
epoch 24 train acc:0.84, val acc:0.75
epoch 25 train acc:0.79, val acc:0.75
epoch 26 train acc:0.76, val acc:0.75
epoch 27 train acc:0.76, val acc:0.75
epoch 28 train acc:0.79, val acc:0.75
epoch 29 train acc:0.71, val acc:0.75
epoch 30 train acc:0.81, val acc:0.75
epoch 31 train acc:0.79, val acc:0.75
epoch 32 train acc:0.84, val acc:0.75
epoch 33 train 

KeyboardInterrupt: 

# 测试

In [60]:
for key, value in word_to_idx.items():
    print(key, value)

 0
for 1
occasionally 2
some 3
storyoccasionallyamuses 4
also 5
goosedemonstrating 6
adagedemonstratingthe 7
whichsomeof 8
storyfor 9
storytomuch 10
to 11
independent 12
of 13
goosegoodfor 14
what 15
goose 16
storywhich 17
adagetheadagethat 18
goosegooseis 19
seriesaseriesof 20
storyamusesbut 21
amounts 22
worth 23
storystorythis 24
demonstrating 25
goosethatwhat 26
amuses 27
storybutnone 28
is 29
and 30
which 31
gandergandersome 32
that 33
whichwhichoccasionally 34
but 35
story 36
escapades 37
goosea 38
entertaining 39
storya 40
quiet 41
gander 42
much 43
storysome 44
storymuchof 45
storyamounts 46
introspective 47
the 48
adage 49
none 50
seeking 51
series 52
gooseforthe 53
storyamountsto 54
gooseisgood 55
alsoalsogood 56
a 57
storythe 58
gooseescapadesdemonstrating 59
good 60
storynoneof 61
gooseofescapades 62
goosewhatis 63
this 64
storyis 65


(50,)