# Define model

In [1]:
import numpy as np

In [2]:
norm_reviews = np.load('normalized_reviews.npy')
norm_sentiment = np.load('normalized_sentiment.npy')

In [3]:
print(norm_sentiment.shape,norm_reviews.shape)

(50000,) (50000, 256)


In [4]:
print(norm_reviews[0])
print(norm_sentiment[0])

[     2     52      7      4     72  17965     35   3046     16     53
   2645    124    180  12965   1946     85    773     34  12769      6
     43     36    252      5     23     41     18   2800    106   1583
     21      1      1      1     62    877     16   1873    289     63
  12965     19     51  13159      9  45659   3472      7    718      5
     46    212     10    252     29      4   1392    246      6   1857
    289      5     41     18     40     11    277     14      4  17177
  21364     50  25444      6     41    277  11114     88  14775     21
   9485      8   1607      5   1743     50    718      6     51     18
  16361      5     10      4   2396    238      7      4      1      1
      1     18    179  12965     23     16     18      4   7402    458
      8      4  17868   2968    198     96      1      6     24   6549
   1678     17  19724    119      5     33   6126   1523      7      4
   1001    115     68      4   2778     37   2851  11278      9    625
  6737

In [5]:
'''
python 3.7
bcolz              1.2.1
numpy              1.21.5
pytorch  1.11.0

'''
import os
import bcolz
import numpy as np
import pickle
import torch

def pretrained_word_embeddings(embed_path:str, over_writte:bool, special_tk:bool=True, freeze:bool=True):
    ''' return a torch.nn.Embedding layer, utilizing the pre-trained word vector (e.g., Glove), add 'bos', 'eos', 'unk' and 'pad'.

    :param embed_path: the path where pre-trained matrix cached (e.g., './glove.6B.300d.txt').
    :param over_writte: force to rewritte the existing matrix.
    :param special_tk: whether adding special token -- 'pad', 'unk', bos' and 'eos', at position 0, 1, 2 and 3 by default.
    :param freeze: whether trainable.
    :return: embed -> nn.Embedding, weights_matrix -> np.array, word2idx -> function, idx2word -> function, embed_dim -> int
    '''
    root_dir = embed_path.rsplit(".",1)[0]+".dat"
    out_dir_word = embed_path.rsplit(".",1)[0]+"_words.pkl"
    out_dir_idx = embed_path.rsplit(".",1)[0]+"_idx.pkl"
    out_dir_idx2word = embed_path.rsplit(".", 1)[0] + "_idx2word.pkl"
    if not all([os.path.exists(root_dir),os.path.exists(out_dir_word),os.path.exists(out_dir_idx)]) or over_writte:
        ## process and cache glove ===========================================
        words = []
        idx = 0
        _word2idx = {}
        _idx2word = {}
        vectors = bcolz.carray(np.zeros(1), rootdir=root_dir, mode='w')
        with open(os.path.join(embed_path),"rb") as f:
            for l in f:
                line = l.decode().split()
                word = line[0]
                words.append(word)
                _word2idx[word] = idx
                _idx2word[idx]=word
                idx += 1
                vect = np.array(line[1:]).astype(float)
                vectors.append(vect)
        vectors = bcolz.carray(vectors[1:].reshape((idx, vect.shape[0])), rootdir=root_dir, mode='w')
        vectors.flush()
        pickle.dump(words, open(out_dir_word, 'wb'))
        pickle.dump(_word2idx, open(out_dir_idx, 'wb'))
        pickle.dump(_idx2word,open(out_dir_idx2word,'wb'))
        print("dump word/idx at {}".format(embed_path.rsplit("/",1)[0]))
        ## =======================================================
    ## load glove
    vectors = bcolz.open(root_dir)[:]
    words = pickle.load(open(embed_path.rsplit(".",1)[0]+"_words.pkl", 'rb'))
    _word2idx = pickle.load(open(embed_path.rsplit(".",1)[0]+"_idx.pkl", 'rb'))
    _idx2word=pickle.load(open(embed_path.rsplit(".", 1)[0] + "_idx2word.pkl",'rb'))
    print("Successfully load Golve from {}, the shape of cached matrix: {}".format(embed_path.rsplit("/",1)[0],vectors.shape))

    word_num, embed_dim = vectors.shape
    word_num += 4  if special_tk else 0  ## e.g., 400004
    embedding_matrix = np.zeros((word_num, embed_dim))
    if special_tk:
        embedding_matrix[1] = np.random.normal(scale=0.6, size=(embed_dim, ))
        embedding_matrix[2] = np.random.normal(scale=0.6, size=(embed_dim,))
        embedding_matrix[3] = np.random.normal(scale=0.6, size=(embed_dim,))
        embedding_matrix[4:,:] = vectors
        weights_matrix_tensor = torch.FloatTensor(embedding_matrix)
        pad_idx,unk_idx, bos_idx,eos_idx = 0,1,2,3
        embed_layer= torch.nn.Embedding.from_pretrained(weights_matrix_tensor,freeze=freeze,padding_idx=pad_idx)
        _word2idx = dict([(k,v+4) for k,v in _word2idx.items()])
        _idx2word = dict([(k+4,v) for k,v in _idx2word.items()])
        assert len(_word2idx) + 4 == embedding_matrix.shape[0]
    else:
        embedding_matrix[:,:] = vectors
        weights_matrix_tensor = torch.FloatTensor(embedding_matrix)
        embed_layer = torch.nn.Embedding.from_pretrained(weights_matrix_tensor,freeze=freeze)
        assert len(_word2idx) == embedding_matrix.shape[0]

    def word2idx(word:str):
        if word == '<pad>': return 0
        elif word == '<bos>': return 2
        elif word == '<eos>': return 3
        return _word2idx.get(word,1)
    def idx2word(idx:int):
        if idx == 0: return '<pad>'
        elif idx == 1: return '<unk>'
        elif idx == 2: return '<bos>'
        elif idx == 3: return '<eos>'
        return _idx2word.get(idx,'')
    return embed_layer, embedding_matrix, word2idx,idx2word, embed_dim

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
embed_layer, embedding_matrix, word2idx,idx2word, embed_dim = pretrained_word_embeddings('../../word embeddings/glove.6B.300d.txt',over_writte=True,special_tk=True,freeze=True)

dump word/idx at ../../word embeddings
Successfully load Golve from ../../word embeddings, the shape of cached matrix: (400000, 300)


In [7]:
print(embed_layer.weight.requires_grad)

False


In [8]:
import torch
import torch.nn as nn

class MyModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, n_layers,
                 bidirectional, dropout):
        super(MyModel, self).__init__()
        self.embedding = embed_layer
        self.rnn =  nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,batch_first=True)
        D = 2 if bidirectional else 1
        self.fc3 = nn.Linear(D*n_layers*hidden_dim,1)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        x = self.embedding(x)                     # 32,256,50
        _,(hns,cells) = self.rnn(x)               # D*num_layer,batch size,hidden dim
        x = hns.view(hns.size()[1],-1)            # reshape to 32,-1
        # x = self.dropout(x)
        outprob = torch.sigmoid(self.fc3(x))      # batch_size, 1
        return outprob.view(outprob.size()[0])    # reshape to 32

In [9]:
# 创建网络模型
myModel = MyModel(300,32,2,True,0.8)

# 损失函数
loss_fn = nn.BCELoss()

# 优化器
optimizer = torch.optim.Adam(myModel.parameters(),0.0001)

# 训练的轮数
epoch = 100

In [10]:
from torchinfo import summary
summary(myModel)

Layer (type:depth-idx)                   Param #
MyModel                                  --
├─Embedding: 1-1                         (120,001,200)
├─LSTM: 1-2                              110,592
├─Linear: 1-3                            129
├─Dropout: 1-4                           --
Total params: 120,111,921
Trainable params: 110,721
Non-trainable params: 120,001,200

In [11]:
print(myModel.embedding.weight.requires_grad)

False


In [12]:
def correct_num(vec1,vec2):
    result = (torch.abs(vec1-vec2)) <0.5
    return torch.sum(result).item()

# Dataset

In [13]:
import torch.utils.data as data

class myDataset(data.Dataset):
    def __init__(self):
        super(myDataset,self).__init__()
        self.review_list = norm_reviews
        self.label_list = norm_sentiment
    def __getitem__(self,idx):
        return  self.review_list[idx],self.label_list[idx]
    def __len__(self):
        return len(self.review_list)

In [14]:
dataset=myDataset()
dataset[2]

(array([     2,     45,    808,     41,     19,     11,   5209,    183,
             8,   2029,     83,     17,     11,    321,   1631,    744,
          1179,      5,   2999,     10,      4,    329,  17113,   2252,
             9,   2645,     11,    901,     15,  21364,   2845,      6,
             4,   2223,     18,  26888,      5,     38,      4,   2473,
            18,  18222,      9,      4,   2157,     36,  26081,     27,
           155,      4,    147,   5876,   1606,   7270,   4902,     28,
             6,    114,     81,    111,     34,   4258,     65,     43,
          4227,     41,     18,     40,    554,    393,    236,     49,
          1122,  10437,      5,     45,    808,     24,     19,   4299,
            16,  10971,   3192,     18,    153,   1861,     10,    428,
             7,      4,   1139,    113,      7,     99,     37,   2850,
             8,      1,      1,      1,     19,      4,    100,     45,
          1155,  11391,     26,     52,      7,  10971,     13, 

In [15]:
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

In [16]:
def myfunc(batch_data):
    '''
    batch_data: 32x2
    '''
    resData = []
    resLabel = []
    for i in batch_data:
        resData.append(i[0])
        resLabel.append(i[1])
    return torch.tensor(np.array(resData),dtype=torch.int),torch.tensor(np.array(resLabel),dtype=torch.float)

In [17]:
trainloader = data.DataLoader(train_dataset, batch_size=32,shuffle=True,collate_fn=myfunc,drop_last=True)
testloader = data.DataLoader(test_dataset, batch_size=32,shuffle=True,collate_fn=myfunc,drop_last=True)

# Train

In [None]:
train_step_loss = []
valid_step_loss = []
train_epoch_loss = []
valid_epoch_loss = []

for i in range(epoch):
    print("-------第 {} 轮训练开始-------".format(i+1))

    # 训练步骤开始
    myModel.train()
    step = 0
    for data in trainloader:
        trData, labels = data
        outputs = myModel(trData) # 求模型的输出
        optimizer.zero_grad() # 梯度清零
        loss = loss_fn(outputs, labels)  # 求loss
        train_step_loss.append(loss.item())
        step += 1

        if (step%100 ==0):
            print(f'第{i+1}轮第{step}训练step时的loss: {loss.item()}')


        # 优化器优化模型
        loss.backward()       # 求梯度
        optimizer.step()      # 更新参数
        
    train_epoch_loss.append(np.average(train_step_loss))

    # 测试步骤开始
    myModel.eval()
    total_accuracy = 0        # 每一轮总的精确度
    with torch.no_grad():     # 不求梯度，不更新参数
        for data in testloader:
            teData, teLabels = data
            outputs = myModel(teData)
            loss = loss_fn(outputs, teLabels)
            valid_step_loss.append(loss.item())
            total_accuracy = total_accuracy + correct_num(teLabels,outputs)

    valid_epoch_loss.append(np.average(valid_step_loss))
    print(f"第{i+1}轮整体测试集上的Loss: {valid_epoch_loss[-1]}")
    print(f"第{i+1}轮整体测试集上的Accuracy: {total_accuracy/len(test_dataset)}")

-------第 1 轮训练开始-------
第1轮第100训练step时的loss: 0.683272659778595
第1轮第200训练step时的loss: 0.6978166103363037
第1轮第300训练step时的loss: 0.6883013844490051
第1轮第400训练step时的loss: 0.687104344367981
第1轮第500训练step时的loss: 0.6872978806495667
第1轮第600训练step时的loss: 0.6909545063972473
第1轮第700训练step时的loss: 0.6953219175338745
第1轮第800训练step时的loss: 0.6951210498809814
第1轮第900训练step时的loss: 0.6988334059715271
第1轮第1000训练step时的loss: 0.7011764645576477
第1轮第1100训练step时的loss: 0.6953855752944946
第1轮第1200训练step时的loss: 0.6904981136322021
第1轮整体测试集上的Loss: 0.6932539508128778
第1轮整体测试集上的Accuracy: 0.4979
-------第 2 轮训练开始-------
第2轮第100训练step时的loss: 0.6924232244491577
第2轮第200训练step时的loss: 0.7010735273361206
第2轮第300训练step时的loss: 0.6920527815818787
第2轮第400训练step时的loss: 0.6978859305381775
第2轮第500训练step时的loss: 0.6886424422264099
第2轮第600训练step时的loss: 0.6947436332702637
第2轮第700训练step时的loss: 0.6953805088996887
第2轮第800训练step时的loss: 0.6873571872711182
第2轮第900训练step时的loss: 0.6947407722473145
第2轮第1000训练step时的loss: 0.6919124722480774
第2轮第1100