# Recurrent Neural Networks

本次作业是要让同学接触 NLP 当中一个简单的 task —— 语句分类（文本分类）

给定一个语句，判断他有没有恶意（负面标 1，正面标 0）

若有任何问题，欢迎来信至助教信箱 ntu-ml-2020spring-ta@googlegroups.com

参考资料

http://colah.github.io/posts/2015-08-Understanding-LSTMs/

https://www.cnblogs.com/zyb993963526/p/13784199.html




In [16]:
# from google.colab import drive
# drive.mount('/content/drive')
# path_prefix = 'drive/My Drive/Colab Notebooks/hw4 - Recurrent Neural Network'
path_prefix = './'

### Download Dataset
有三个档案，分别是 training_label.txt、training_nolabel.txt、testing_data.txt

- training_label.txt：有 label 的 training data（句子配上 0 or 1，+++$+++ 只是分隔符，不要理它）
    - e.g., 1 +++$+++ are wtf ... awww thanks !

- training_nolabel.txt：没有 label 的 training data（只有句子），用来做 semi-supervised learning
    - ex: hates being this burnt !! ouch

- testing_data.txt：你要判断 testing data 里面的句子是 0 or 1

    >id,text

    >0,my dog ate our dinner . no , seriously ... he ate it .

    >1,omg last day sooon n of primary noooooo x im gona be swimming out of school wif the amount of tears am gona cry

    >2,stupid boys .. they ' re so .. stupid !

In [17]:
# from google.colab import drive
# drive.mount('/content/drive')
# path_prefix = 'drive/My Drive/Colab Notebooks/hw4 - Recurrent Neural Network'
path_prefix = './'

In [18]:
# !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1dPHIl8ZnfDz_fxNd2ZeBYedTat2lfxcO' -O 'drive/My Drive/Colab Notebooks/hw8-RNN/data/training_label.txt'
# !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1x1rJOX_ETqnOZjdMAbEE2pqIjRNa8xcc' -O 'drive/My Drive/Colab Notebooks/hw8-RNN/data/training_nolabel.txt'
# !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=16CtnQwSDCob9xmm6EdHHR7PNFNiOrQ30' -O 'drive/My Drive/Colab Notebooks/hw8-RNN/data/testing_data.txt'

!gdown --id '1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8' --output data.zip
!unzip data.zip
!ls

Downloading...
From: https://drive.google.com/uc?id=1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8
To: /content/data.zip
45.1MB [00:00, 109MB/s] 
Archive:  data.zip
replace training_label.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: training_label.txt      
  inflating: testing_data.txt        
  inflating: training_nolabel.txt    
ckpt.model  predict.csv  testing_data.txt    training_nolabel.txt
data.zip    sample_data  training_label.txt  w2v_all.model


In [19]:
# this is for filtering the warnings
import warnings
warnings.filterwarnings('ignore')

### Utils

In [20]:
# utils.py
# 这个 block 用来先定义一些等等常用到的函式
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F

def load_training_data(path='training_label.txt'):
    # 把 training 时需要的 data 读进来
    # 如果是 'training_label.txt'，需要读取 label，如果是 'training_nolabel.txt'，不需要读取 label
    # return: x -- list of 每一个句子的所有单词list ; y -- list of label
    if 'training_label' in path:
        with open(path, 'r') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split(' ') for line in lines]
        x = [line[2:] for line in lines]
        y = [line[0] for line in lines]
        return x, y
    else:
        with open(path, 'r') as f:
            lines = f.readlines()
            x = [line.strip('\n').split(' ') for line in lines]
        return x

def load_testing_data(path='testing_data'):
    # 把 testing 时需要的 data 读进来
    # return: x -- list of 每一个句子的所有单词list
    with open(path, 'r') as f:
        lines = f.readlines()
        X = ["".join(line.strip('\n').split(",")[1:]).strip() for line in lines[1:]]
        X = [sen.split(' ') for sen in X]
    return X

def evaluation(outputs, labels):
    # outputs => probability (float)
    # labels => labels 
    # return correct number
    outputs[outputs>=0.5] = 1 # 大于等于 0.5 为正面
    outputs[outputs<0.5] = 0 # 小于 0.5 为负面
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct

### Train Word to Vector

In [21]:
# w2v.py
# 这个 block 是用来训练 word to vector 的 word embedding
# 注意！这个 block 在训练 word to vector 时是用 cpu，可能要花到 10 分钟以上
import os
import numpy as np
import pandas as pd
import argparse
from gensim.models import word2vec

def train_word2vec(x):
    # 训练 word to vector 的 word embedding
    # Dimensionality of the word vectors = 250
    # return model
    model = word2vec.Word2Vec(x, size=250, window=5, min_count=5, workers=12, iter=10, sg=1)
    return model

if __name__ == "__main__":
    print("loading training data ...")
    train_x, y = load_training_data('training_label.txt')
    train_x_no_label = load_training_data('training_nolabel.txt')

    print("loading testing data ...")
    test_x = load_testing_data('testing_data.txt')

    #model = train_word2vec(train_x + train_x_no_label + test_x)
    model = train_word2vec(train_x + test_x)
    
    print("saving model ...")
    # model.save(os.path.join(path_prefix, 'model/w2v_all.model'))
    model.save(os.path.join(path_prefix, 'w2v_all.model'))

loading training data ...
loading testing data ...
saving model ...


### Data Preprocess

In [24]:
# preprocess.py
# 这个 block 用来做 data 的预处理
from torch import nn
from gensim.models import Word2Vec

class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        self.w2v_path = w2v_path
        self.sentences = sentences  # 单词二维数组
        self.sen_len = sen_len
        self.idx2word = []  # [word, word...]
        self.word2idx = {}  # {word: id}
        self.embedding_matrix = [] # [ vector , vector , ...]
    def get_w2v_model(self):
        # 把之前训练好的 word to vec 模型读进来
        self.embedding = Word2Vec.load(self.w2v_path) # 获取某个单词的词向量 embedding['word']
        self.embedding_dim = self.embedding.vector_size
    def add_embedding(self, word):
        # 把 word 加进 embedding，并赋予他一个随机生成的 representation vector
        # word 只会是 "<PAD>" 或 "<UNK>"
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], dim=0)
    def make_embedding(self, load=True):
        print("Get embedding ...")
        # 取得训练好的 Word2vec word embedding
        if load:
            print("loading word to vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError
        # 制作一个 word2idx 的 dictionary
        # 制作一个 idx2word 的 list
        # 制作一个 word2vector 的 list
        for i, word in enumerate(self.embedding.wv.vocab):
            print('get words #{}'.format(i+1), end='\r')
            #e.g. self.word2index['he'] = 1 
            #e.g. self.index2word[1] = 'he'
            #e.g. self.vectors[1] = 'he' vector
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding[word])
        print('')
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        # 将 "<PAD>" 跟 "<UNK>" 加进 embedding 里面
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        print("embedding_matrix：", self.embedding_matrix.shape)
        return self.embedding_matrix
        # 返回所有单词word2vector 的 list
    def pad_sequence(self, sentence):
        # 将每个句子变成一样的长度
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence
    def sentence_word2idx(self):
        # return 二维张量 把句子里面的字转成相对应的 index
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            sentence_idx = self.pad_sequence(sentence_idx) # 将每个句子变成一样的长度
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    def labels_to_tensor(self, y):
        # 把 labels 转成 tensor
        y = [int(label) for label in y]
        return torch.LongTensor(y)


### Dataset

In [25]:
# data.py
# 实作 dataset 所需要的 '__init__', '__getitem__', '__len__'
# 好让 dataloader 能使用
import torch
from torch.utils import data

class TwitterDataset(data.Dataset):
    """
    Expected data shape like:(data_num, data_len)
    Data can be a list of numpy array or a list of lists
    input data shape : (data_num, seq_len, feature_dim)
    
    __len__ will return the number of data
    """
    def __init__(self, X, y):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)

### Model

In [26]:
# model.py
# 这个 block 是要拿来训练的模型
import torch
from torch import nn
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # 制作 embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding) # 使用之前训练的embedding weight
        # 是否将 embedding fix 住，如果 fix_embedding 为 False，在训练过程中，embedding 也会跟着被训练
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        # LSTM输入格式 注意batch_fist 参见下网站
        # https://pytorch.org/tutorials/beginner/chatbot_tutorial.html#prepare-data-for-models
        self.classifier = nn.Sequential( nn.Dropout(dropout),
                                         nn.Linear(hidden_dim, 1),
                                         nn.Sigmoid() )
    def forward(self, inputs):
        #input dimension： torch.Size([128, 20])
        inputs = self.embedding(inputs)
        #inputs dimension： torch.Size([128, 20, 250])
        x, _ = self.lstm(inputs, None)
        # x 的 dimension (batch, seq_len, hidden_size)
        #x dimension： torch.Size([128, 20, 150])
        # 取用 LSTM 最后一层的 hidden state
        # batch, seq, feature
        # 最后一个序列最后一层输出的output就是最后一层的 hidden state
        x = x[:, -1, :] 
        x = self.classifier(x)
        return x

### Train

In [27]:
# train.py
# 这个 block 是用来训练模型的
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    total = sum(p.numel() for p in model.parameters())  # 返回数组中元素的个数
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    model.train() # 将 model 的模式设为 train，这样 optimizer 就可以更新 model 的参数

    criterion = nn.BCELoss() # 定义损失函数，这里我们使用 binary cross entropy loss
    t_batch = len(train) 
    v_batch = len(valid) 
    optimizer = optim.Adam(model.parameters(), lr=lr) # 将模型的参数给 optimizer，并给予适当的 learning rate
    total_loss, total_acc, best_acc = 0, 0, 0
    for epoch in range(n_epoch):
        total_loss, total_acc = 0, 0
        # 这段做 training
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long) 
            # device 为 "cuda"，将 inputs 转成 torch.cuda.LongTensor
            labels = labels.to(device, dtype=torch.float) 
            # device为 "cuda"，将 labels 转成 torch.cuda.FloatTensor，因为等等要喂进 criterion，所以型态要是 float
            optimizer.zero_grad() 
            # 由于 loss.backward() 的 gradient 会累加，所以每次喂完一个 batch 后需要归零
            outputs = model(inputs) # 将 input 喂给模型
            outputs = outputs.squeeze() 
            # squeeze（）函数可以删除数组形状中的单维度条目，即把shape中为1的维度去掉，但是对非单维的维度不起作用。
            # 去掉最外面的 dimension，好让 outputs 可以喂进 criterion()
            loss = criterion(outputs, labels) # 计算此时模型的 training loss
            loss.backward() # 算 loss 的 gradient
            optimizer.step() # 更新训练模型的参数
            correct = evaluation(outputs, labels) # 计算此时模型的 training accuracy
            total_acc += (correct / batch_size)
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
            	epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))

        # 这段做 validation
        model.eval() # 将 model 的模式设为 eval，这样 model 的参数就会固定住
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long) 
                # device 为 "cuda"，将 inputs 转成 torch.cuda.LongTensor
                labels = labels.to(device, dtype=torch.float) 
                # device 为 "cuda"，将 labels 转成 torch.cuda.FloatTensor，因为等等要喂进 criterion，所以型态要是 float
                outputs = model(inputs) # 将 input 喂给模型
                outputs = outputs.squeeze() # 去掉最外面的 dimension，好让 outputs 可以喂进 criterion()
                loss = criterion(outputs, labels) # 计算此时模型的 validation loss
                correct = evaluation(outputs, labels) # 计算此时模型的 validation accuracy
                total_acc += (correct / batch_size)
                total_loss += loss.item()

            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            if total_acc > best_acc:
                # 如果 validation 的结果优于之前所有的结果，就把当下的模型存下来以备之后做预测时使用
                best_acc = total_acc
                #torch.save(model, "{}/val_acc_{:.3f}.model".format(model_dir,total_acc/v_batch*100))
                torch.save(model, "{}/ckpt.model".format(model_dir))
                print('saving model with acc {:.3f}'.format(total_acc/v_batch*100))
        print('-----------------------------------------------')
        model.train() # 将 model 的模式设为 train，这样 optimizer 就可以更新 model 的参数（因为刚刚转成 eval 模式）

### Test

In [28]:
# test.py
# 这个 block 用来对 testing_data.txt 做预测
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1 # 大于等于 0.5 为正面
            outputs[outputs<0.5] = 0 # 小于 0.5 为负面
            # outputs是Tensor并且为float，转化
            ret_output += outputs.int().tolist()
    return ret_output

### Main

In [29]:
# main.py
import os
import torch
import argparse
import numpy as np
from torch import nn
from gensim.models import word2vec
from sklearn.model_selection import train_test_split

# 通过 torch.cuda.is_available() 的回传值进行判断是否有使用 GPU 的环境，如果有的话 device 就设为 "cuda"，没有的话就设为 "cpu"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 处理好各个 data 的路径
train_with_label = os.path.join(path_prefix, 'training_label.txt')
train_no_label = os.path.join(path_prefix, 'training_nolabel.txt')
testing_data = os.path.join(path_prefix, 'testing_data.txt')

w2v_path = os.path.join(path_prefix, 'w2v_all.model') # 处理 word to vec model 的路径

# 定义句子长度、要不要固定 embedding、batch 大小、要训练几个 epoch、learning rate 的值、model 的文件夹路径
sen_len = 20
fix_embedding = True # fix embedding during training
batch_size = 128
epoch = 5
lr = 0.001
# model_dir = os.path.join(path_prefix, 'model/') # model directory for checkpoint model
model_dir = path_prefix # model directory for checkpoint model

print("loading data ...") # 把 'training_label.txt' 跟 'training_nolabel.txt' 读进来
train_x, y = load_training_data(train_with_label)
train_x_no_label = load_training_data(train_no_label)

# 对 input 跟 labels 做预处理
preprocess = Preprocess(train_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)

# 制作一个 model 的对象
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
model = model.to(device) # device为 "cuda"，model 使用 GPU 来训练（喂进去的 inputs 也需要是 cuda tensor）

# 把 data 分为 training data 跟 validation data（将一部份 training data 拿去当作 validation data）
X_train, X_val, y_train, y_val = train_x[:180000], train_x[180000:], y[:180000], y[180000:]
print("X_train.shape: ",X_train.shape)

# 把 data 做成 dataset 供 dataloader 取用
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# 把 data 转成 batch of tensors
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True,
                                            num_workers = 8)

val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 8)

# 开始训练
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)

loading data ...
Get embedding ...
loading word to vec model ...
get words #24694
total words: 24696
embedding_matrix： torch.Size([24696, 250])
torch.Size([180000, 20])

start training, parameter total:6415351, trainable:241351


Train | Loss:0.49792 Acc: 75.013
Valid | Loss:0.46203 Acc: 78.100 
saving model with acc 78.100
-----------------------------------------------

Train | Loss:0.44426 Acc: 79.047
Valid | Loss:0.44646 Acc: 78.717 
saving model with acc 78.717
-----------------------------------------------

Train | Loss:0.42734 Acc: 80.114
Valid | Loss:0.43530 Acc: 79.563 
saving model with acc 79.563
-----------------------------------------------

Train | Loss:0.41528 Acc: 80.802
Valid | Loss:0.43161 Acc: 79.518 
-----------------------------------------------

Train | Loss:0.40362 Acc: 81.491
Valid | Loss:0.42182 Acc: 80.001 
saving model with acc 80.001
-----------------------------------------------


### Predict and Write to csv file

In [30]:
# 开始测试模型并做预测
print("loading testing data ...")
test_x = load_testing_data(testing_data)
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 8)
print('\nload model ...')
model = torch.load(os.path.join(model_dir, 'ckpt.model'))
outputs = testing(batch_size, test_loader, model, device)

# 写到 csv 档案供上传 Kaggle
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("save csv ...")
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print("Finish Predicting")

# 以下是使用 command line 上传到 Kaggle 的方式
# 需要先 pip install kaggle、Create API Token，详细请看 https://github.com/Kaggle/kaggle-api 以及 https://www.kaggle.com/code1110/how-to-submit-from-google-colab
# kaggle competitions submit [competition-name] -f [csv file path]] -m [message]
# e.g., kaggle competitions submit ml-2020spring-hw4 -f output/predict.csv -m "......"

loading testing data ...
Get embedding ...
loading word to vec model ...
get words #24694
total words: 24696
embedding_matrix： torch.Size([24696, 250])
sentence count #200000
load model ...
save csv ...
Finish Predicting


#### Check where the files are

In [31]:
!pwd
!ls

/content
ckpt.model  predict.csv  testing_data.txt    training_nolabel.txt
data.zip    sample_data  training_label.txt  w2v_all.model


#### Download the files to your computer

In [32]:
from google.colab import files
files.download('predict.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>