# Deep Continuous Bag of Words (Deep CBOW) Text Classifier

The code below implements a deep continuous bag of words (Deep CBOW) text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- We create embeddings for inputs and sum them together
- The resulting vector is fed to a hidden neural network, which generates a new vector that is multiplied by a weight matrix
- We then add the bias and obtain scores
- A softmax is applied to the scores to generate probabilities, which are used for the final classification

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](https://github.com/dair-ai/ML-Notebooks/blob/main/img/deep_cbow.png?raw=true)

In [1]:
import torch #导包
import random
import torch.nn as nn

In [None]:
''' uncomment to download the data
%%capture

# download the files
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

# create the data folders
!mkdir data data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes
'''

## Read and Process the Data

In [2]:
# function to read in data, process each line and split columns by " ||| "
def read_data(filename):
    """Read a ``tag ||| sentence`` file into a list of column lists.

    Every non-blank line is lower-cased, stripped of leading/trailing
    whitespace and split on the literal separator ``' ||| '``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        columns produced by splitting that line.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # A blank line would produce a row without a sentence column.
                continue
            data.append(line.split(' ||| '))
    return data

# Load the raw rows of the training and test splits (in that order).
train_data, test_data = (
    read_data(path)
    for path in ('data/classes/train.txt', 'data/classes/test.txt')
)

# creating the word and tag indices
# The unknown-word token <unk> is registered first, so it gets index 0.
word_to_index = {"<unk>": 0}
# Tag (label) indices start out empty and are filled in by create_dict.
tag_to_index = dict()

# create word to index dictionary and tag to index dictionary from data
def create_dict(data, check_unk=False):
    """Populate the module-level ``word_to_index`` and ``tag_to_index`` maps.

    Args:
        data: Iterable of rows where ``row[0]`` is the tag and ``row[1]`` is
            the sentence; the sentence is split on single spaces.
        check_unk: When false, every unseen word is assigned the next free
            word index. When true, every unseen word is recorded as an alias
            of the ``"<unk>"`` index instead of getting an index of its own.

    Unseen tags always receive the next free tag index, regardless of
    ``check_unk``. Both dictionaries are modified in place; nothing is
    returned.
    """
    for line in data:
        for word in line[1].split(" "):
            if word in word_to_index:
                continue  # already indexed (or already aliased to <unk>)
            if check_unk:
                word_to_index[word] = word_to_index["<unk>"]
            else:
                word_to_index[word] = len(word_to_index)

        if line[0] not in tag_to_index:
            tag_to_index[line[0]] = len(tag_to_index)

create_dict(train_data) # build the word and tag indices from the training data
create_dict(test_data, check_unk=True) # words seen only in the test data are mapped to the <unk> index

# encode each row as (list of word indices, tag index); despite the name,
# plain Python lists and ints are yielded, not torch tensors
def create_tensor(data):
    """Yield ``(word_indices, tag_index)`` for every row of *data*.

    Args:
        data: Iterable of rows where ``row[0]`` is the tag and ``row[1]`` is
            the sentence; the sentence is split on single spaces.

    Yields:
        A tuple of the list of word indices looked up in the module-level
        ``word_to_index`` and the tag index looked up in ``tag_to_index``.
        A word that is missing from ``word_to_index`` is encoded with the
        ``"<unk>"`` index instead of raising ``KeyError``.

    Raises:
        KeyError: If a tag is missing from ``tag_to_index``, or if a word is
            missing and ``word_to_index`` has no ``"<unk>"`` entry.
    """
    for line in data:
        word_indices = [
            word_to_index[word] if word in word_to_index else word_to_index["<unk>"]
            for word in line[1].split(" ")
        ]
        yield word_indices, tag_to_index[line[0]]

# Encode both splits as lists of (word indices, tag index) pairs.
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))

# The embedding table needs one row per distinct word index. Using
# len(word_to_index) would over-count, because words seen only in the test
# data are stored as extra keys that all alias the <unk> index.
number_of_words = max(word_to_index.values()) + 1
number_of_tags = len(tag_to_index) # number of distinct tags

## Model

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu" # run on the GPU when CUDA is available, otherwise on the CPU

# create a simple neural network with embedding layer, bias, and xavier initialization
class DeepCBoW(nn.Module):
    """Deep continuous bag-of-words classifier.

    The embeddings of the input word indices are summed, passed through
    ``num_layers`` tanh-activated linear layers and projected to one score
    per tag.

    Args:
        nwords: Number of rows of the embedding table (vocabulary size).
        ntags: Number of output scores (tags).
        hidden_size: Output width of every hidden linear layer.
        num_layers: Number of hidden linear layers; ``0`` feeds the summed
            embedding straight into the output layer.
        emb_size: Dimension of each word embedding.
    """

    def __init__(self, nwords, ntags, hidden_size, num_layers, emb_size):
        super().__init__()

        self.num_layers = num_layers

        # layers: the first hidden layer consumes the embedding, the rest
        # consume the previous hidden layer's output
        self.embedding = nn.Embedding(nwords, emb_size)
        self.linears = nn.ModuleList([
            nn.Linear(emb_size if i == 0 else hidden_size, hidden_size)
            for i in range(num_layers)
        ])

        # use xavier initialization for weights
        nn.init.xavier_uniform_(self.embedding.weight)
        for linear in self.linears:
            nn.init.xavier_uniform_(linear.weight)

        # output layer; without hidden layers its input is the summed
        # embedding, so it must be emb_size wide rather than hidden_size
        out_in_features = hidden_size if num_layers > 0 else emb_size
        self.output_layer = nn.Linear(out_in_features, ntags)

    def forward(self, x):
        """Score one sentence.

        Args:
            x: Tensor of word indices for a single sentence (the shape
                comments below assume it is 1-D).

        Returns:
            Tensor of shape ``(1, ntags)`` holding the unnormalised scores.
        """
        emb = self.embedding(x) # seq x emb_size
        emb_sum = torch.sum(emb, dim=0) # emb_size
        h = emb_sum.view(1, -1) # reshape to (1, emb_size)
        for linear in self.linears:
            # linear map followed by tanh; the result feeds the next layer
            h = torch.tanh(linear(h))
        out = self.output_layer(h) # 1 x ntags
        return out

HIDDEN_SIZE = 64 # width of each hidden layer
NUM_LAYERS = 2 # hidden layers
EMB_SIZE = 64 # dimension of the word embeddings

# Build the model and move it to `device` once, before the optimizer is
# created from its parameters; no second move is needed afterwards.
model = DeepCBoW(number_of_words, number_of_tags, HIDDEN_SIZE, NUM_LAYERS, EMB_SIZE).to(device)
criterion = nn.CrossEntropyLoss() # cross-entropy loss over the tag scores
optimizer = torch.optim.Adam(model.parameters()) # Adam with default settings

# Integer tensor type matching where the model lives.
# NOTE: `type` shadows the Python builtin; the name is kept because code
# later in the file may pass it to Tensor.type().
type = torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor

## Model Training

In [4]:
# perform training of the Deep CBoW model

NUM_EPOCHS = 10 # number of passes over the training data

for epoch in range(NUM_EPOCHS):
    # perform training
    model.train() # switch to training mode
    random.shuffle(train_data) # visit the sentences in a new random order each epoch
    total_loss = 0.0 # summed per-sentence loss
    train_correct = 0 # number of correctly predicted training sentences
    for sentence, tag in train_data:
        # Build integer tensors directly on the model's device.
        sentence = torch.tensor(sentence, dtype=torch.long, device=device)
        target = torch.tensor([tag], dtype=torch.long, device=device)
        output = model(sentence)
        # Prediction is read before the update, from the same forward pass.
        predicted = torch.argmax(output.detach()).item()

        loss = criterion(output, target)
        total_loss += loss.item()

        optimizer.zero_grad() # clear gradients left over from the previous step
        loss.backward() # gradients of the loss w.r.t. the model parameters
        optimizer.step() # update the parameters

        # Compare plain ints (tag is still the Python int from train_data).
        if predicted == tag:
            train_correct += 1

    # perform testing of the model
    model.eval() # switch to evaluation mode
    test_correct = 0 # number of correctly predicted test sentences
    # No gradients are needed for evaluation, so skip building the graph.
    with torch.no_grad():
        for sentence, tag in test_data:
            sentence = torch.tensor(sentence, dtype=torch.long, device=device)
            output = model(sentence)
            predicted = torch.argmax(output).item()
            if predicted == tag:
                test_correct += 1

    # print model performance results:
    # mean training loss per sentence, training accuracy, test accuracy
    log = f'epoch: {epoch+1} | ' \
        f'train loss/sent: {total_loss/len(train_data):.4f} | ' \
        f'train accuracy: {train_correct/len(train_data):.4f} | ' \
        f'test accuracy: {test_correct/len(test_data):.4f}'
    print(log)

epoch: 1 | train loss/sent: 1.4293 | train accuracy: 0.3765 | test accuracy: 0.3941
epoch: 2 | train loss/sent: 1.0343 | train accuracy: 0.5729 | test accuracy: 0.4127
epoch: 3 | train loss/sent: 0.6565 | train accuracy: 0.7583 | test accuracy: 0.3801
epoch: 4 | train loss/sent: 0.4013 | train accuracy: 0.8586 | test accuracy: 0.3783
epoch: 5 | train loss/sent: 0.2659 | train accuracy: 0.9079 | test accuracy: 0.3959
epoch: 6 | train loss/sent: 0.1747 | train accuracy: 0.9419 | test accuracy: 0.3787
epoch: 7 | train loss/sent: 0.1257 | train accuracy: 0.9573 | test accuracy: 0.3805
epoch: 8 | train loss/sent: 0.0860 | train accuracy: 0.9702 | test accuracy: 0.3719
epoch: 9 | train loss/sent: 0.0652 | train accuracy: 0.9768 | test accuracy: 0.3747
epoch: 10 | train loss/sent: 0.0434 | train accuracy: 0.9860 | test accuracy: 0.3887


Bad pipe message: %s [b'I7{\xddYY9\x10\xe5', b"\xee\x8a\xf0\xff\xe6\x1a\xd2\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x00", b'\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00']
Bad pipe message: %s [b'\xe1\x05', b'\xb0\x87g\xc6U\xd5G\xa2.\xd2\xf7\x05\x9fL\x00\x00\xa6\xc0,\xc0', b'\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V']
Bad pipe message: %s [b"\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x009\x008\x00\x88\x00\x87\xc0\t\xc0\x13\x003\x002\x00\x9a\x00\x99