In [1]:
import pandas as pd
import numpy as np
import torch
import time
import random
import os

## 查看数据集分布

In [2]:
# Preview the first two rows of the training CSV.
pd.read_csv("data/train_one_label.csv").head(2)

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0


In [3]:
# Preview the first two rows of the test CSV.
pd.read_csv("data/test.csv").head(2)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...


## 使用torchtext构建数据集

In [4]:
from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init
from tqdm import tqdm

In [5]:
def tokenize(x):
    """Split raw text into tokens on runs of whitespace."""
    return x.split()


# Comment text: tokenised with `tokenize`, lower-cased, fixed to a length of 200 tokens.
TEXT = data.Field(tokenize=tokenize, sequential=True, lower=True, fix_length=200)
# Label: a non-sequential value used as-is, without a vocabulary.
LABEL = data.Field(use_vocab=False, sequential=False)

    # 若不想自定义继承自Dataset的类MyDataset，也可直接使用torchtext.data.Dataset来构建数据集
    # 完整示例如下
    def get_dataset(csv_data, text_field, label_field, test=False):
        """Convert a table of comments into torchtext examples and their field spec.

        Args:
            csv_data: table indexed by column name; must provide 'comment_text'
                and, unless ``test`` is true, 'toxic'.
            text_field: field applied to the 'comment_text' column.
            label_field: field applied to the 'toxic' column.
            test: when true, no labels are read and every label slot is None.

        Returns:
            A ``(examples, fields)`` tuple.
        """
        # The id column is not needed, so its field is None.
        fields = [("id", None), ("comment_text", text_field), ("toxic", label_field)]

        if test:
            # Test set: do not load labels.
            examples = [
                data.Example.fromlist([None, comment, None], fields)
                for comment in tqdm(csv_data['comment_text'])
            ]
        else:
            pairs = zip(csv_data['comment_text'], csv_data['toxic'])
            examples = [
                data.Example.fromlist([None, comment, target], fields)
                for comment, target in tqdm(pairs)
            ]
        return examples, fields

    # Load the train / validation / test CSV files.
    train_data = pd.read_csv('data/train_one_label.csv')
    valid_data = pd.read_csv('data/valid_one_label.csv')
    test_data = pd.read_csv("data/test.csv")
    # Fields for the comment text and the label (this TEXT sets no fix_length).
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True)
    LABEL = data.Field(sequential=False, use_vocab=False)

    # Get the examples and fields needed to build each Dataset
    train_examples, train_fields = get_dataset(train_data, TEXT, LABEL)
    valid_examples, valid_fields = get_dataset(valid_data, TEXT, LABEL)
    # The test set has no labels, so None is passed as the label field.
    test_examples, test_fields = get_dataset(test_data, TEXT, None, test=True)
    # Build the Dataset objects
    train = data.Dataset(train_examples, train_fields)
    valid = data.Dataset(valid_examples, valid_fields)
    test = data.Dataset(test_examples, test_fields)

### 自定义Dataset类

In [6]:
# CSV locations, relative to the working directory, for the three data splits.
train_path = 'data/train_one_label.csv'
valid_path = "data/valid_one_label.csv"
test_path = "data/test.csv"

# Dataset definition
class MyDataset(data.Dataset):
    """torchtext dataset read from a CSV file with a ``comment_text`` column.

    Labelled splits additionally need a ``toxic`` column. Each row becomes one
    ``data.Example`` with the attributes ``comment_text`` and (when a label
    field is given) ``toxic``. Training text can optionally be augmented by
    randomly shuffling or dropping tokens.
    """
    name = 'Grand Dataset'

    @staticmethod
    def sort_key(ex):
        """Return the length of the example's processed comment text."""
        # The text field is registered as "comment_text" in __init__;
        # examples carry no `text` attribute.
        return len(ex.comment_text)

    def __init__(self, path, text_field, label_field, test=False, aug=False, **kwargs):
        """Read ``path`` and build the examples.

        Args:
            path: CSV file to read with pandas.
            text_field: field applied to the ``comment_text`` column.
            label_field: field applied to the ``toxic`` column (None for
                unlabelled data).
            test: when true, labels are not read and no augmentation happens.
            aug: when truthy (and ``test`` is false), each text is randomly
                either token-dropped or token-shuffled before being stored.
            **kwargs: accepted for signature compatibility; not forwarded to
                the parent constructor.
        """
        fields = [("id", None),  # the id column is not needed, so its field is None
                  ("comment_text", text_field), ("toxic", label_field)]

        examples = []
        csv_data = pd.read_csv(path)
        print('read data from {}'.format(path))

        if test:
            # Test set: do not load labels.
            for text in tqdm(csv_data['comment_text']):
                examples.append(data.Example.fromlist([None, text, None], fields))
        else:
            for text, label in tqdm(zip(csv_data['comment_text'], csv_data['toxic'])):
                if aug:
                    # Pick one of the two augmentations with equal probability.
                    rate = random.random()
                    if rate > 0.5:
                        text = self.dropout(text)
                    else:
                        text = self.shuffle(text)
                # An Example stores each column of one row as an attribute.
                examples.append(data.Example.fromlist([None, text, label], fields))
        # Everything above is preprocessing; the parent constructor turns the
        # examples and fields into a standard Dataset.
        super(MyDataset, self).__init__(examples, fields)

    def shuffle(self, text):
        """Return ``text`` with its whitespace-separated tokens in random order."""
        text = np.random.permutation(text.strip().split())
        return ' '.join(text)

    def dropout(self, text, p=0.5):
        """Randomly delete a fraction ``p`` of the whitespace-separated tokens.

        Args:
            text: input string.
            p: fraction of tokens to remove; ``int(len(tokens) * p)`` tokens
                are dropped (capped at the number of tokens).

        Returns:
            The surviving tokens joined by single spaces.
        """
        tokens = text.strip().split()
        n_drop = min(len(tokens), int(len(tokens) * p))
        if n_drop == 0:
            return ' '.join(tokens)
        # replace=False draws distinct positions, so exactly n_drop tokens go;
        # sampling with replacement could pick one position several times and
        # remove fewer tokens than requested.
        dropped = set(np.random.choice(len(tokens), n_drop, replace=False).tolist())
        # Omit dropped tokens entirely rather than leaving empty placeholders.
        return ' '.join(tok for i, tok in enumerate(tokens) if i not in dropped)


### 构建数据集

In [7]:
# Augment the training split only: randomly shuffling or dropping tokens in
# the validation split would make evaluation noisy and non-repeatable.
train = MyDataset(train_path, text_field=TEXT, label_field=LABEL, test=False, aug=True)
valid = MyDataset(valid_path, text_field=TEXT, label_field=LABEL, test=False, aug=False)
# The test set has no labels, so label_field must be None
# (augmentation is never applied when test=True).
test = MyDataset(test_path, text_field=TEXT, label_field=None, test=True, aug=False)

25it [00:00, 8599.82it/s]
25it [00:00, 9660.73it/s]
100%|██████████| 33/33 [00:00<00:00, 28926.23it/s]

read data from data/train_one_label.csv
read data from data/valid_one_label.csv
read data from data/test.csv





In [8]:
# Show which attributes the first train example and the first test example carry.
for first_example in (train[0], test[0]):
    print(vars(first_example).keys())

dict_keys(['comment_text', 'toxic'])
dict_keys(['comment_text'])


In [9]:
# Print the processed comment text of the first training example.
print(train[0].comment_text)

['i', 'at', 'explanation', 'dolls', 'just', 'talk', 'new', 'since', 'closure', 'reverted?', 'retired', 'fac.', 'after', 'york', 'now.89.205.38.27', 'they', 'gas', 'edits', 'hardcore', 'fan', 'template', 'from', 'username', 'were', "weren't", 'please', 'the', 'my', 'page', "don't", 'the', 'remove', 'on', 'vandalisms,', 'and', 'made', 'why', 'metallica', 'some', "i'm", 'under', 'voted', 'the']


### 构建词表，最简单的方式

In [10]:
# Build the vocabulary of the text field from the training dataset.
TEXT.build_vocab(train)

###  通过预训练的词向量来构建词表的方式示例，以glove.6B.300d词向量为例
    cache = 'mycache'
    # exist_ok=True creates the cache directory only when it is missing,
    # without a separate check-then-create step.
    os.makedirs(cache, exist_ok=True)
    # NOTE(review): machine-specific absolute path; point it at your local GloVe file.
    vectors = Vectors(name='/Users/wyw/Documents/vectors/glove/glove.6B.300d.txt', cache=cache)
    # How to initialise vectors for tokens that are missing from the pretrained file
    vectors.unk_init = init.xavier_uniform_ 
    TEXT.build_vocab(train, min_freq=5, vectors=vectors)
    # Inspect the vectors attached to the vocabulary
    TEXT.vocab.vectors
    

In [11]:
# Show the ten most frequent tokens counted in the vocabulary.
TEXT.vocab.freqs.most_common(10)

[('the', 65),
 ('to', 35),
 ('you', 29),
 ('of', 26),
 ('and', 24),
 ('is', 19),
 ('that', 19),
 ('a', 19),
 ('i', 18),
 ('this', 16)]

### 构建数据集迭代器

In [12]:
from torchtext.data import Iterator, BucketIterator
# To build an iterator for the training set only:
# train_iter = data.BucketIterator(dataset=train, batch_size=8, shuffle=True, sort_within_batch=False, repeat=False)

# torchtext (>= 0.3) expects a torch.device or a device string here; integer
# values such as -1 are deprecated and fall back to the CPU with a warning.
# To use a GPU, set e.g. torch.device('cuda:0') AND move the model to the
# same device, otherwise batches and model end up on different devices.
DEVICE = torch.device('cpu')

# Build the iterators for the training and validation sets together
train_iter, val_iter = BucketIterator.splits(
        (train, valid),  # the datasets to iterate over
        batch_sizes=(8, 8),
        device=DEVICE,
        sort_key=lambda x: len(x.comment_text),  # tells the BucketIterator how to group examples
        sort_within_batch=False,
        repeat=False  # a single pass per loop over the iterator
)

test_iter = Iterator(test, batch_size=8, device=DEVICE, sort=False, sort_within_batch=False, repeat=False)

In [13]:
# Go through the training iterator once, printing each batch summary
# followed by the shapes of its text and label tensors.
for batch in train_iter:
    print(batch)
    text = batch.comment_text
    label = batch.toxic
    print(text.shape, label.shape)


[torchtext.data.batch.Batch of size 8 from GRAND DATASET]
	[.comment_text]:[torch.LongTensor of size 200x8]
	[.toxic]:[torch.LongTensor of size 8]
torch.Size([200, 8]) torch.Size([8])

[torchtext.data.batch.Batch of size 8 from GRAND DATASET]
	[.comment_text]:[torch.LongTensor of size 200x8]
	[.toxic]:[torch.LongTensor of size 8]
torch.Size([200, 8]) torch.Size([8])

[torchtext.data.batch.Batch of size 1 from GRAND DATASET]
	[.comment_text]:[torch.LongTensor of size 200x1]
	[.toxic]:[torch.LongTensor of size 1]
torch.Size([200, 1]) torch.Size([1])

[torchtext.data.batch.Batch of size 8 from GRAND DATASET]
	[.comment_text]:[torch.LongTensor of size 200x8]
	[.toxic]:[torch.LongTensor of size 8]
torch.Size([200, 8]) torch.Size([8])


# 使用torchtext构建的数据集实现LSTM
- 因数据集太小，无法收敛，只作为demo熟悉torchtext和pytorch之间的用法

In [14]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [15]:
# Vectors attached to the vocabulary, kept for optionally initialising the embedding layer.
# NOTE(review): the executed build_vocab call above passes no `vectors=`, so this is presumably None — confirm.
weight_matrix = TEXT.vocab.vectors

In [16]:
class LSTM(nn.Module):
    """Single-layer LSTM classifier over sequences of token ids.

    The input is embedded, run through an LSTM, and the output at the last
    time step is projected to class scores.

    Args:
        vocab_size: number of rows in the embedding table. When None, falls
            back to ``len(TEXT.vocab)`` from the notebook's global ``TEXT``.
        embedding_dim: size of each embedding vector (default 300).
        hidden_size: LSTM hidden state size (default 128).
        num_classes: number of output scores per example (default 2).
        pretrained_weights: optional tensor of shape
            ``(vocab_size, embedding_dim)`` copied into the embedding table.
    """

    def __init__(self, vocab_size=None, embedding_dim=300, hidden_size=128,
                 num_classes=2, pretrained_weights=None):
        super(LSTM, self).__init__()
        if vocab_size is None:
            # Backward-compatible default: size the table from the global vocabulary.
            vocab_size = len(TEXT.vocab)
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        if pretrained_weights is not None:
            # Initialise the embedding table from pretrained word vectors.
            self.word_embeddings.weight.data.copy_(pretrained_weights)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=1)
        self.decoder = nn.Linear(hidden_size, num_classes)

    def forward(self, sentence):
        """Return class scores of shape ``(batch, num_classes)``.

        Args:
            sentence: integer tensor of token ids shaped ``(seq_len, batch)``.
        """
        embeds = self.word_embeddings(sentence)  # (seq_len, batch, embedding_dim)
        lstm_out = self.lstm(embeds)[0]  # (seq_len, batch, hidden_size)
        # Take the output of the last time step
        final = lstm_out[-1]  # (batch, hidden_size)
        y = self.decoder(final)  # (batch, num_classes)
        return y


In [17]:
# Instantiate the classifier and switch it to training mode.
model = LSTM()
model.train()
# Give Adam only the parameters that require gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.01)
# Cross-entropy between the model's class scores and the integer labels.
loss_funtion = F.cross_entropy

In [18]:
# A single pass over the training iterator: every iteration is one
# optimisation step on one mini-batch (not a full epoch).
for step, batch in enumerate(train_iter):
    optimizer.zero_grad()
    scores = model(batch.comment_text)

    loss = loss_funtion(scores, batch.toxic)
    loss.backward()
    optimizer.step()
    print(loss)

tensor(0.7700, grad_fn=<NllLossBackward>)
tensor(0.0174, grad_fn=<NllLossBackward>)
tensor(0.7216, grad_fn=<NllLossBackward>)
tensor(0.7747, grad_fn=<NllLossBackward>)
