In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
# Load the raw comments table into a DataFrame. `read_corpus` is later called
# with this same path and reads the file again into its own local frame.
toxic_comments = pd.read_csv("../input/toxic_comments.csv")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [None]:
from collections import Counter
import random
import numpy as np

In [None]:
# Choose the compute device once: the GPU when CUDA is usable, otherwise the CPU.
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device('cuda') if USE_CUDA else torch.device('cpu')

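- The model has one linear head per toxicity type, so each sample's label is a list (one entry per type).
- X is a 2-D tensor `[batch_size, sent_len]`; the label tensor is also 2-D, `[batch_size, 6]` (one column per toxicity type).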

In [None]:
def read_corpus(path):
    """Load the toxic-comment CSV and return tokenised comments with their labels.

    Args:
        path: Location of a CSV file that has a ``comment_text`` column and the
            six label columns ``toxic``, ``severe_toxic``, ``obscene``,
            ``threat``, ``insult`` and ``identity_hate``.

    Returns:
        A ``(sentences, labels)`` pair of equal-length lists. ``sentences[i]``
        is the i-th comment stripped, lower-cased and split on whitespace;
        ``labels[i]`` is the list of that row's six label values, in the
        column order listed above.
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    toxic_comments = pd.read_csv(path)

    # An empty comment cell is read back as NaN (a float), which has no
    # .strip(); treat it as an empty comment instead of crashing.
    comments = toxic_comments['comment_text'].fillna('').astype(str)
    sentences = [comment.strip().lower().split() for comment in comments]

    # Pull the label columns out in one go rather than building a Series per
    # row with iterrows().
    labels = toxic_comments[label_columns].values.tolist()
    return sentences, labels

In [None]:
# Parse the CSV into parallel lists: tokenised comments and their six-value label lists.
toxic_comments_path = "../input/toxic_comments.csv"
sentences, labels = read_corpus(toxic_comments_path)

In [None]:
# Cut both lists at 2/8 of their length: the leading quarter becomes the
# training split and the remaining three quarters become the test split.
# NOTE(review): a 25/75 train/test ratio is unusual — confirm it is intended.
_sentence_cut = round(2/8 * len(sentences))
_label_cut = round(2/8 * len(labels))
train_sentences, test_sentences = sentences[:_sentence_cut], sentences[_sentence_cut:]
train_labels, test_labels = labels[:_label_cut], labels[_label_cut:]

In [None]:
# Sanity check: show the token list of the first training comment.
print(train_sentences[0])

In [None]:
def build_vocab(sentences, word_size=20000):
    """Build word <-> index lookup tables from tokenised sentences.

    Prints the number of distinct words seen before truncation.

    Args:
        sentences: Iterable of token lists.
        word_size: Maximum number of corpus words to keep, most frequent first.

    Returns:
        ``(word2idx, idx2word)``. ``idx2word`` is ``['<pad>', '<unk>']``
        followed by up to ``word_size`` corpus words in descending frequency,
        and ``word2idx`` is its inverse, so ``'<pad>'`` is always index 0 and
        ``'<unk>'`` always index 1.
    """
    special_tokens = ['<pad>', '<unk>']
    counts = Counter(word for sentence in sentences for word in sentence)
    print('the number of all words is: ', len(counts))

    # A comment may literally contain '<pad>' or '<unk>'. Keeping such a word
    # would list it twice in idx2word and make word2idx point at the later
    # slot, so '<pad>' would no longer be 0 (the value used for padding).
    for token in special_tokens:
        counts.pop(token, None)

    idx2word = special_tokens + [word for word, _ in counts.most_common(word_size)]
    word2idx = {word: idx for idx, word in enumerate(idx2word)}

    return word2idx, idx2word

In [None]:
# Build the vocabulary from the training split only; WORD_SIZE caps how many
# corpus words are kept in addition to the '<pad>' and '<unk>' entries.
WORD_SIZE = 20000
word2idx, idx2word = build_vocab(train_sentences, word_size=WORD_SIZE)

In [None]:
def numeralization(sentences, labels, word2idx):
    """Encode tokens as vocabulary indices and pair each sentence with its label.

    Args:
        sentences: List of token lists.
        labels: List of labels, aligned with ``sentences``.
        word2idx: Mapping from word to index; words it does not contain are
            encoded with the index stored under ``'<unk>'``.

    Returns:
        A list of ``(index_list, label)`` tuples, truncated to the shorter of
        the two inputs.
    """
    encoded = []
    for tokens in sentences:
        encoded.append([word2idx.get(token, word2idx['<unk>']) for token in tokens])
    return list(zip(encoded, labels))

In [None]:
# Encode both splits with the training vocabulary; words missing from it map to '<unk>'.
numeral_train_data = numeralization(train_sentences, train_labels, word2idx)
numeral_test_data = numeralization(test_sentences, test_labels, word2idx)

In [None]:
# Sanity check: show the first (index list, label list) training pair.
print(numeral_train_data[0])

# Turning a batch into a padded tensor (transform2tensor)

In [None]:
def transform2tensor(batch_sentences):
    """Right-pad a batch of index lists with zeros into one long tensor.

    Args:
        batch_sentences: Non-empty list of index lists, possibly of
            different lengths.

    Returns:
        A ``torch.long`` tensor of shape ``[len(batch_sentences), longest]``
        in which row ``i`` holds sentence ``i`` followed by zeros.
    """
    widest = max(len(sent) for sent in batch_sentences)
    padded = torch.zeros((len(batch_sentences), widest), dtype=torch.long)
    for row, sent in enumerate(batch_sentences):
        # Only the leading len(sent) cells are written; the rest stay 0.
        padded[row, :len(sent)] = torch.tensor(sent)

    return padded

In [None]:
def generate_batch(numeral_sentences_labels, batch_size=32):
    """Shuffle the samples and group them into padded tensor batches.

    Args:
        numeral_sentences_labels: Sequence of ``(index_list, label_list)``
            pairs. It is left untouched; the shuffle works on a copy.
        batch_size: Maximum number of samples per batch (must be positive).

    Returns:
        A list of ``(sentence_tensor, label_tensor)`` tuples. The sentence
        tensor is produced by ``transform2tensor`` and the label tensor by
        ``torch.tensor`` on the batch's label lists. The final batch holds the
        leftover samples and may be smaller than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    # Draw a shuffled copy instead of shuffling in place, so the caller's list
    # keeps its order and earlier cells give the same output when re-run.
    shuffled = random.sample(numeral_sentences_labels, len(numeral_sentences_labels))

    batchs = []
    for start in range(0, len(shuffled), batch_size):
        # Slicing clamps at the end of the list, so the last (possibly
        # shorter) batch needs no special case.
        chunk = shuffled[start:start + batch_size]
        batch_sent_tensor = transform2tensor([pair[0] for pair in chunk])
        batch_lab_tensor = torch.tensor([pair[1] for pair in chunk])
        batchs.append((batch_sent_tensor, batch_lab_tensor))
    return batchs

In [None]:
# Batch the encoded training pairs using generate_batch's default batch_size of 32.
train_data = generate_batch(numeral_train_data)

In [None]:
# Peek at the first batch only; echoing every batch tensor would flood the cell output.
train_data[:1]

# build model

In [None]:
class ToxicModel(nn.Module):
    """Averaged-embedding classifier with one linear head per toxicity type.

    All heads share a single embedding layer. A sentence is represented by
    the average of its token embeddings, and the head selected by
    ``toxic_type`` maps that vector to ``output_size`` values.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each embedding vector.
        output_size: Number of outputs produced by every head.
        pad_idx: Vocabulary index passed to ``nn.Embedding`` as ``padding_idx``.
    """

    NUM_HEADS = 6

    def __init__(self, vocab_size, embed_dim, output_size, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        # Heads are registered as linear0 ... linear5, in that order, so the
        # parameter names (and any saved state_dict) stay the same.
        for head_idx in range(self.NUM_HEADS):
            setattr(self, 'linear{}'.format(head_idx), nn.Linear(embed_dim, output_size))

    def forward(self, text, toxic_type):
        """Score ``text`` with the head selected by ``toxic_type``.

        Args:
            text: Tensor of token indices, ``[batch_size, sent_len]``.
            toxic_type: Head selector, one of ``0 .. NUM_HEADS - 1``.

        Returns:
            Tensor of shape ``[batch_size, output_size]``.

        Raises:
            ValueError: If ``toxic_type`` does not match any head.
        """
        head = None
        for head_idx in range(self.NUM_HEADS):
            if toxic_type == head_idx:
                head = getattr(self, 'linear{}'.format(head_idx))
                break
        if head is None:
            # Without this check an unknown selector fell through every
            # branch and failed on an unbound local variable.
            raise ValueError(
                'toxic_type must be in 0..{}, got {!r}'.format(self.NUM_HEADS - 1, toxic_type))

        # [batch_size, sent_len] -> [batch_size, sent_len, embed_dim]
        embed = self.embedding(text)
        # [batch_size, sent_len, embed_dim] -> [batch_size, embed_dim]
        # The pooling window spans all sent_len positions, so padded positions
        # are counted in the average as well.
        embed_pooled = F.avg_pool2d(embed, (embed.size(1), 1)).squeeze(1)
        return head(embed_pooled)

In [None]:
# Hyper-parameters passed to ToxicModel.
# VOCAB_SIZE counts every word2idx entry, including '<pad>' and '<unk>'.
VOCAB_SIZE = len(word2idx)
EMBEDDING_DIM = 100
# Each head produces a single value per sample.
OUTPUT_SIZE = 1
PAD_IDX = word2idx['<pad>']

In [None]:
# Instantiate the classifier with the hyper-parameters defined above.
model = ToxicModel(VOCAB_SIZE, EMBEDDING_DIM, OUTPUT_SIZE, PAD_IDX)

In [None]:
# Display the module tree (the embedding layer and the linear heads).
model

# define criterion

In [None]:
# Place the model and the loss on the selected device, then create an Adam
# optimizer (default settings) over the model's parameters.
model = model.to(DEVICE)
criterion = nn.BCEWithLogitsLoss().to(DEVICE)
optimizer = optim.Adam(model.parameters())

In [None]:
def get_accuracy(out, y):
    """Exact-match accuracy: the share of samples whose whole label row is right.

    Args:
        out: Array-like of already-thresholded predictions, one row per
            sample. It may hold more rows than ``y``; only the first
            ``len(y)`` rows are scored.
        y: Tensor of true labels, one row per sample, with the same number of
            columns as ``out``.

    Returns:
        A NumPy float: the fraction of rows in which every column of ``out``
        equals the matching column of ``y``.
    """
    # np.float was removed in NumPy 1.24; np.float64 is the type it aliased.
    targets = y.cpu().numpy().astype(np.float64)
    predictions = np.asarray(out, dtype=np.float64)[:len(targets)]

    # A sample counts as correct only when all of its labels match.
    correct = (predictions == targets).all(axis=1)
    acc = correct.sum() / len(correct)
    return acc

In [None]:
BATCH_SIZE = 128
def train(train_data, model, criterion, optimizer):
    """Run one optimisation pass over ``train_data``.

    For every batch the model is evaluated once per label column, the
    per-column losses are summed, and a single backward/optimizer step is
    taken on that sum.

    Args:
        train_data: Iterable of ``(text, label)`` tensor pairs, where ``label``
            has one column per toxicity type.
        model: Module called as ``model(text, toxic_type)``; expected to return
            one logit per sample with shape ``[batch_size, 1]``.
        criterion: Loss called as ``criterion(logits, float_targets)``.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches, where the
        accuracy of a batch is whatever ``get_accuracy`` reports for it.

    Raises:
        ValueError: If ``train_data`` yields no batches.
    """
    model.train()

    # Send each batch to wherever the model's weights live, rather than
    # relying on a global CUDA flag that may disagree with the model.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    num_batches = 0
    epoch_loss = 0.0
    epoch_acc = 0.0

    for i, (text, label) in enumerate(train_data):
        optimizer.zero_grad()
        text = text.to(device)
        label = label.to(device)

        num_types = label.shape[1]
        # Sized from the batch itself, so batches of any size work (including
        # ones larger than BATCH_SIZE and the shorter final batch).
        outes = np.zeros([label.shape[0], num_types], dtype=np.float64)
        losses = []
        for toxic_type in range(num_types):
            single_label = label[:, toxic_type].unsqueeze(1)
            out = model(text, toxic_type)
            losses.append(criterion(out, single_label.float()))

            # Threshold the whole column at once: sigmoid -> round to 0/1.
            # detach() keeps the prediction bookkeeping out of the graph.
            outes[:, toxic_type] = torch.round(torch.sigmoid(out.detach()))[:, 0].cpu().numpy()

        all_loss_value = sum(losses)
        acc = get_accuracy(outes, label)
        all_loss_value.backward()
        optimizer.step()

        num_batches += 1
        epoch_loss += all_loss_value.item()
        epoch_acc += float(acc)

        print('the {} iterator finished!'.format(i))

    if num_batches == 0:
        raise ValueError('train_data contains no batches')
    return epoch_loss / num_batches, epoch_acc / num_batches
            

In [None]:
# Number of full passes over the training data.
NUM_EPOCHS = 5

for epoch in range(NUM_EPOCHS):
    # Re-batch every epoch: generate_batch shuffles, so batch composition changes each pass.
    train_data = generate_batch(numeral_train_data, BATCH_SIZE)
    epoch_loss, epoch_acc = train(train_data, model, criterion, optimizer)
    print('the epoch_loss is: {}, the epoch_acc is: {}'.format(epoch_loss, epoch_acc))