## Requirements

- pytorch
- torchtext
- pandas
- scikit-learn # 예정
- tqdm
- gensim


In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader, Dataset, TensorDataset
import pandas as pd 

## Model 설명

- embedding layer
- |
- convolutional layer (kernel = 3 x embedding dim)
- |
- leakyrelu
- |
- dropout
- |
- maxpool w.r.t time axis
- |
- fcn1 for each label
- | | | | | | |
- fcn2 for each label ( -> binary output )
- | | | | | | |
- CrossEntropyLoss

In [2]:
class Net(nn.Module):
    """Multi-label sentence classifier with one binary head per label.

    Pipeline: embedding -> Conv2d with a (3 x embedding_dim) kernel ->
    LeakyReLU -> dropout -> max-pool over the time axis -> for every label,
    Linear(bottleneck, fc_dim) -> Linear(fc_dim, 2).

    Args:
        vocab_size: vocabulary size; the embedding table holds
            ``vocab_size + 2`` rows (presumably room for the unk/pad
            specials -- TODO confirm against the vocabulary builder).
        embedding_dim: width of each word vector.
        len_sentence: fixed number of tokens per input sentence; must be at
            least 4 so that the pooled sequence is not empty.
        channel_size: number of convolution output channels.
        fc_dim: hidden width of every per-label head.
        padding_idx: embedding index reserved for padding.
        dropout: dropout probability applied after the activation.
        num_labels: number of independent binary heads.
        batch_size: stored on the instance; not used by the computation.
        is_cuda: accepted for interface compatibility; ignored.

    Raises:
        ValueError: if ``len_sentence`` is too short for the conv + pool stack.
    """

    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 len_sentence,
                 channel_size=4,
                 fc_dim=128,
                 padding_idx=1,
                 dropout=0.3,
                 num_labels=7,
                 batch_size=32,
                 is_cuda=False
                 ):
        super(Net, self).__init__()

        # A height-3 kernel with stride 1 shortens the sequence by 2, and
        # MaxPool1d(kernel_size=2) floors the halved length, so the flattened
        # size must use integer division to match the real tensor.
        conv_length = len_sentence - 2
        pooled_length = conv_length // 2
        if pooled_length < 1:
            raise ValueError(
                "len_sentence must be at least 4, got %r" % (len_sentence,)
            )

        self.embedding = nn.Embedding(vocab_size + 2, embedding_dim=embedding_dim, padding_idx=padding_idx)
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.channel_size = channel_size
        self.len_sentence = len_sentence
        self.batch_size = batch_size

        self.conv2d = nn.Conv2d(1, out_channels=channel_size, kernel_size=(3, embedding_dim), stride=1)
        # output : batch x channel x (len_sentence - 2) x 1
        # -> squeeze : batch x channel x (len_sentence - 2)
        self.relu = nn.LeakyReLU()
        self.dropout1d = nn.Dropout(p=dropout)
        self.pool1d = nn.MaxPool1d(kernel_size=2)
        # output : batch x channel x floor((len_sentence - 2) / 2)

        self.bottleneck_size = channel_size * pooled_length

        # Plain lists keep the heads indexable; add_module registers them so
        # their parameters are tracked under the names "fcn1-<i>" / "fcn2-<i>".
        self.fcns1 = [nn.Linear(self.bottleneck_size, fc_dim) for _ in range(num_labels)]
        self.fcns2 = [nn.Linear(fc_dim, 2) for _ in range(num_labels)]

        for i, fcn in enumerate(self.fcns1):
            self.add_module("fcn1-" + str(i), fcn)

        for i, fcn2 in enumerate(self.fcns2):
            self.add_module("fcn2-" + str(i), fcn2)

        self.fc_dim = fc_dim
        self.num_labels = num_labels

    def forward(self, sentence, other_features=None):
        """Score a batch of sentences.

        Args:
            sentence: integer tensor of token indices, shape
                (batch, len_sentence).
            other_features: accepted but currently unused.

        Returns:
            A list with ``num_labels`` tensors, each of shape (batch, 2),
            holding the two-class scores of one label.
        """
        # Add a channel axis so the embedded sentence is treated as a 1-channel image.
        image = self.embedding(sentence).unsqueeze(1)

        bottleneck = self.conv2d(image).squeeze(3)
        bottleneck = self.relu(bottleneck)
        bottleneck = self.dropout1d(bottleneck)
        bottleneck = self.pool1d(bottleneck)

        # Keep the batch axis explicit so a wrong sentence length fails loudly
        # instead of silently mixing samples during the reshape.
        bottleneck = bottleneck.view(bottleneck.size(0), self.bottleneck_size)

        return [
            second(first(bottleneck))
            for first, second in zip(self.fcns1, self.fcns2)
        ]  # one score tensor per label


In [3]:
class config:
    """Hyperparameters shared by the notebook cells (read as class attributes)."""

    # Vocabulary construction
    min_freq = 3          # minimum token frequency passed to build_vocab
    vocab_size = 20000    # vocabulary cap; also sizes the embedding table

    # Input / model shape
    len_sentence = 30     # fixed token length of every sentence
    embedding_dim = 50    # width of each word vector
    num_labels = 7        # number of per-label output heads

In [4]:
def get_pd_data(path: str):
    """Read the CSV at *path* and return its contents as a pandas DataFrame."""
    return pd.read_csv(path)

In [5]:
# Load the training split from ./data/train.csv into a DataFrame.
train = get_pd_data('./data/train.csv')

In [6]:
# Load the test split from ./data/test.csv into a DataFrame.
test = get_pd_data('./data/test.csv')

In [7]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Preprocess (1)
----
### Set capital character ratio (not used yet)
- 문장 내의 대문자 비율을 나중에 뉴럴넷의 input으로 줄 예정

In [None]:
def set_capital_ratio(df: pd.DataFrame):
    """Add letter-count and capital-ratio columns to *df* in place.

    Reads the ``comment_text`` column and writes three new columns:

    * ``alphas``    -- number of alphabetic characters in the comment
    * ``capitals``  -- number of upper-case characters in the comment
    * ``cap_ratio`` -- ``capitals / (alphas + 1)``; the ``+ 1`` keeps the
      ratio defined for comments that contain no letters

    Args:
        df: frame with a string ``comment_text`` column; modified in place.

    Returns:
        None.
    """
    comments = df['comment_text']

    # astype is a no-op for non-empty input (the counts are already int64)
    # and pins the dtype when the frame has no rows.
    df['alphas'] = comments.apply(
        lambda comment: sum(1 for c in comment if c.isalpha())
    ).astype('int64')
    df['capitals'] = comments.apply(
        lambda comment: sum(1 for c in comment if c.isupper())
    ).astype('int64')

    # Column arithmetic instead of a row-wise DataFrame.apply: same float
    # values, no per-row Python call, and well defined on an empty frame.
    df['cap_ratio'] = df['capitals'] / (df['alphas'] + 1)


In [None]:
# Add the capital-letter statistics columns to both splits. Each call mutates
# its frame in place and returns None, so the calls are plain statements
# rather than a throwaway tuple expression.
set_capital_ratio(train)
set_capital_ratio(test)

In [None]:
# Preview the training frame again, now with the alphas / capitals / cap_ratio columns.
train.head()

## Preprocess(2)
-----
### Word tokenize
- gensim의 tokenize function

In [None]:
from gensim.utils import simple_tokenize

In [None]:
def tokenizer(string: str):
    """Tokenize *string* with gensim's ``simple_tokenize`` and return the tokens as a list.

    ``simple_tokenize`` is consumed eagerly so callers receive a concrete
    list rather than a lazy iterator.
    """
    return list(simple_tokenize(string))

In [None]:
# Lower-case every comment, then split it into a list of word tokens.
tk_train = train['comment_text'].str.lower().apply(tokenizer)
# The test tokens must come from the `test` frame; reading `train` here would
# just duplicate the training tokens under the test name.
tk_test = test['comment_text'].str.lower().apply(tokenizer)

In [None]:
# Inspect the first five tokenized training comments.
tk_train[:5]

In [None]:
# Names of the six toxicity label columns present in train.csv.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

## Preprocess(3)
----
### Add Normal column label
- toxic하지 않은 label로 분류되는 것에, normal=1 의 새로운 라벨 추가

In [None]:
# Start every row at normal=0 ...
train['normal'] = 0
# ... then mark rows whose toxicity labels sum to zero (no label set) as normal=1.
train.loc[train[labels].sum(axis=1) == 0, 'normal'] = 1

In [None]:
# Register the derived 'normal' column as the seventh label. The guard keeps
# the list at seven entries when this notebook cell is executed more than once.
if 'normal' not in labels:
    labels.append('normal')

In [None]:
# Label matrix: one column per entry in `labels`, row-aligned with `train`.
y_labels = train[labels]
# Preview the first rows of the label matrix.
y_labels.head()

## Validation
----
### 10000 개의 Validation set
    TODO
    Validation 나누기 전에 shuffle

In [None]:
# Number of trailing rows held out as the validation split.
valid_num = 10000

In [None]:
# Hold out the last `valid_num` rows for validation; the leading rows remain
# the training split. Both right-hand sides are evaluated before rebinding,
# so each pair is sliced from the same unsplit data.
tk_train, tk_valid = tk_train[:-valid_num], tk_train[-valid_num:]
y_labels, y_valid = y_labels[:-valid_num], y_labels[-valid_num:]

In [None]:
from torchtext import data, datasets

## Preprocess(4)
---
### torchtext.data.Field
- word dictionary, word to index 구현

In [None]:
# torchtext field that pads/truncates tokenized comments and maps them to indices.
TEXT = data.Field(sequential=True,  
                  # Is the incoming data sequential? We handle sequences of tokenized words, so True.
                  # (True is also the default.)
                  tokenize=tokenizer, 
                  # Function used to tokenize the data. We use the gensim-based tokenizer here,
                  # but a custom function or something like str.split could be passed instead.
                  # :: Correction: not every tokenize function works; it apparently has to return
                  #    a tokenized list rather than a generator.
                  # :::: ...though that may not be the real reason either.
                  fix_length=config.len_sentence,
                 # Probably a limit on the tokenized length (to be verified). Notably, longer
                  # sequences are cut and shorter ones are filled with padding.
                  # :: Correction: it is more likely the length limit after vectorization. To be confirmed.
                  pad_first=True,
                  # Whether padding is attached at the front or at the back of the sequence.
                  tensor_type=torch.cuda.LongTensor
                  # CUDA tensors can be used as well.
                 )

In [None]:
# Build the vocabulary from the training and validation tokens, passing the
# size cap and minimum frequency from `config`.
TEXT.build_vocab(tk_train, tk_valid, max_size=config.vocab_size, min_freq=config.min_freq)

In [None]:
def batchify(tk_train, y_labels, batch_size=32):
    """Yield consecutive ``(tokens, labels)`` slices of at most *batch_size* items.

    Both sequences are cut at the same positions, and the end of the final
    slice is clamped to ``len(tk_train)``.
    """
    for start in range(0, len(tk_train), batch_size):
        stop = min(start + batch_size, len(tk_train))
        token_slice = tk_train[start:stop]
        label_slice = y_labels[start:stop]
        yield token_slice, label_slice

In [None]:
# Instantiate the classifier from the shared config (8 conv channels) and move it to the GPU.
net = Net(vocab_size=config.vocab_size, embedding_dim=config.embedding_dim, len_sentence=config.len_sentence,
         channel_size=8, num_labels=config.num_labels, batch_size=32).cuda()

In [None]:
# Display the module summary.
net

In [None]:
# One Adam optimizer (default hyperparameters) over all model parameters.
optimizer = optim.Adam(net.parameters())
# One cross-entropy criterion per label head.
criterions = [nn.CrossEntropyLoss() for i in range(config.num_labels)]

In [None]:
from tqdm import tqdm

In [None]:
# Put the model in training mode (dropout active).
net.train(True)

In [None]:
def _scalar(value):
    """Return a one-element tensor/Variable as a Python float."""
    # Flattening first works for both 1-element and 0-dim results.
    return float(value.data.view(-1)[0])


def validation(net, tk_valid, y_valid : pd.DataFrame, TEXT : data.Field):
    """Evaluate *net* on the validation split.

    The model is switched to evaluation mode for the duration of the call
    and restored to its previous mode afterwards. Batches are placed on
    CUDA device 0, as in the rest of the notebook.

    Args:
        net: model whose forward pass returns one (batch, 2) score tensor
            per label.
        tk_valid: tokenized validation comments, sliceable by position.
        y_valid: frame with one 0/1 column per label, row-aligned with
            ``tk_valid``.
        TEXT: torchtext field used to pad and numericalize each batch.

    Returns:
        A tuple ``(valid_loss, acc, expectations)``:

        * ``valid_loss`` -- cross-entropy summed over the labels and
          averaged over the batches (0.0 when there are no batches);
        * ``acc`` -- list with the accuracy of each label;
        * ``expectations`` -- per label, the list of predicted class-1
          probabilities for every validation example, in input order.
    """
    num_labels = config.num_labels
    criterion = nn.CrossEntropyLoss()
    val_corrects = [0.0] * num_labels
    expectations = [[] for _ in range(num_labels)]
    total_loss = 0.0
    num_batches = 0
    num_examples = 0

    was_training = net.training
    net.train(False)  # disable dropout while scoring
    try:
        for batch_val, y_val in batchify(tk_valid, y_valid.values):
            var_batch = TEXT.process(batch_val, device=0, train=False)
            # Transposed to (label, batch) so var_y[i] holds the targets of label i.
            var_y = Variable(torch.cuda.LongTensor(y_val)).transpose(dim0=0, dim1=1)
            pred_score = net(var_batch.transpose(dim0=0, dim1=1))

            num_batches += 1
            num_examples += len(y_val)
            for i, score in enumerate(pred_score):
                _, pred = score.max(dim=1)
                val_corrects[i] += _scalar((pred == var_y[i]).float().sum())
                total_loss += _scalar(criterion(score, var_y[i]))
                positive = nn.functional.softmax(score, dim=1)[:, 1]
                expectations[i].extend(positive.data.cpu().tolist())
    finally:
        net.train(was_training)

    if num_examples == 0:
        return 0.0, [0.0] * num_labels, expectations

    valid_loss = total_loss / num_batches
    acc = [correct / num_examples for correct in val_corrects]
    return valid_loss, acc, expectations

In [None]:
# Single pass over the training batches. Every 1000th step the model is
# evaluated on the validation split and the per-label accuracy plus the
# current batch loss are printed.
train_corrects = [0 for i in range(config.num_labels)]
# NOTE(review): train_loss is initialised here but never updated in this cell.
train_loss = 0

for step, (batch, y_label) in tqdm(enumerate(batchify(tk_train, y_labels.values))):
    
    # Pad / numericalize the token lists on CUDA device 0.
    var_batch = TEXT.process(batch, device=0, train=True)
    # Transposed to (label, batch) so var_y[i] holds the targets of label i.
    var_y = Variable(torch.cuda.LongTensor(y_label)).transpose(dim0=0, dim1=1)

    # presumably TEXT.process returns (seq_len, batch); transposed to batch-first for Net -- TODO confirm
    pred_score = net(var_batch.transpose(dim0=0, dim1=1))
    
    # Clear gradients left from the previous step before this step's backward pass.
    net.zero_grad()
    y_total_loss = 0
    for i, score in enumerate(pred_score):
        # Predicted class = index of the larger of the two scores.
        _, pred = score.max(dim=1)
        train_corrects[i] += (pred == var_y[i]).float().sum()
        y_loss = criterions[i](score, var_y[i])
#         print(y_loss.data[0])
        # The optimized loss is the sum of the per-label cross-entropies.
        y_total_loss += y_loss
    
    y_total_loss.backward()
    if step % 1000 == 999:
        # Periodic validation in evaluation mode. This reuses the names
        # var_batch / var_y / pred_score, overwriting the training values;
        # backward() has already run, so the pending gradients are unaffected.
        net.train(False)
        val_corrects = [0 for i in range(config.num_labels)]
        for val_step, (batch_val, y_val) in enumerate(batchify(tk_valid, y_valid.values)):
            var_batch = TEXT.process(batch_val, device=0, train=False)
            var_y = Variable(torch.cuda.LongTensor(y_val)).transpose(dim0=0, dim1=1)
            pred_score = net(var_batch.transpose(dim0=0, dim1=1))
            for i, score in enumerate(pred_score):
                _, pred = score.max(dim=1)
                val_corrects[i] += (pred == var_y[i]).float().sum()
        
        # Accuracy per label = correct predictions / valid_num.
        for i, val_correct in enumerate(val_corrects):
            print(step, labels[i], val_correct.data[0] / valid_num )
#         print(step, val_corrects)
        # Loss of the most recent training batch (not a validation loss).
        print(step, "loss, ", y_total_loss.data[0])
        net.train(True)
    # Apply the gradients computed by backward() above.
    optimizer.step()

In [None]:
from sklearn.metrics import roc_auc_score


## TODO
---
### roc_auc_score w.r.t. validation set's score
- Kaggle form에 맞추어 column-wise roc auc score 계산