# ISSUES #
## validation set에 대해 계산한 Columnwise mean ROC AUC가 실제 테스트셋에 대해 제출했을 때 값과 차이가 많이 남

## Requirements

- pytorch
- torchtext
- pandas
- scikit-learn
- numpy
- tqdm
- gensim


In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader, Dataset, TensorDataset
import pandas as pd 
import random
import numpy as np

## Model 설명

- embedding layer
- |
- convolutional layer (kernel = 3 x embedding dim)
- |
- leakyrelu
- |
- dropout
- |
- maxpool w.r.t time axis
- |
- fcn1 for each labels
- |
- fcn2 for each labels ( -> label output )
- | 
- sigmoid - BinaryCrossEntropyLoss

In [2]:
class Net(nn.Module):
    def __init__(self, 
                 vocab_size,
                 embedding_dim,
                 len_sentence,
                 channel_size=4,
                 x2_size=1, # additional data - cap ratio
                 fc_dim=128,
                 padding_idx=1,
                 dropout=0.3,
                 num_labels=7,
                 batch_size=32,
                 is_cuda=False,
                 n_gram=5,
                 additional_kernel_size=1,
                ):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(vocab_size+2, embedding_dim=embedding_dim, padding_idx=padding_idx)
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.channel_size = channel_size
        self.len_sentence = len_sentence
        self.batch_size = batch_size
        self.x2_size = x2_size
        self.kernel_size = (n_gram, embedding_dim + additional_kernel_size)
        self.n_gram = n_gram
        self.additional_kernel_size = additional_kernel_size
        self.conv2d = nn.Conv2d(1, out_channels=channel_size, kernel_size=self.kernel_size, stride=1)
        # output : batch x channel x (len_sentence - 2) x 1
        
        # -> squeeze : batch x channel x (len_sentence - 2)
        self.relu = nn.ReLU(inplace=True)
        self.dropout1d = nn.Dropout(p=dropout)
        self.pool1d = nn.AvgPool1d(kernel_size=2)
        # output : batch x channel x (len_sentence - 2) / 2
        
        self.bottleneck_size = channel_size * (len_sentence - (self.n_gram-1)) / 2
        assert self.bottleneck_size.is_integer()
        self.bottleneck_size = int(self.bottleneck_size) + self.x2_size
        
        self.fcn1 = nn.Linear(self.bottleneck_size, fc_dim)
        self.relu1 = nn.ReLU(inplace=True)
        self.fcn2 = nn.Linear(fc_dim, num_labels)
        self.sigmoid = nn.Sigmoid()
        self.fc_dim = fc_dim
        self.num_labels = num_labels
    
    def forward(self, sentence, cap_ratio, other_features):
#         print("sentence ", sentence.shape)
        image = self.embedding(sentence)
#         print(bottleneck.shape)
#         image.unsqueeze_(1)
#         print("image ", image.shape)
        # batch x channel x sentence_length x embedding
        cap_ratio.unsqueeze_(2)
#         print("cap_Ratio :", cap_ratio.shape)
        new_image = torch.cat([image, cap_ratio], dim=2)
        new_image.unsqueeze_(1)
        bottleneck = self.conv2d(new_image)
        bottleneck.squeeze_(3)
        bottleneck = self.relu(bottleneck) # batch x channel x features
        bottleneck = self.dropout1d(bottleneck)
        bottleneck = self.pool1d(bottleneck)
#         print("bt shape ", bottleneck.shape)
        
        bottleneck = bottleneck.view(-1, self.bottleneck_size - self.x2_size)
        if self.x2_size > 0:
            bottleneck = torch.cat([bottleneck, other_features], dim=1)
        
        fcn = self.relu1(self.fcn1(bottleneck))
        fcn = self.fcn2(fcn)
        logit = self.sigmoid(fcn)
        
        return logit

In [3]:
class config:
    vocab_size = 20000
    embedding_dim = 100 # TODO: max 300
    len_sentence = 100
    num_labels = 6
    min_freq = 1
    batch_size = 64
    channel_size = 128
    seed = 0
    dropout = 0.5 # TODO: batch norm으로 대체 추천
    x2_size = 1
    valid_num = 10000
    n_gram = 5

In [4]:

# seed 고정
torch.cuda.manual_seed_all(config.seed)
torch.manual_seed(config.seed)
random.seed(config.seed)
np.random.seed(config.seed)
torch.backends.cudnn.deterministic=True


def get_pd_data(path : str):
    df = pd.read_csv(path)
    return df

train = get_pd_data('./data/train.csv')

test = get_pd_data('./data/test.csv')

train.head()

## Preprocess (1)
----
###  Set captial character ratio
- 문장 내의 대문자 비율을 뉴럴넷의 input으로 줌

def set_capital_ratio(df : pd.DataFrame):
    df['alphas'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isalpha()))
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['cap_ratio'] = df.apply(lambda row: float(row['capitals']) / (float(row['alphas']) + 1), axis=1)


set_capital_ratio(train), set_capital_ratio(test)

train.head()

## Preprocess(2)
-----
### Word tokenize
- gensim의 tokenize function

In [5]:
from gensim.utils import simple_tokenize

In [6]:
def tokenizer(string : str):
    return [s for s in simple_tokenize(string)]

In [7]:
def get_cap_ratio_of_words(tokens : list):
    return [(sum(1 for c in token if c.isupper()))/(sum(1 for c in token if c.isalpha()) + 1) for token in tokens]

tk_train = train['comment_text'].apply(tokenizer)
tk_test = train['comment_text'].apply(tokenizer)
tk_cap_ratio_train = tk_train.apply(get_cap_ratio_of_words)
tk_cap_ratio_test = tk_test.apply(get_cap_ratio_of_words)

tk_train = train['comment_text'].str.lower().apply(tokenizer)
tk_test = train['comment_text'].str.lower().apply(tokenizer)

tk_cap_ratio_train.head()

tk_train[:5]

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
x_features = ['cap_ratio']

## Preprocess(3)
----
### Add Normal column label
- toxic하지 않은 label로 분류되는 것에, normal=1 의 새로운 라벨 추가 (하지않음)

In [8]:
# train['normal'] = 0
# train.loc[train[labels].sum(axis=1) == 0, 'normal'] = 1

In [9]:
# labels.append('normal')

y_labels = train[labels]
y_labels.head(n=10)

# Add manually engineered features ex) capital ratio of sentence
x2 = train[x_features]
x2.head(n=10)

checkpoint = dict({
    'tk_train' : tk_train,
    'tk_test' : tk_test,
    'tk_cap_ratio_train' : tk_cap_ratio_train,
    'tk_cap_ratio_test' : tk_cap_ratio_test,
    'y_labels' : y_labels,
    'x2' : x2,
})
torch.save(checkpoint, './saved_state.tch')

In [10]:
saved = torch.load('./saved_state.tch')
tk_train = saved['tk_train']
tk_test = saved['tk_test']
tk_cap_ratio_train = saved['tk_cap_ratio_train']
tk_cap_ratio_test = saved['tk_cap_ratio_test']
y_labels = saved['y_labels']
x2 = saved['x2']

## Validation
----
### 10000 개의 Validation set

In [11]:
# x, y, z : pd.Dataframe of pd.series
def shuffle_lists(*pargs):
    shuffler = np.random.permutation(len(pargs[0]))
    retargs = []
    for x in pargs:
        x = x.iloc[shuffler]
        retargs.append(x)
    return retargs

In [12]:
def padding_cap_ratio(tokens : list):
    L = len(tokens)
    ret = tokens[:min(L, config.len_sentence)]
    ret = [1] * (config.len_sentence - L) + ret
    return np.array(ret)

In [13]:
tk_train, x2, tk_cap_ratio_train, y_labels = shuffle_lists(tk_train, x2, tk_cap_ratio_train, y_labels)

In [14]:
tk_cap_ratio_train = np.array([padding_cap_ratio(row) for row in tk_cap_ratio_train])

In [15]:
tk_valid = tk_train[-config.valid_num:]
tk_cap_ratio_valid = tk_cap_ratio_train[-config.valid_num:]
x2_valid = x2[-config.valid_num:]
y_valid = y_labels[-config.valid_num:]

tk_train = tk_train[:-config.valid_num]
y_train = y_labels[:-config.valid_num]
x2_train = x2[:-config.valid_num]
tk_cap_ratio_train = tk_cap_ratio_train[:-config.valid_num]

In [16]:
len(tk_valid), len(tk_cap_ratio_valid), len(x2_valid), len(y_valid)

(10000, 10000, 10000, 10000)

In [17]:
len(tk_train), len(tk_cap_ratio_train), len(x2_train), len(y_train)

(149571, 149571, 149571, 149571)

In [18]:
from torchtext import data, datasets

## Preprocess(3)
---
### torchtext.data.Field
- word dictionary, word to index 구현

In [19]:
TEXT = data.Field(sequential=True,  
                  # 들어갈 데이터가 sequential 인가요? 우리는 tokenize한 word의 sequence를 다룰거니까 True입니다. Defualt로도 True임.
                  tokenize=tokenizer, 
                  # 그 데이터를 tokenize할 함수를 지정할 수 있습니다. 우리는 gensim library의 tokenize 함수를 쓸건데요
                  # 뭐 굳이 그거 말고도 직접 정의해도 되고 str.split 같은걸 써넣어도 됩니다.
                  # :: 그런 줄 알았는데 아무 tokenize 함수나 쓰면 안되고, generator가 아닌 tokenized list 를 반환하는 함수여야합니다..
                  # :::: 이게 아닐거같기도 함.
                  fix_length=config.len_sentence,
                 # 아마 tokenize된 길이 제한 같은데 한번 확인해볼게요. 특이사항으로는 length 넘으면 자르고, 안넘으면 padding을 채웁니다
                  # :: 그게 아니고 vector화 했을 때의 길이 제한일 것 같아요. 확인해보겠습니다.
                  pad_first=True,
                  # padding이 앞에서부터 붙냐, 뒤에서부터 붙냐는 겁니다.
                  tensor_type=torch.cuda.LongTensor
                  # cuda를 써도 됩니다
                 )

In [20]:
TEXT.build_vocab(tk_train, tk_valid, max_size=config.vocab_size, min_freq=config.min_freq)

In [21]:
def batchify(*pargs, batch_size=32):
    for i in range(0, len(pargs[0]), batch_size):
        end = min(i+batch_size, len(tk_train))
        yield [batch[i:end] for batch in pargs]

In [22]:
net = Net(vocab_size=config.vocab_size, embedding_dim=config.embedding_dim, len_sentence=config.len_sentence,
         x2_size=config.x2_size, channel_size=config.channel_size, dropout=config.dropout, num_labels=config.num_labels, batch_size=config.batch_size).cuda()

In [23]:
net

Net(
  (embedding): Embedding(20002, 100, padding_idx=1)
  (conv2d): Conv2d (1, 128, kernel_size=(5, 101), stride=(1, 1))
  (relu): ReLU(inplace)
  (dropout1d): Dropout(p=0.5)
  (pool1d): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,), ceil_mode=False, count_include_pad=True)
  (fcn1): Linear(in_features=6145, out_features=128)
  (relu1): ReLU(inplace)
  (fcn2): Linear(in_features=128, out_features=6)
  (sigmoid): Sigmoid()
)

In [24]:
optimizer = optim.Adam(net.parameters())
criterion = nn.BCELoss()
# criterions = [nn.CrossEntropyLoss() for i in range(config.num_labels)]
# -> Binary

In [25]:
from tqdm import tqdm

In [26]:
def validation(net, tk_valid, tk_cap_ratio_valid, x2_valid, y_valid : pd.DataFrame, TEXT : data.Field, criterion):
    net.train(False)
    val_score = None
    valid_loss = 0 
    for val_step, (batch_val, cr_val, x2_val, y_val) in enumerate(batchify(tk_valid, tk_cap_ratio_valid, x2_valid.values, y_valid.values, batch_size=config.batch_size)):
        var_x = TEXT.process(batch_val, device=0, train=False).transpose(dim0=0, dim1=1)
        var_cr = Variable(torch.cuda.FloatTensor(cr_val))
        var_y = Variable(torch.cuda.FloatTensor(y_val))
        var_x2 = Variable(torch.cuda.FloatTensor(x2_val))
        pred_score = net(var_x, var_cr, var_x2)
        if val_score is None:
            val_score = pred_score.data
        else:
            val_score = torch.cat([val_score, pred_score.data])
        y_loss = criterion(pred_score, var_y)
        valid_loss += y_loss.data[0]
    net.train(True)
    valid_loss /= len(tk_valid) / config.batch_size
            
    return valid_loss, val_score

In [39]:
train_corrects = [0 for i in range(config.num_labels)]
train_loss = 0
net.train(True)
for step, (x_train_, cr_train_, x2_train_, y_train_) in tqdm(enumerate(batchify(tk_train, tk_cap_ratio_train, x2_train.values, y_train.values, batch_size=config.batch_size))):
    var_x = TEXT.process(x_train_, device=0, train=True).transpose(dim0=0, dim1=1)
    var_cr = Variable(torch.cuda.FloatTensor(cr_train_))
    var_y = Variable(torch.cuda.FloatTensor(y_train_))
    var_x2 = Variable(torch.cuda.FloatTensor(x2_train_))
    pred_score = net(var_x, var_cr, var_x2)
    
    net.zero_grad()
    
    
    y_loss = criterion(pred_score, var_y)
#     if step % 100 == 99:
#         print(y_loss.data[0])
    y_loss.backward()
    if step % 1000 == 999:
        valid_loss, val_score = validation(net, tk_valid, tk_cap_ratio_valid, x2_valid, y_valid, TEXT, criterion)
        print("valid loss", valid_loss)
#         print("valid acc", [i.data[0] for i in valid_acc])

    optimizer.step()
    

1028it [00:06, 155.24it/s]

valid loss 0.054999392579495904


2030it [00:13, 155.28it/s]

valid loss 0.05417662822753191


2338it [00:14, 157.06it/s]


In [40]:
valid_loss, val_score = validation(net, tk_valid, tk_cap_ratio_valid, x2_valid, y_valid, TEXT, criterion)

In [41]:
from sklearn.metrics import roc_auc_score


## TODO
---
### roc_auc_score w.r.t. validation set's score
- Kaggle form에 맞추어 column-wise roc auc score 계산

In [42]:
roc_auc_scores = 0
for i in range(config.num_labels): # minus 1 for normal 
    score = roc_auc_score( y_valid.values[:, i], val_score.cpu().numpy()[:, i])
    print(score)
    roc_auc_scores += score

0.966439002321
0.987104608907
0.983839732223
0.96161002333
0.978819611599
0.95693447737


In [43]:
print(roc_auc_scores / (config.num_labels))

0.972457909292


In [48]:
roc_auc_score(y_valid.values, val_score.cpu().numpy())

0.97245790929169551

- 0.975129977385 - CHANNEL 128 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.5 + AvgPool1d between CNN + sentence length 100 + embedding 100 + min_freq 1 + kernel size 5 + epoch 3
- 0.971308288115 - same above + word capital embedding + epoch 2
- 0.97435546273329887 - same above + epoch 3
- 0.97201811975211161 - same above + epoch 4