# ISSUES #
## validation set에 대해 계산한 Columnwise mean ROC AUC가 실제 테스트셋에 대해 제출했을 때 값과 차이가 많이 남

## Requirements

- pytorch
- torchtext
- pandas
- scikit-learn
- numpy
- tqdm
- gensim


In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader, Dataset, TensorDataset
import pandas as pd 
import random
import numpy as np

## Model 설명

- embedding layer
- |
- convolutional layer (kernel = 3 x embedding dim)
- |
- leakyrelu
- |
- dropout
- |
- maxpool w.r.t time axis
- |
- fcn1 for each labels
- | | | | | | |
- fcn2 for each labels ( -> binary output )
- | | | | | | |
- CrossEntropyLoss

In [2]:
class Net(nn.Module):
    def __init__(self, 
                 vocab_size,
                 embedding_dim,
                 len_sentence,
                 channel_size=4,
                 x2_size=1, # additional data - cap ratio
                 fc_dim=128,
                 padding_idx=1,
                 dropout=0.3,
                 num_labels=7,
                 batch_size=32,
                 is_cuda=False
                ):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(vocab_size+2, embedding_dim=embedding_dim, padding_idx=padding_idx)
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.channel_size = channel_size
        self.len_sentence = len_sentence
        self.batch_size = batch_size
        self.x2_size = x2_size
        
        self.conv2d = nn.Conv2d(1, out_channels=channel_size, kernel_size=(5, embedding_dim), stride=1)
        # output : batch x channel x (len_sentence - 2) x 1
        
        # -> squeeze : batch x channel x (len_sentence - 2)
        self.relu = nn.ReLU(inplace=True)
        self.dropout1d = nn.Dropout(p=dropout)
        self.pool1d = nn.AvgPool1d(kernel_size=2)
        # output : batch x channel x (len_sentence - 2) / 2
        
        self.bottleneck_size = channel_size * (len_sentence - 4) / 2
#         print ("Linear size : %sx(%s-2)/2"%(channel_size, len_sentence), self.bottleneck_size)
        assert self.bottleneck_size.is_integer()
        self.bottleneck_size = int(self.bottleneck_size) + self.x2_size
        
        self.fcns1 = [nn.Linear(self.bottleneck_size, fc_dim) for i in range(num_labels)]
        self.relu1 = [nn.ReLU(inplace=True) for i in range(num_labels)]
        self.fcns2 = [nn.Linear(fc_dim, 2) for i in range(num_labels)]
        
        
        for i in range(num_labels):
            self.add_module("fcn1-"+str(i), self.fcns1[i])
        for i in range(num_labels):
            self.add_module("relu1-"+str(i), self.relu1[i])
        for i in range(num_labels):
            self.add_module("fcn2-"+str(i), self.fcns2[i])
        
        self.fc_dim = fc_dim
        self.num_labels = num_labels
    
    def forward(self, sentence, other_features):
#         print("sentence ", sentence.shape)
        image = self.embedding(sentence)
#         print(bottleneck.shape)
        image.unsqueeze_(1)
#         print("image ", image.shape)
        
        bottleneck = self.conv2d(image)
        bottleneck.squeeze_(3)
        bottleneck = self.relu(bottleneck) # batch x channel x features
        bottleneck = self.dropout1d(bottleneck)
        bottleneck = self.pool1d(bottleneck)
#         print("bt shape ", bottleneck.shape)
        
        bottleneck = bottleneck.view(-1, self.bottleneck_size - self.x2_size)
        if self.x2_size > 0:
            bottleneck = torch.cat([bottleneck, other_features], dim=1)

        fcns_1 = []
        for i in range(self.num_labels):
            fcns_1.append(self.relu1[i](self.fcns1[i](bottleneck)))
        
        fcns_2 = []
        for i in range(self.num_labels):
            fcns_2.append(self.fcns2[i](fcns_1[i]))
            
        return fcns_2 # return num_labels


In [3]:
class config:
    vocab_size = 20000
    embedding_dim = 100 # TODO: max 300
    len_sentence = 100
    num_labels = 7 # TODO: 6 강추
    min_freq = 1
    batch_size = 64
    channel_size = 128
    seed = 0
    dropout = 0.5 # TODO: batch norm으로 대체 추천
    x2_size = 1

In [4]:

# seed 고정
torch.cuda.manual_seed_all(config.seed)
torch.manual_seed(config.seed)
random.seed(config.seed)
np.random.seed(config.seed)

In [5]:
def get_pd_data(path : str):
    df = pd.read_csv(path)
    return df

In [6]:
train = get_pd_data('./data/train.csv')

In [7]:
test = get_pd_data('./data/test.csv')

In [8]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


## Preprocess (1)
----
###  Set captial character ratio
- 문장 내의 대문자 비율을 뉴럴넷의 input으로 줌

In [9]:
def set_capital_ratio(df : pd.DataFrame):
    df['alphas'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isalpha()))
    df['capitals'] = df['comment_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['cap_ratio'] = df.apply(lambda row: float(row['capitals']) / (float(row['alphas']) + 1), axis=1)


In [10]:
set_capital_ratio(train), set_capital_ratio(test)

(None, None)

In [11]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,alphas,capitals,cap_ratio
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,203,17,0.083333
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,73,8,0.108108
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,186,4,0.02139
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,486,11,0.022587
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,50,2,0.039216


## Preprocess(2)
-----
### Word tokenize
- gensim의 tokenize function

In [12]:
from gensim.utils import simple_tokenize

In [13]:
def tokenizer(string : str):
    return [s for s in simple_tokenize(string)]

In [14]:
tk_train = train['comment_text'].str.lower().apply(tokenizer)
tk_test = train['comment_text'].str.lower().apply(tokenizer)

In [15]:
tk_train[:5]

0    [explanation, why, the, edits, made, under, my...
1    [d, aww, he, matches, this, background, colour...
2    [hey, man, i, m, really, not, trying, to, edit...
3    [more, i, can, t, make, any, real, suggestions...
4    [you, sir, are, my, hero, any, chance, you, re...
Name: comment_text, dtype: object

In [16]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
x_features = ['cap_ratio']

## Preprocess(3)
----
### Add Normal column label
- toxic하지 않은 label로 분류되는 것에, normal=1 의 새로운 라벨 추가

In [17]:
train['normal'] = 0
train.loc[train[labels].sum(axis=1) == 0, 'normal'] = 1

In [18]:
labels.append('normal')

In [19]:
y_labels = train[labels]
y_labels.head(n=10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,normal
0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1
5,0,0,0,0,0,0,1
6,1,1,1,0,1,0,0
7,0,0,0,0,0,0,1
8,0,0,0,0,0,0,1
9,0,0,0,0,0,0,1


In [20]:
# Add manually engineered features ex) capital ratio of sentence
x2 = train[x_features]
x2.head(n=10)

Unnamed: 0,cap_ratio
0,0.083333
1,0.108108
2,0.02139
3,0.022587
4,0.039216
5,0.021277
6,0.973684
7,0.043478
8,0.019391
9,0.033333


## Validation
----
### 10000 개의 Validation set

In [21]:
from sklearn.utils import shuffle

In [22]:
# x, y, z : pd.Dataframe of pd.series
def shuffle_x_y(x , x2, y):
    shuffler = np.random.permutation(len(x))
    x = x.iloc[shuffler]
    y = y.iloc[shuffler]
    x2 = x2.iloc[shuffler]
    return x, x2, y

In [23]:
tk_train, x2, y_labels = shuffle_x_y(tk_train, x2, y_labels)

In [24]:
valid_num = 10000

In [25]:
tk_valid = tk_train[-valid_num:]
y_valid = y_labels[-valid_num:]
tk_train = tk_train[:-valid_num]
y_labels = y_labels[:-valid_num]
x2_valid = x2[-valid_num:]
x2_train = x2[:-valid_num]

In [26]:
from torchtext import data, datasets

## Preprocess(3)
---
### torchtext.data.Field
- word dictionary, word to index 구현

In [27]:
TEXT = data.Field(sequential=True,  
                  # 들어갈 데이터가 sequential 인가요? 우리는 tokenize한 word의 sequence를 다룰거니까 True입니다. Defualt로도 True임.
                  tokenize=tokenizer, 
                  # 그 데이터를 tokenize할 함수를 지정할 수 있습니다. 우리는 gensim library의 tokenize 함수를 쓸건데요
                  # 뭐 굳이 그거 말고도 직접 정의해도 되고 str.split 같은걸 써넣어도 됩니다.
                  # :: 그런 줄 알았는데 아무 tokenize 함수나 쓰면 안되고, generator가 아닌 tokenized list 를 반환하는 함수여야합니다..
                  # :::: 이게 아닐거같기도 함.
                  fix_length=config.len_sentence,
                 # 아마 tokenize된 길이 제한 같은데 한번 확인해볼게요. 특이사항으로는 length 넘으면 자르고, 안넘으면 padding을 채웁니다
                  # :: 그게 아니고 vector화 했을 때의 길이 제한일 것 같아요. 확인해보겠습니다.
                  pad_first=True,
                  # padding이 앞에서부터 붙냐, 뒤에서부터 붙냐는 겁니다.
                  tensor_type=torch.cuda.LongTensor
                  # cuda를 써도 됩니다
                 )

In [28]:
TEXT.build_vocab(tk_train, tk_valid, max_size=config.vocab_size, min_freq=config.min_freq)

In [29]:
def batchify(tk_train, x2, y_labels, batch_size=32):
    for i in range(0, len(tk_train), batch_size):
        end = min(i+batch_size, len(tk_train))
        yield tk_train[i:end], x2[i:end], y_labels[i:end]

In [30]:
net = Net(vocab_size=config.vocab_size, embedding_dim=config.embedding_dim, len_sentence=config.len_sentence,
         x2_size=config.x2_size, channel_size=config.channel_size, dropout=config.dropout, num_labels=config.num_labels, batch_size=config.batch_size).cuda()

In [31]:
net

Net(
  (embedding): Embedding(20002, 100, padding_idx=1)
  (conv2d): Conv2d (1, 128, kernel_size=(5, 100), stride=(1, 1))
  (relu): ReLU(inplace)
  (dropout1d): Dropout(p=0.5)
  (pool1d): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,), ceil_mode=False, count_include_pad=True)
  (fcn1-0): Linear(in_features=6145, out_features=128)
  (fcn1-1): Linear(in_features=6145, out_features=128)
  (fcn1-2): Linear(in_features=6145, out_features=128)
  (fcn1-3): Linear(in_features=6145, out_features=128)
  (fcn1-4): Linear(in_features=6145, out_features=128)
  (fcn1-5): Linear(in_features=6145, out_features=128)
  (fcn1-6): Linear(in_features=6145, out_features=128)
  (relu1-0): ReLU(inplace)
  (relu1-1): ReLU(inplace)
  (relu1-2): ReLU(inplace)
  (relu1-3): ReLU(inplace)
  (relu1-4): ReLU(inplace)
  (relu1-5): ReLU(inplace)
  (relu1-6): ReLU(inplace)
  (fcn2-0): Linear(in_features=128, out_features=2)
  (fcn2-1): Linear(in_features=128, out_features=2)
  (fcn2-2): Linear(in_features=128, ou

In [32]:
optimizer = optim.Adam(net.parameters())
criterions = [nn.CrossEntropyLoss() for i in range(config.num_labels)]
# -> Binary

In [33]:
from tqdm import tqdm

In [34]:
def validation(net, tk_valid, x2_valid, y_valid : pd.DataFrame, TEXT : data.Field, criterions : list):
    net.train(False)
    val_corrects = [0 for i in range(config.num_labels)]
    val_score = []
    valid_loss = 0 
    sigmoid = nn.Sigmoid()
    for val_step, (batch_val, x2_val, y_val) in enumerate(batchify(tk_valid, x2_valid.values, y_valid.values, batch_size=config.batch_size)):
        y_total_loss = 0
        var_batch = TEXT.process(batch_val, device=0, train=False).transpose(dim0=0, dim1=1)
        var_y = Variable(torch.cuda.LongTensor(y_val)).transpose(dim0=0, dim1=1)
        var_x2 = Variable(torch.cuda.FloatTensor(x2_val))
        pred_score = net(var_batch, var_x2)
        val_score.append([sigmoid(score).data.cpu() for score in pred_score])
        for i, score in enumerate(pred_score):
            _, pred = score.max(dim=1)
            val_corrects[i] += (pred == var_y[i]).float().sum()
            y_loss = criterions[i](score, var_y[i])
            y_total_loss += y_loss
        valid_loss += y_total_loss
    net.train(True)
    valid_loss /= len(tk_valid) / config.batch_size
    
    for i, val_correct in enumerate(val_corrects):
        val_corrects[i] = val_corrects[i] / len(tk_valid)
        
    return valid_loss, val_corrects, val_score

In [41]:
train_corrects = [0 for i in range(config.num_labels)]
train_loss = 0
net.train(True)
for step, (batch, x2, y_label) in tqdm(enumerate(batchify(tk_train, x2_train.values, y_labels.values, batch_size=config.batch_size))):
    var_batch = TEXT.process(batch, device=0, train=True).transpose(dim0=0, dim1=1)
    var_y = Variable(torch.cuda.LongTensor(y_label)).transpose(dim0=0, dim1=1)
    var_x2 = Variable(torch.cuda.FloatTensor(x2))
    pred_score = net(var_batch, var_x2)
    
    net.zero_grad()
    y_total_loss = 0
    for i, score in enumerate(pred_score):
        _, pred = score.max(dim=1)
        train_corrects[i] += (pred == var_y[i]).float().sum()
        y_loss = criterions[i](score, var_y[i])
#         print(y_loss.data[0])
        y_total_loss += y_loss
    
    
    if step % 1000 == 999:
        valid_loss, valid_acc, val_score = validation(net, tk_valid, x2_valid, y_valid, TEXT, criterions)
        print("valid loss", valid_loss)
        print("valid acc", [i.data[0] for i in valid_acc])
        print("y_total_loss", y_total_loss.data[0])
    y_total_loss.backward()
    optimizer.step()
    

1012it [00:10, 94.75it/s]

valid loss Variable containing:
 0.4560
[torch.cuda.FloatTensor of size 1 (GPU 0)]

valid acc [0.9587999582290649, 0.9889999628067017, 0.9767999649047852, 0.9968999624252319, 0.9693999886512756, 0.9908999800682068, 0.9577999711036682]
y_total_loss 0.552423894405365


2013it [00:21, 94.52it/s]

valid loss Variable containing:
 0.4402
[torch.cuda.FloatTensor of size 1 (GPU 0)]

valid acc [0.9607999920845032, 0.9891999959945679, 0.9763000011444092, 0.9968999624252319, 0.9674999713897705, 0.9908999800682068, 0.9606999754905701]
y_total_loss 0.796908974647522


2338it [00:24, 95.65it/s]


In [42]:
valid_loss, valid_acc, val_score = validation(net, tk_valid, x2_valid, y_valid, TEXT, criterions)

In [43]:
predictions = [] # 모든 라벨의 validation set에 대한 예측값
for i in range(config.num_labels):
    prediction = [s[i] for s in val_score] # i번째 라벨에 대한 예측값을 배치에 따라 모음
    predictions.append(torch.cat(prediction))


In [44]:
from sklearn.metrics import roc_auc_score


## TODO
---
### roc_auc_score w.r.t. validation set's score
- Kaggle form에 맞추어 column-wise roc auc score 계산

In [45]:
roc_auc_scores = 0
for i in range(config.num_labels - 1): # minus 1 for normal 
    score = roc_auc_score( y_valid[labels[i]].values, predictions[i].numpy()[:, 1])
    print(score)
    roc_auc_scores += score

0.965643741112
0.983693775685
0.97803789524
0.964748785752
0.974100205935
0.946592009262


In [46]:
print(roc_auc_scores / (config.num_labels - 1))

0.968802735498


- 0.928248892508
- 0.928127157466
- 0.935732919807 - CHANNEL 24
- 0.934171154598 - CHANNEL 64
- 0.948519433931 - CHANNEL 64 + PURE RELU between FCN
- 0.949684088036 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN
- 0.949644094585 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.5
- 0.950924024766 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.5 + AvgPool1d between CNN
- 0.949550358684 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.3 + AvgPool1d between CNN
- 0.960614877434 - same + 2epoch
- 0.960766345684 - same + 3epoch
- 0.950225198889 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.3 + AvgPool1d between CNN + capital ratio
- 0.960447066963 - same + 2epoch
- 0.962149265317 - same + 3epoch
- 0.954162859988 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.3 + AvgPool1d between CNN + capital ratio + sentence length 50
- 0.962025767267 - same + 2epoch
- 0.965634499276 - same + 3epoch
- 0.964100160935 - same + 4epoch (score down)
- 0.962314010080 - same + 5epoch (score down)
- 0.959030453926 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.3 + AvgPool1d between CNN + capital ratio + sentence length 50 + embedding 100
- 0.964376057907 - same + 2epoch
- 0.964409415850 - same + 3epoch
- 0.963119039023 - same + 4epoch (score down)
- 0.959078240257 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.3 + AvgPool1d between CNN + capital ratio + sentence length 50 + embedding 100 + min_freq 1
- 0.963425059656 - same + 2epoch
- 0.964142162405 - same + 3epoch
- 0.958696961492 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.3 + AvgPool1d between CNN + capital ratio + sentence length 50 + embedding 100 + min_freq 1
- 0.965250520502 - same + 2epoch
- 0.964957852521 - same + 3epoch (score down)
- 0.959665545778 - CHANNEL 64 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.3 + AvgPool1d between CNN + sentence length 50 + embedding 100 + min_freq 1
- 0.964868120935 - same + 2epoch
- 0.965769582949 - same + 3epoch
- 0.966142587757 - same + 4epoch
- 0.963965925012 - same + 5epoch
- 0.968513983110 - CHANNEL 128 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.5 + AvgPool1d between CNN + sentence length 100 + embedding 100 + min_freq 1
- 0.970571357335 - same + 2epoch
- 0.971408464827 - same + 3epoch
- 0.969902034731 - same + 4epoch (score down)
- 0.966766524384 - CHANNEL 128 + PURE RELU between FCN + PURE RELU between CNN + dropout 0.5 + AvgPool1d between CNN + sentence length 100 + embedding 100 + min_freq 1 + kernel size 5
- 0.971493644385 - same + 2epoch
- 0.970291057843 - same + 3epoch (score down)
