In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader, Dataset, TensorDataset

In [2]:
from collections import OrderedDict

In [3]:
class Net(nn.Module):
    def __init__(self, 
                 vocab_size,
                 embedding_dim,
                 len_sentence,
                 channel_size=4,
                 fc_dim=128,
                 padding_idx=1,
                 dropout=0.3,
                 num_labels=7,
                 batch_size=32,
                ):
        super(Net, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim, padding_idx=padding_idx)
        self.embedding_dim = embedding_dim
        self.vocab_size = vocab_size
        self.channel_size = channel_size
        self.len_sentence = len_sentence
        self.batch_size = batch_size
        
        self.convnet = nn.Sequential(OrderedDict(
        [
            ('layer 1', nn.Conv2d(1, out_channels=channel_size, kernel_size=3, stride=1, padding=0)), # input : 1 * embedding_dim * len_sentence => 
            ('relu 1', nn.LeakyReLU(inplace=True)), # output : channel_size * (embedding_dim-2) * (len_sentence-2)
            ('avgpool 1', nn.AvgPool2d(2, stride=2)), # output : channel_size * (embedding_dim-2)/2 * (len_sentence-2)/2
            ('batchnorm 1', nn.BatchNorm2d(channel_size)),
            ('dropout 1', nn.Dropout2d(p=dropout, inplace=True)),
            ('layer 2', nn.Conv2d(in_channels=channel_size, out_channels=channel_size*2, kernel_size=3, stride=1, padding=0)), 
            ('relu 2', nn.LeakyReLU(inplace=True)), 
            ('avgpool 2', nn.AvgPool2d(2, stride=2)), # output : channel_size * 2 * ((embedding_dim-2)/2 -2)/2 * (((len_sentence-2)/2)-2)/2  
        ]))
        
        self.bottleneck_size = channel_size * ((embedding_dim-2)/2 -2) * (((len_sentence-2)/2)-2)/2 
        print ("Linear size : ", self.bottleneck_size)
        assert self.bottleneck_size.is_integer()
        self.bottleneck_size = int(self.bottleneck_size)
        self.fcns1 = [nn.Linear(self.bottleneck_size, fc_dim) for i in range(num_labels)]
        self.fcns2 = [nn.Linear(fc_dim, 2) for i in range(num_labels)]
        self.fc_dim = fc_dim
        self.num_labels = num_labels
    
    def forward(self, sentence, other_features=None):
        
        image = self.embedding(sentence)
        
        bottleneck = self.convnet(image)
        print(bottleneck.shape)
        bottleneck = bottleneck.view(-1, self.bottleneck_size)
        
        fcns_1 = []
        for i in range(self.num_labels):
            fcns_1.append(self.fcns1[i](bottleneck))
        
        fcns_2 = []
        for i in range(self.num_labels):
            fcns_2.append(self.fcns2[i](fcns_1[i]))
            
        return fcns_2 # return num_labels


In [4]:
class config:
    vocab_size = 20000
    embedding_dim = 50
    len_sentence = 30
    num_labels = 7

In [5]:
from load_data import *



In [6]:
train = get_pd_data('./data/train.csv')

In [7]:
test = get_pd_data('./data/test.csv')

In [8]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
set_capital_ratio(train), set_capital_ratio(test)

(None, None)

In [10]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,alphas,capitals,cap_ratio
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,203,17,0.083333
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,73,8,0.108108
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,186,4,0.02139
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,486,11,0.022587
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,50,2,0.039216


In [19]:
tk_train = train['comment_text'].str.lower().apply(tokenizer)
tk_test = train['comment_text'].str.lower().apply(tokenizer)

In [20]:
from torchtext import data, datasets

In [21]:
TEXT = data.Field(sequential=True,  
                  # 들어갈 데이터가 sequential 인가요? 우리는 tokenize한 word의 sequence를 다룰거니까 True입니다. Defualt로도 True임.
                  tokenize=tokenizer, 
                  # 그 데이터를 tokenize할 함수를 지정할 수 있습니다. 우리는 gensim library의 tokenize 함수를 쓸건데요
                  # 뭐 굳이 그거 말고도 직접 정의해도 되고 str.split 같은걸 써넣어도 됩니다.
                  # :: 그런 줄 알았는데 아무 tokenize 함수나 쓰면 안되고, generator가 아닌 tokenized list 를 반환하는 함수여야합니다..
                  # :::: 이게 아닐거같기도 함.
                  fix_length=20,
                 # 아마 tokenize된 길이 제한 같은데 한번 확인해볼게요. 특이사항으로는 length 넘으면 자르고, 안넘으면 padding을 채웁니다
                  # :: 그게 아니고 vector화 했을 때의 길이 제한일 것 같아요. 확인해보겠습니다.
                  pad_first=True,
                  # padding이 앞에서부터 붙냐, 뒤에서부터 붙냐는 겁니다.
                  tensor_type=torch.cuda.LongTensor
                  # cuda를 써도 됩니다
                 )

In [22]:
TEXT.build_vocab(tk_train, tk_test)

In [24]:
tk_train[:5]

0    [explanation, why, the, edits, made, under, my...
1    [d, aww, he, matches, this, background, colour...
2    [hey, man, i, m, really, not, trying, to, edit...
3    [more, i, can, t, make, any, real, suggestions...
4    [you, sir, are, my, hero, any, chance, you, re...
Name: comment_text, dtype: object

In [33]:
rt = TEXT.process(tk_train[:10], device=0, train=False)

In [34]:
rt

Variable containing:

Columns 0 to 5 
 6.6000e+02  1.0000e+00  4.0300e+02  6.2000e+01  1.0000e+00  1.0000e+00
 7.8000e+01  1.0000e+00  4.1400e+02  4.0000e+00  1.0000e+00  1.0000e+00
 2.0000e+00  1.0000e+00  4.0000e+00  3.8000e+01  1.0000e+00  1.0000e+00
 1.3100e+02  1.0000e+00  7.1000e+01  2.3000e+01  1.0000e+00  1.0000e+00
 1.3400e+02  1.5200e+02  1.3800e+02  1.0000e+02  1.0000e+00  1.0000e+00
 1.7900e+02  1.5879e+04  1.5000e+01  5.8000e+01  1.0000e+00  1.0000e+00
 3.2000e+01  5.2000e+01  2.4900e+02  3.3500e+02  7.0000e+00  1.0000e+00
 6.4800e+02  2.5130e+03  3.0000e+00  1.3850e+03  1.6210e+03  1.0000e+00
 4.3040e+03  1.4000e+01  7.7000e+01  1.6000e+01  2.1000e+01  1.0000e+00
 1.0898e+04  5.4400e+02  3.1300e+02  2.0140e+03  3.2000e+01  1.0000e+00
 9.9900e+02  3.6070e+03  1.1000e+01  4.0000e+00  3.2910e+03  2.5490e+03
 8.8000e+01  4.0000e+00  1.9000e+01  5.7140e+03  5.8000e+01  3.4000e+01
 3.2900e+02  7.1000e+01  5.5000e+01  2.5000e+01  1.0240e+03  3.9000e+01
 5.0000e+01  4.3110e+03  1

In [None]:
import pandas as pd

In [None]:
train_set = pd.read_csv('./data/train.csv')
test_set = pd.read_csv('./data/test.csv')
train_set.head()

In [None]:
from gensim.utils import tokenize, simple_tokenize

In [None]:
valid_ratio = 0.2
train_num = int(len(train_set) * (1-valid_ratio))
train_datasets = train_set.loc[0:train_num]
valid_datasets = train_set.loc[train_num:]
valid_datasets.head()

In [None]:
train_datasets.to_csv('./data/train_cached.csv', index=False)
valid_datasets.to_csv('./data/valid_cached.csv', index=False)

In [None]:
from load_data import *

In [None]:
from torchtext import data, datasets

In [None]:
TEXT = data.Field(sequential=True,  
                  # 들어갈 데이터가 sequential 인가요? 우리는 tokenize한 word의 sequence를 다룰거니까 True입니다. Defualt로도 True임.
                  tokenize=tokenizer, 
                  # 그 데이터를 tokenize할 함수를 지정할 수 있습니다. 우리는 gensim library의 tokenize 함수를 쓸건데요
                  # 뭐 굳이 그거 말고도 직접 정의해도 되고 str.split 같은걸 써넣어도 됩니다.
                  # :: 그런 줄 알았는데 아무 tokenize 함수나 쓰면 안되고, generator가 아닌 tokenized list 를 반환하는 함수여야합니다..
                  # :::: 이게 아닐거같기도 함.
                  fix_length=20,
                 # 아마 tokenize된 길이 제한 같은데 한번 확인해볼게요. 특이사항으로는 length 넘으면 자르고, 안넘으면 padding을 채웁니다
                  # :: 그게 아니고 vector화 했을 때의 길이 제한일 것 같아요. 확인해보겠습니다.
                  pad_first=True,
                  # padding이 앞에서부터 붙냐, 뒤에서부터 붙냐는 겁니다.
                  tensor_type=torch.LongTensor
                  # cuda를 써도 됩니다
                 )



In [None]:
set_capital_ratio(train_set)

In [None]:
comments = train_set['comment_text'].apply(tokenizer)

In [None]:
comments

In [None]:
TEXT.build_vocab(train, val, # 몇 개의 단어 셋을 줘도 괜찮습니다.
                 max_size=config.vocab_size,  # 단어사전의 최대 크기입니다. 여기에는 padding word, unknown word는 포함되지 않습니다.
                 min_freq=3  # 몇 개 이상 나타나는 단어에 대해서만 vocab을 생성합니다. 
                )

In [None]:
aa

In [None]:
net = Net(vocab_size=config.vocab_size, embedding_dim=config.embedding_dim, len_sentence=config.len_sentence, batch_size=10)

In [None]:
net

In [None]:
inputs = torch.randn((20, 1, 50, 30))

In [None]:
net(Variable(inputs))