In [2]:
import pandas as pd

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


import random

from collections import defaultdict

In [3]:
epoch = 3
special_token = 2 #Unknown word, embedding
max_len = 100
batch_size = 100
batch_per_epoch = 1500

In [8]:
def replace(string):
    c = '!#$%^&*(.)[]{};:,/"<>?-`@\'~”—=_·\n+123456“7890‘’'
    for i in range(len(c)):
        string = string.replace(c[i]," ")
    return(string)
data_set = pd.read_csv('./src/train.csv')
test_set = pd.read_csv('./src/test.csv')

In [9]:
data = replace(data_set["comment_text"].str.cat(sep = "|zkr")).split("|zkr")
data_test = replace(test_set["comment_text"].str.cat(sep = "|zkr")).split("|zkr")
test_id = test_set["id"].str.cat(sep = "|").split("|")

In [10]:
sentence = list(" ".join(d.split()) for d in data)
test_sentence = list(" ".join(d.split()) for d in data_test)

In [11]:
words = set(("".join([j for i in sentence for j in i])).split())

In [12]:
word_to_ix = defaultdict(int)
for i,word in enumerate(words):
    word_to_ix[word] = i+special_token
word_to_ix["<unk>"] = 0
word_to_ix["<pad>"] = 1

In [13]:
def sentence_to_dict(sentence,max_len,word_to_ix):
    s_split = sentence.split()
    indi = [word_to_ix[word] for word in s_split]
    if len(indi) < max_len:
        indi += [word_to_ix["<pad>"]] * (max_len - len(indi))
    else:
        indi = indi[:max_len]
    return indi

In [14]:
def create_batch(sentences,max_len,word_to_ix,batch_size,data_set):
    correct = []
    sentence = []
    for _ in range(batch_size):
        index = random.randint(0,len(sentences)-1)
        sentence.append(sentences[index])
        correct.append([data_set["toxic"][index],data_set["severe_toxic"][index],data_set["obscene"][index],data_set["threat"][index],data_set["insult"][index],\
                        data_set["identity_hate"][index]])
    sen_idx = [sentence_to_dict(sen,max_len,word_to_ix) for sen in sentence]
    return sen_idx,correct

In [15]:
def validate_batch(test_sentence,max_len,word_to_ix,batch_size,test_id,test_index):
    test_id_batch = []
    test_sentence_batch = []
    end = test_index+batch_size if test_index+batch_size < len (test_id) else len (test_id)
    for i in range(test_index,end):
        test_id_batch.append(test_id[i])
        test_sentence_batch.append(test_sentence[i])
    test_sen_idx = [sentence_to_dict(sen,max_len,word_to_ix) for sen in test_sentence_batch]
    return test_sen_idx,test_id_batch

In [16]:
class Classifier(nn.Module):
    def __init__(self,vocab,max_len,special_token,output_nb = 12,dim_emb=70):
        super(Classifier, self).__init__()
        self.max_len = max_len
        self.dim_emb = dim_emb
        self.embedding = nn.Embedding(len(vocab)+special_token, dim_emb)
        self.linear = nn.Linear(max_len*dim_emb,output_nb)
    def forward(self, vector):
        p = self.embedding(vector)
        p = p.view(-1,self.max_len*self.dim_emb)
        p = self.linear(p)
        #nl = nn.Softmax(0)
        #p = nl(p)
        return p

In [17]:
clf = Classifier(words,max_len,special_token)

criterion = nn.CrossEntropyLoss()#

optimizer = optim.Adagrad(clf.parameters())

In [64]:
avg_loss = 0
for e in range(epoch):
    for batch_idx in range(batch_per_epoch):

        batch,Y = create_batch(sentence,max_len,word_to_ix,batch_size,data_set)

        batch_tensor = torch.LongTensor(batch)
        Y = torch.LongTensor(Y)

        optimizer.zero_grad()
        result = clf(batch_tensor)

        loss = criterion(result.view(-1,2),Y.view(-1)) 

        avg_loss += loss.item()
        loss.backward()

        optimizer.step()

        if batch_idx%500 == 0:
            print("Epoch: ",e,"|Batch:",batch_idx,"|Loss :", round(avg_loss/500, 8))
            avg_loss = 0

Epoch:  0 |Batch: 0 |Loss : 0.00029613
Epoch:  0 |Batch: 500 |Loss : 0.08859602
Epoch:  0 |Batch: 1000 |Loss : 0.08679001
Epoch:  1 |Batch: 0 |Loss : 0.08172032
Epoch:  1 |Batch: 500 |Loss : 0.07964151
Epoch:  1 |Batch: 1000 |Loss : 0.07610735
Epoch:  2 |Batch: 0 |Loss : 0.07317241


KeyboardInterrupt: 

In [65]:
#this is gonna change
#first no thinking try

nl = nn.Softmax(-1)
sub = [[] for _ in range(len(test_sentence))]
for test_index in range(0,len(test_sentence),batch_size):

        batch,identity = validate_batch(test_sentence,max_len,word_to_ix,batch_size,test_id,test_index)

        batch_tensor = torch.LongTensor(batch)

        result = clf(batch_tensor)
        
        p = nl(nl(result.view(-1,2)))
        
        end = batch_size if test_index+batch_size < len (test_id) else len (identity)
        for i in range(0,end):
            sub[test_index+i].append(identity[i])
            for j in range(0,6):
                sub[test_index+i].append(p[i*6+j][1].item())

submission = pd.DataFrame([i[0] for i in sub], columns=['id'])
submission["toxic"] = [i[1] for i in sub]
submission["severe_toxic"] = [i[2] for i in sub]
submission["obscene"] = [i[3] for i in sub]
submission["threat"] = [i[4] for i in sub]
submission["insult"] = [i[5] for i in sub]
submission["identity_hate"] = [i[6] for i in sub]
submission.to_csv('../src/submission.csv',index=False)