In [197]:
import re
import sys
import unicodedata
import emoji
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
import pandas as pd
import multiprocessing
from nltk import TweetTokenizer
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing import text, sequence
from gensim.models import KeyedVectors
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

In [2]:
CUSTOM_TABLE = str.maketrans(
    {
        "\xad": None,
        "\x7f": None,
        "\ufeff": None,
        "\u200b": None,
        "\u200e": None,
        "\u202a": None,
        "\u202c": None,
        "‘": "'",
        "’": "'",
        "`": "'",
        "“": '"',
        "”": '"',
        "«": '"',
        "»": '"',
        "ɢ": "G",
        "ɪ": "I",
        "ɴ": "N",
        "ʀ": "R",
        "ʏ": "Y",
        "ʙ": "B",
        "ʜ": "H",
        "ʟ": "L",
        "ғ": "F",
        "ᴀ": "A",
        "ᴄ": "C",
        "ᴅ": "D",
        "ᴇ": "E",
        "ᴊ": "J",
        "ᴋ": "K",
        "ᴍ": "M",
        "Μ": "M",
        "ᴏ": "O",
        "ᴘ": "P",
        "ᴛ": "T",
        "ᴜ": "U",
        "ᴡ": "W",
        "ᴠ": "V",
        "ĸ": "K",
        "в": "B",
        "м": "M",
        "н": "H",
        "т": "T",
        "ѕ": "S",
        "—": "-",
        "–": "-",
    }
)


In [3]:
WORDS_REPLACER = [
    ("sh*t", "shit"),
    ("s**t", "shit"),
    ("f*ck", "fuck"),
    ("fu*k", "fuck"),
    ("f**k", "fuck"),
    ("f*****g", "fucking"),
    ("f***ing", "fucking"),
    ("f**king", "fucking"),
    ("p*ssy", "pussy"),
    ("p***y", "pussy"),
    ("pu**y", "pussy"),
    ("p*ss", "piss"),
    ("b*tch", "bitch"),
    ("bit*h", "bitch"),
    ("h*ll", "hell"),
    ("h**l", "hell"),
    ("cr*p", "crap"),
    ("d*mn", "damn"),
    ("stu*pid", "stupid"),
    ("st*pid", "stupid"),
    ("n*gger", "nigger"),
    ("n***ga", "nigger"),
    ("f*ggot", "faggot"),
    ("scr*w", "screw"),
    ("pr*ck", "prick"),
    ("g*d", "god"),
    ("s*x", "sex"),
    ("a*s", "ass"),
    ("a**hole", "asshole"),
    ("a***ole", "asshole"),
    ("a**", "ass"),
]

In [17]:
REGEX_REPLACER = []
for origin, new in WORDS_REPLACER:
    o1 = origin.replace("*", "\*")
    REGEX_REPLACER.append((re.compile(o1), new))
RE_SPACE = re.compile(r"\s")
RE_MULTI_SPACE = re.compile(r"\s+")

In [25]:
invalid_lst = []
for i in range(sys.maxunicode+1):
    if unicodedata.category(chr(i)) == "Mn":
        invalid_lst.append(i)
NMS_TABLE = dict.fromkeys(ind for ind in invalid_lst)

In [5]:
HEBREW_TABLE = {i: "א" for i in range(0x0590, 0x05FF)}
ARABIC_TABLE = {i: "ا" for i in range(0x0600, 0x06FF)}
CHINESE_TABLE = {i: "是" for i in range(0x4E00, 0x9FFF)}
KANJI_TABLE = {i: "ッ" for i in range(0x2E80, 0x2FD5)}
HIRAGANA_TABLE = {i: "ッ" for i in range(0x3041, 0x3096)}
KATAKANA_TABLE = {i: "ッ" for i in range(0x30A0, 0x30FF)}


In [26]:
TABLE = dict()
TABLE.update(CUSTOM_TABLE)
TABLE.update(NMS_TABLE)
TABLE.update(CHINESE_TABLE)
TABLE.update(HEBREW_TABLE)
TABLE.update(ARABIC_TABLE)
TABLE.update(HIRAGANA_TABLE)
TABLE.update(KATAKANA_TABLE)
TABLE.update(KANJI_TABLE)

In [27]:
EMOJI_REGEXP = emoji.get_emoji_regexp()

In [31]:
UNICODE_EMOJI_MY = {}
for k, v in emoji.UNICODE_EMOJI_ALIAS.items():
    v = v.strip(':')
    v = v.replace('_', ' ')
    UNICODE_EMOJI_MY[k] = f" EMJ {v} "

In [10]:
def replace(match):
    return UNICODE_EMOJI_MY.get(match.group(0))

def my_demojize(string):
    return re.sub("\ufe0f", "", EMOJI_REGEXP.sub(replace, string))

def normalize(text):
    text = my_demojize(text)
    text = RE_SPACE.sub(" ", text)
    text = unicodedata.normalize("NFKD", text)
    text = text.translate(TABLE)
    text = RE_MULTI_SPACE.sub(" ", text).strip()
    for pattern, repl in REGEX_REPLACER:
        text = pattern.sub(repl, text)

    return text

In [35]:
PORTER_STEMMER = PorterStemmer()
LANCASTER_STEMMER = LancasterStemmer()
SNOWBALL_STEMMER = SnowballStemmer("english")

In [36]:
def word_forms(word):
    yield word
    yield word.lower()
    yield word.upper()
    yield word.capitalize()
    yield PORTER_STEMMER.stem(word)
    yield LANCASTER_STEMMER.stem(word)
    yield SNOWBALL_STEMMER.stem(word)

In [38]:
def maybe_get_embedding(word, model):
    for form in word_forms(word):
        if form in model:
            return model[form]

    word = word.strip("-'")
    for form in word_forms(word):
        if form in model:
            return model[form]

    return None

In [117]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def gensim_to_embedding_matrix(word2index, path):
    model = load_embeddings(path)
    embedding_matrix = np.zeros((max(word2index.values()) + 1, 300), dtype=np.float32)
    unknown_words = []

    for word, i in word2index.items():
        maybe_embedding = maybe_get_embedding(word, model)
        if maybe_embedding is not None:
            embedding_matrix[i] = maybe_embedding
        else:
            unknown_words.append(word)

    return embedding_matrix, unknown_words

In [77]:
data = pd.read_csv('train.csv')

In [46]:
with multiprocessing.Pool(processes=2) as pool:
     text_list = pool.map(normalize, data.comment_text.tolist())

In [175]:
x = text_list
y = np.where(data['target'] >= 0.5, 1, 0)
y_aux_train = data[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
yy = np.hstack([y[:, np.newaxis], y_aux_train])

In [176]:
X_train, X_test, y_train, y_test = train_test_split(x, yy, test_size=0.3, random_state=2019)

In [177]:
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

x_train = []
word_dict = {}
word_index = 1

for doc in X_train:
    word_seq = []
    for word in tknzr.tokenize(doc):
        if word not in word_dict:
            word_dict[word] = word_index
            word_index += 1
        word_seq.append(word_dict[word])
    x_train.append(word_seq)

In [178]:
x_train = sequence.pad_sequences(x_train, maxlen=200,padding = 'post')

In [179]:
word_dict['unknown-words-in-test'] = 0 

In [180]:
glove_matrix, _ = gensim_to_embedding_matrix(word_dict,"glove.840B.300d.txt",)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [181]:
wiki_matrix, _ = gensim_to_embedding_matrix(word_dict,"wiki-news-300d-1M.vec",)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [182]:
embedding_matrix = np.concatenate([glove_matrix, wiki_matrix], axis=-1)
embedding_matrix.shape

(476648, 600)

In [183]:
x_test = []
for doc in X_test:
    word_seq = []
    for word in tknzr.tokenize(doc):
        if word in word_dict:
            word_seq.append(word_dict[word])
        else:
            word_seq.append(0)
    x_test.append(word_seq)

In [184]:
x_test = sequence.pad_sequences(x_test, maxlen=200,padding = 'post')

In [185]:
x_test

array([[    0, 30600,    10, ...,     0,     0,     0],
       [   51,    52,   298, ...,     0,     0,     0],
       [   95,   187,    10, ...,     0,     0,     0],
       ...,
       [ 3683,    53,   592, ...,     0,     0,     0],
       [  350,     9,    10, ...,     0,     0,     0],
       [ 8365,    34,    35, ...,     0,     0,     0]], dtype=int32)

In [186]:
x_train_torch = torch.tensor(x_train, dtype=torch.long)
x_test_torch = torch.tensor(x_test, dtype=torch.long)
y_train_torch = torch.tensor(y_train, dtype=torch.float32)

In [187]:
features = len(word_dict)+1

In [188]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    
        x = x.permute(0, 3, 2, 1) 
        x = super(SpatialDropout, self).forward(x)  
        x = x.permute(0, 3, 2, 1) 
        x = x.squeeze(2) 
        return x

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [189]:
class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix, num_aux_targets):
        super(NeuralNet, self).__init__()
        self.embedding = nn.Embedding(features, 600)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(600, 128, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(128 * 2, 128, bidirectional=True, batch_first=True)
        
        self.linear1 = nn.Linear(512, 512)
        self.linear2 = nn.Linear(512, 512)
        
        self.linear_out = nn.Linear(512, 1)
        self.linear_aux_out = nn.Linear(512, num_aux_targets)
        
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        avg_pool = torch.mean(h_lstm2, 1)
        max_pool, _ = torch.max(h_lstm2, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        
        out = torch.cat([result, aux_result], 1)
        return out 

In [190]:
model = NeuralNet(embedding_matrix, 6)

In [1]:
def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4,
                enable_checkpoint_ensemble=True):
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    
    for epoch in range(n_epochs):
        
        scheduler.step()
        
        model.train()
        avg_loss = 0.
        
        for x_batch, y_batch in tqdm(train_loader, disable=False):
            x_batch = data[:-1]
            y_batch = data[-1]

            y_pred = model(*x_batch)            
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
            
        model.eval()
        test_preds = np.zeros((len(test), output_dim))
        
        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())

            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)    
    else:
        test_preds = all_test_preds[-1]
        
    return test_preds

In [2]:
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch)

all_test_preds = []

    
    
test_preds = train_model(model, train_dataset, test_dataset, output_dim=y_train_torch.shape[-1], 
                             loss_fn=nn.BCEWithLogitsLoss(reduction='mean'))
all_test_preds.append(test_preds)
print()

NameError: name 'data' is not defined

In [150]:
torch.cat((torch.tensor([0,0]), torch.tensor([1,1])), 1)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
mispell_dict = {"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not",
"tryin'":"trying"}