In [2]:
import embedding_prepare
import clean_data
import create_table 
import multiprocessing
import pandas as pd
import processing 
from tqdm import tqdm
import numpy as np
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.python.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
from tensorflow.python.keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from tensorflow.python.keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate, Add
from tensorflow.python.keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.models import Model
from tensorflow.python.keras import backend as K
from keras.engine.topology import Layer
from tensorflow.python.keras import initializers, regularizers, constraints, optimizers, layers

In [3]:
Table = create_table.uncommon_char_table()
data = pd.read_csv('train.csv')

In [4]:
len(data)

1804874

In [4]:
dat = data.comment_text.tolist()
text_list = []

In [5]:
for d in tqdm(dat):
    text_list.append(clean_data.normalize(d))

100%|██████████| 1804874/1804874 [21:37<00:00, 1391.07it/s]


In [6]:
X_train, X_test, y_train, y_test = processing.clean(text_list,data)

In [7]:
x_train,x_test, word_dict = processing.token(X_train,X_test)

In [8]:
glove_matrix, _ = embedding_prepare.gensim_to_embedding_matrix(word_dict,"glove.840B.300d.txt",)
wiki_matrix, _ = embedding_prepare.gensim_to_embedding_matrix(word_dict,"wiki-news-300d-1M.vec",)
embedding_matrix = np.concatenate([glove_matrix, wiki_matrix], axis=-1)

2196017it [02:36, 13988.54it/s]
999995it [01:06, 15024.41it/s]


In [16]:
x_train_torch = torch.tensor(x_train, dtype=torch.long)
x_test_torch = torch.tensor(x_test, dtype=torch.long)
y_train_torch = torch.tensor(y_train, dtype=torch.float32)
features = len(word_dict)+1

In [21]:
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch)

In [17]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    
        x = x.permute(0, 3, 2, 1) 
        x = super(SpatialDropout, self).forward(x)  
        x = x.permute(0, 3, 2, 1) 
        x = x.squeeze(2) 
        return x

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [18]:
class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix, num_aux_targets):
        super(NeuralNet, self).__init__()
        self.embedding = nn.Embedding(features, 600)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)
        
        self.lstm1 = nn.LSTM(600, 128, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(128 * 2, 128, bidirectional=True, batch_first=True)
        
        self.linear1 = nn.Linear(512, 512)
        self.linear2 = nn.Linear(512, 512)
        
        self.linear_out = nn.Linear(512, 1)
        self.linear_aux_out = nn.Linear(512, num_aux_targets)
        
    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)
        
        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        avg_pool = torch.mean(h_lstm2, 1)
        max_pool, _ = torch.max(h_lstm2, 1)
        
        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))
        
        hidden = h_conc + h_conc_linear1 + h_conc_linear2
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        
        out = torch.cat([result, aux_result], 1)
        return out 

In [19]:
model = NeuralNet(embedding_matrix, 6)

In [27]:
def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4,
                enable_checkpoint_ensemble=True):
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]
    
    for epoch in range(n_epochs):
        
        scheduler.step()
        
        model.train()
        avg_loss = 0.
        
        for x_batch, y_batch in tqdm(train_loader, disable=False):
            y_pred = model(x_batch)            
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
            
        model.eval()
        test_preds = np.zeros((len(test), output_dim))
        
        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())

            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        
        print('Epoch {}/{} \t loss={:.4f}'.format(
              epoch + 1, n_epochs, avg_loss))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)    
    else:
        test_preds = all_test_preds[-1]
        
    return test_preds

In [28]:
all_test_preds = []
test_preds = train_model(model, train_dataset, test_dataset, output_dim=y_train_torch.shape[-1], 
                             loss_fn=nn.BCEWithLogitsLoss(reduction='mean'))
all_test_preds.append(test_preds)
print()

100%|██████████| 2468/2468 [6:18:55<00:00,  8.09s/it]  
  0%|          | 0/2468 [00:00<?, ?it/s]

Epoch 1/4 	 loss=0.1041


100%|██████████| 2468/2468 [18:00:01<00:00,  7.76s/it]     
  0%|          | 0/2468 [00:00<?, ?it/s]

Epoch 2/4 	 loss=0.1026


100%|██████████| 2468/2468 [19:26:31<00:00,  7.93s/it]      
  0%|          | 0/2468 [00:00<?, ?it/s]

Epoch 3/4 	 loss=0.1016


100%|██████████| 2468/2468 [94:38:22<00:00,  8.78s/it]       


Epoch 4/4 	 loss=0.1009



In [39]:
test_preds[0]

array([0.17992039, 0.25732917, 0.01291539, 0.01643718, 0.06532176,
       0.167978  , 0.08475173])

In [43]:
len(y_test)

541463

In [42]:
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in tqdm([i * 0.01 for i in range(100)]):
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result

In [46]:
ys = []
for y in y_test:
    ys.append(int(y[0]))

In [48]:
preds = []
for p in test_preds:
    if p[0] > 0.5:
        preds.append(1)
    else:
        preds.append(0)

In [50]:
accuracy_score(ys,preds)

0.9541409108286254

In [64]:
loss = tf.keras.losses.BinaryCrossentropy(
    from_logits=True,
    reduction=tf.keras.losses.Reduction.NONE)

In [9]:
inp = Input(shape=(200,))
x = Embedding(features-1, 600, weights=[embedding_matrix], trainable=False)(inp)
x = SpatialDropout1D(0.3)(x)

x1 = Bidirectional(LSTM(128, return_sequences=True))(x)
x2 = Bidirectional(LSTM(128, return_sequences=True))(x1)

max_pool = GlobalMaxPooling1D()(x2)
avg_pool = GlobalAveragePooling1D()(x2)

conc = Concatenate()([max_pool, avg_pool])

linear1 = Dense(512, activation='relu')(conc)
linear2 = Dense(512, activation='relu')(conc)

hidden = Add()([conc, linear1, linear2])

result = Dense(1)(conc)
result_aux = Dense(6)(conc)

output = Concatenate()([result, result_aux])

model = Model(inp, output)
adam = optimizers.Adam(lr=0.001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

NameError: name 'features' is not defined

In [None]:
model.fit(x_train, y_train, batch_size=512, epochs=1, verbose=1)

In [77]:
y2 = []
for y in y_train:
    y2.append(y[0])

In [85]:
len(x_train)

1263411

In [None]:
steps_per_epoch = np.ceil(len(x_train) / 512)