In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
# The loop variable `dirname` is deliberately kept under this name: later
# cells read it after the loop to build their file paths.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, entry) for entry in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dl-kaggle-dataset/cleaned_train_x.csv
/kaggle/input/dl-kaggle-dataset/cleaned_val_x.csv
/kaggle/input/dl-kaggle-dataset/Train.py
/kaggle/input/dl-kaggle-dataset/cleaned_test_x.csv
/kaggle/input/dl-kaggle-dataset/train_y.csv
/kaggle/input/dl-kaggle-dataset/train_x.csv
/kaggle/input/dl-kaggle-dataset/test_x.csv
/kaggle/input/dl-kaggle-dataset/glove.840B.300d.txt
/kaggle/input/dl-kaggle-dataset/DataPreprocessing.py
/kaggle/input/dl-kaggle-dataset/val_x.csv
/kaggle/input/dl-kaggle-dataset/val_y.csv
/kaggle/input/dl-kaggle-dataset/rnn_baseline.py
/kaggle/input/dl-kaggle-dataset/cleanwords.txt
/kaggle/input/dl-kaggle-dataset/DataLoader.py


In [4]:
import tensorflow as tf
# Report whether TensorFlow can see at least one GPU device.
gpu_devices = tf.config.list_physical_devices('GPU')
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

GPU is available


In [5]:
import torch
import pickle
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torchmetrics import AUROC, F1Score
from keras.preprocessing.text import Tokenizer

# import module we'll need to import our custom module
from shutil import copyfile
# copy our file into the working directory (make sure it has .py suffix)
# (source, destination) pairs for the helper modules shipped with the dataset.
# They are copied next to the notebook so the `from ... import ...` lines
# that follow can find them.
_MODULE_COPIES = (
    ("/kaggle/input/dl-kaggle-dataset/DataLoader.py", "../working/DataLoader.py"),
    ("/kaggle/input/dl-kaggle-dataset/rnn_baseline.py", "../working/rnn_baseline.py"),
    ("/kaggle/input/dl-kaggle-dataset/Train.py", "../working/Train.py"),
    ("/kaggle/input/dl-kaggle-dataset/DataPreprocessing.py", "../working/DataPreprocessing.py"),
)
for module_src, module_dst in _MODULE_COPIES:
    copyfile(src=module_src, dst=module_dst)
# import all our functions
from DataLoader import DataLoader
from rnn_baseline import get_av_rnn
from Train import Trainer
from DataPreprocessing import get_clean_word_dict, get_clean_data

In [6]:
# Directory holding the competition files. It is stated explicitly instead of
# reusing the `dirname` loop variable left behind by the os.walk listing cell,
# whose final value depends on which directory happened to be walked last.
DATA_DIR = '/kaggle/input/dl-kaggle-dataset'
# Later cells still build their paths from `dirname`; pin it to the dataset
# directory so they resolve the same files regardless of walk order.
dirname = DATA_DIR

# Clean-word lookup consumed by get_clean_data in the cleaning cell.
cl_path = os.path.join(DATA_DIR, 'cleanwords.txt')
clean_word_dict = get_clean_word_dict(cl_path)

# Pre-trained GloVe vectors; load_embedding is handed a list of paths.
glove_path = os.path.join(DATA_DIR, 'glove.840B.300d.txt')
embedding_path = [glove_path]

MAX_SEQUENCE_LENGTH = 400   # sequence length passed to tokenize() and to the model input
MAX_FEATURES = 100000       # vocabulary cap given to the Tokenizer and the embedding matrix
EMBEDDING_DIM = 300         # width of each embedding vector

# Seed every random number generator in play. The models trained below are
# Keras/TensorFlow models, so seeding torch alone does not make runs repeatable.
torch.manual_seed(0)
np.random.seed(0)
tf.random.set_seed(0)

dataloader = DataLoader()
embedding_index = dataloader.load_embedding(embedding_path)

Total 2195884 word vectors


In [7]:
# Read the raw feature tables for the train, validation and test splits
# (in that order) from the dataset directory.
train_x, val_x, test_x = (
    pd.read_csv(os.path.join(dirname, csv_name))
    for csv_name in ('train_x.csv', 'val_x.csv', 'test_x.csv')
)

In [8]:
# Run the cleaning helper over each split (train, val, test) ...
cleaned_train_x, cleaned_val_x, cleaned_test_x = [
    get_clean_data(frame, clean_word_dict) for frame in (train_x, val_x, test_x)
]
# ... and store the cleaned text back on its frame under the 'string' column.
for frame, cleaned_text in (
    (train_x, cleaned_train_x),
    (test_x, cleaned_test_x),
    (val_x, cleaned_val_x),
):
    frame['string'] = cleaned_text

Cleaned
Cleaned
Cleaned


In [9]:
# Read the label/metadata tables for the train and validation splits.
train_y, val_y = (
    pd.read_csv(os.path.join(dirname, csv_name))
    for csv_name in ('train_y.csv', 'val_y.csv')
)

In [10]:
# Label columns handed to the loader: the eight group columns used by
# worst_group_accuracy, followed by the target column 'y'.
list_classes = [
    'male', 'female', 'LGBTQ',
    'christian', 'muslim', 'other_religions',
    'black', 'white',
    'y',
]
train_y, val_y = dataloader.load_dataset(
    train_x, train_y, val_x, val_y, test_x, list_classes
)

Shape of train_y : (269038, 9)
Shape of val_y : (45180, 9)


In [11]:
# Keras tokenizer capped at MAX_FEATURES words.
tokenizer = Tokenizer(num_words = MAX_FEATURES)
# NOTE: this rebinds train_x / test_x / val_x to whatever tokenize() returns
# (the printed output reports them as (n_rows, 400) tensors), so the original
# DataFrames are no longer reachable under these names and re-running earlier
# cells is required to get them back. The return order is train, test, val.
train_x, test_x, val_x, word_index = dataloader.tokenize(tokenizer, MAX_SEQUENCE_LENGTH)
# Build the embedding matrix from the loaded vectors for the tokenizer's vocabulary.
embedding_matrix = dataloader.create_embedding_matrix(word_index, EMBEDDING_DIM, embedding_index, MAX_FEATURES)

Shape of train_x tensor: (269038, 400)
Shape of test_data tensor: (133782, 400)
Shape of val_data tensor: (45180, 400)
Found 136016 unique tokens
Null word embeddings: 21362


In [12]:
# Sanity check: features and labels should have matching row counts per split.
for split_array in (train_x, train_y, val_x, val_y):
    print(split_array.shape)

(269038, 400)
(269038, 9)
(45180, 400)
(45180, 9)


In [16]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Lambda, Dropout, concatenate
from keras.optimizers import Adam
from keras.layers import Input, Embedding, SpatialDropout1D, Dense, Bidirectional, LSTM, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Model

def get_LSTM(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    """Build and compile a two-layer bidirectional LSTM classifier.

    Architecture: frozen embedding initialised from ``embedding_matrix`` ->
    spatial dropout -> two stacked BiLSTM(50) layers, each followed by dropout
    -> concatenation of global average and global max pooling over time ->
    Dense(50, relu) -> Dense(out_size, sigmoid).

    Arguments:
        nb_words: number of rows of the embedding layer (vocabulary size).
        embedding_dim: width of each embedding vector.
        embedding_matrix: initial (non-trainable) embedding weights.
        max_sequence_length: length of each input token sequence.
        out_size: number of sigmoid outputs.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam with
        AMSGrad, accuracy metric). The model summary is printed as a side effect.
    """
    inp = Input(shape=(max_sequence_length,))

    x = Embedding(nb_words,
                  embedding_dim,
                  weights=[embedding_matrix],
                  input_length=max_sequence_length,
                  trainable=False)(inp)
    x = SpatialDropout1D(0.35)(x)

    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = Dropout(0.2)(x)

    # Summarise the sequence of hidden states with two fixed-size views.
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    x = concatenate([avg_pool, max_pool])

    x = Dense(50, activation="relu")(x)
    out = Dense(out_size, activation='sigmoid')(x)
    model = Model(inp, out)
    # `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
    adam = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, amsgrad=True)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()
    return model

In [14]:
# Checkpoint locations; not referenced by the cells shown in this notebook.
MODEL_CHECKPOINT_FOLDER = "checkpoints/"
TEMPORARY_CHECKPOINTS_PATH = 'temporary_checkpoints/'
# NOTE(review): not referenced by the cells shown here; the model below is
# built with MAX_SEQUENCE_LENGTH instead.
MAX_SENTENCE_LENGTH = 350
# Vocabulary size for the embedding layer: the tokenizer vocabulary, capped at MAX_FEATURES.
nb_words = min(MAX_FEATURES, len(word_index))


def get_model():
    """Return a freshly built and compiled BiLSTM model using the notebook-level settings."""
    return get_LSTM(
        nb_words,
        EMBEDDING_DIM,
        embedding_matrix,
        MAX_SEQUENCE_LENGTH,
        out_size=9,
    )

In [17]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy, with the groups being defined by ['male', 'female', 'LGBTQ',
        'christian', 'muslim', 'other_religions', 'black', 'white'] for positive and negative toxicity.

        Each category contributes two groups (rows where the category equals 0 and rows where it
        equals 1). A prediction counts as correct when ``pred > 0.5`` matches the ``y`` column.
        Groups with no rows are skipped, so a missing group cannot turn the result into NaN.
        The caller's ``y`` dataframe is left untouched.

        arguments:
            prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred)
            y [pandas.DataFrame]: dataframe containing the metadata
        returns:
            wga [float]: worst group accuracy (NaN only if every group is empty)
    """
    # Work on a copy so the 'pred' column is not added to the caller's dataframe.
    scored = y.copy()
    scored.loc[prediction.index, 'pred'] = prediction['pred']

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            group = scored.loc[scored[category] == label]
            if group.empty:
                # The mean over an empty group is NaN and would poison the minimum.
                continue
            group_accuracy = (group['y'] == (group['pred'] > 0.5)).mean()
            accuracies.append(group_accuracy)
    if not accuracies:
        return np.nan
    wga = np.min(accuracies)
    return wga

In [None]:
# Train one LSTM per cross-validation fold.
trainer = Trainer(model_stamp='LSTM', epoch_num=4, learning_rate=1e-3)
models, val_loss, total_auc, fold_predictions = trainer.train_folds(
    X=train_x,
    y=train_y,
    fold_count=5,
    batch_size=256,
    get_model_func=get_model,
)

print("Predicting val results...")
val_predicts_list = [
    fold_model.predict(val_x, batch_size=256, verbose=1) for fold_model in models
]

# Ensemble the folds: add the per-fold predictions onto a float64 zero array
# in fold order, then divide by the number of folds.
avg_val_predicts = sum(val_predicts_list, np.zeros(val_predicts_list[0].shape)) / len(val_predicts_list)

# Score the last output column against the validation metadata.
val_y_test = pd.read_csv(os.path.join(dirname, 'val_y.csv'))
pred_df = pd.DataFrame({'pred': avg_val_predicts[:, -1]}).reset_index()
metric = worst_group_accuracy(pred_df, val_y_test)
print(f'AVG_WGA: {metric}')

In [29]:
############################
## Bi-directional LSTM with attention
###########################
from keras.layers import Layer
from keras import initializers
from keras import backend as K
class AttentionWeightedAverage(Layer):
    """Attention pooling over the time axis of a 3-D input.

    A single learned vector scores every timestep; the scores are turned into
    a probability distribution over timesteps and used to take a weighted sum
    of the inputs along axis 1.

    Arguments:
        return_attention: when True, ``call`` returns ``[result, att_weights]``
            instead of just ``result``.
    """

    def __init__(self, return_attention=False, **kwargs):
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(** kwargs)

    def build(self, input_shape):
        """Create the scoring weight ``W`` of shape (input_shape[2], 1)."""
        assert len(input_shape) == 3
        self.W = self.add_weight(shape=(input_shape[2], 1), name='{}_W'.format(self.name), initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        When ``return_attention`` is set, the per-timestep weights are
        returned as a second list element.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every timestep is masked
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that forwards to ``compute_output_shape``."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Output is (batch, features); plus (batch, timesteps) when attention is returned."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Do not propagate a mask downstream (one ``None`` per mask when given a list)."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """Return the layer config including ``return_attention``.

        Needed so a model containing this layer can be serialized and rebuilt;
        a layer with constructor arguments must describe them in its config.
        """
        config = super(AttentionWeightedAverage, self).get_config()
        config['return_attention'] = self.return_attention
        return config

In [30]:
def get_LSTM_attn(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    """Build and compile a two-layer bidirectional LSTM classifier with attention pooling.

    Architecture: frozen embedding initialised from ``embedding_matrix`` ->
    spatial dropout -> two stacked BiLSTM(60) layers, each followed by dropout
    -> concatenation of four summaries of the hidden-state sequence (last
    timestep, global average pool, global max pool, attention-weighted average)
    -> Dense(50, relu) -> Dense(out_size, sigmoid).

    Arguments:
        nb_words: number of rows of the embedding layer (vocabulary size).
        embedding_dim: width of each embedding vector.
        embedding_matrix: initial (non-trainable) embedding weights.
        max_sequence_length: length of each input token sequence.
        out_size: number of sigmoid outputs.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam with
        AMSGrad, accuracy metric). The model summary is printed as a side effect.
    """
    inp = Input(shape=(max_sequence_length,))

    x = Embedding(nb_words,
                  embedding_dim,
                  weights=[embedding_matrix],
                  input_length=max_sequence_length,
                  trainable=False)(inp)
    x = SpatialDropout1D(0.35)(x)

    x = Bidirectional(LSTM(60, return_sequences=True))(x)
    x = Dropout(0.2)(x)
    x = Bidirectional(LSTM(60, return_sequences=True))(x)
    x = Dropout(0.2)(x)

    # Four fixed-size views of the hidden-state sequence.
    last = Lambda(lambda t:t[:, -1], name='last')(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    attn = AttentionWeightedAverage()(x)
    x = concatenate([last, avg_pool, max_pool, attn])

    x = Dense(50, activation="relu")(x)
    out = Dense(out_size, activation='sigmoid')(x)
    model = Model(inp, out)
    # `learning_rate` replaces the deprecated `lr` keyword of the Keras optimizers.
    adam = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, amsgrad=True)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()
    return model

In [31]:
# Checkpoint locations; not referenced by the cells shown in this notebook.
MODEL_CHECKPOINT_FOLDER = "checkpoints/"
TEMPORARY_CHECKPOINTS_PATH = 'temporary_checkpoints/'
# NOTE(review): not referenced by the cells shown here; the model below is
# built with MAX_SEQUENCE_LENGTH instead.
MAX_SENTENCE_LENGTH = 350
# Vocabulary size for the embedding layer: the tokenizer vocabulary, capped at MAX_FEATURES.
nb_words = min(MAX_FEATURES, len(word_index))


def get_model():
    """Return a freshly built and compiled attention BiLSTM model.

    Note: this rebinds the name ``get_model``, replacing the plain-LSTM
    factory defined in an earlier cell.
    """
    return get_LSTM_attn(
        nb_words,
        EMBEDDING_DIM,
        embedding_matrix,
        MAX_SEQUENCE_LENGTH,
        out_size=9,
    )

In [32]:
# Train one attention LSTM per cross-validation fold.
trainer = Trainer(model_stamp='LSTM_attn', epoch_num=4, learning_rate=1e-3)
models, val_loss, total_auc, fold_predictions = trainer.train_folds(
    X=train_x,
    y=train_y,
    fold_count=5,
    batch_size=256,
    get_model_func=get_model,
)

print("Predicting val results...")
val_predicts_list = [
    fold_model.predict(val_x, batch_size=256, verbose=1) for fold_model in models
]

# Ensemble the folds: add the per-fold predictions onto a float64 zero array
# in fold order, then divide by the number of folds.
avg_val_predicts = sum(val_predicts_list, np.zeros(val_predicts_list[0].shape)) / len(val_predicts_list)

# Score the last output column against the validation metadata.
val_y_test = pd.read_csv(os.path.join(dirname, 'val_y.csv'))
pred_df = pd.DataFrame({'pred': avg_val_predicts[:, -1]}).reset_index()
metric = worst_group_accuracy(pred_df, val_y_test)
print(f'AVG_WGA: {metric}')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 400)]                0         []                            
                                                                                                  
 embedding_2 (Embedding)     (None, 400, 300)             3000000   ['input_3[0][0]']             
                                                          0                                       
                                                                                                  
 spatial_dropout1d_2 (Spati  (None, 400, 300)             0         ['embedding_2[0][0]']         
 alDropout1D)                                                                                     
                                                                                              

In [35]:
# Predict the test set with every fold model. The fold index is not needed,
# so the models are iterated directly.
test_predicts_list = [
    fold_model.predict(test_x, batch_size=256, verbose=1) for fold_model in models
]

# Average the fold predictions, accumulating in a float64 buffer.
# (The previously allocated `test_val_predicts` array was never read and has
# been dropped.)
avg_test_predicts = np.zeros(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    avg_test_predicts += fold_predict
avg_test_predicts /= len(test_predicts_list)

In [42]:
# Keep only the last output column of the averaged test predictions and
# expose the row position as an explicit 'index' column.
pred_df = pd.DataFrame({'pred': avg_test_predicts[:, -1]}).reset_index()

In [46]:
# Write pred_df to a CSV file in the working directory, without the DataFrame's own index.
# NOTE(review): 'LSMT' in the file name looks like a typo for 'LSTM' — confirm before
# renaming, since anything that consumes this file would need the same change.
pred_df.to_csv('LSMT_prediction.csv', index=False)