In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Print every file under the Kaggle input mount so the attached datasets are visible.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Later cells build their file paths from `dirname`. Pin it to the dataset directory
# explicitly instead of relying on whichever directory os.walk happened to visit last
# (that value changes as soon as another dataset or a sub-directory is attached, and
# the name is undefined when the walk yields nothing).
dirname = '/kaggle/input/dl-kaggle-dataset'
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dl-kaggle-dataset/cleaned_train_x.csv
/kaggle/input/dl-kaggle-dataset/cleaned_val_x.csv
/kaggle/input/dl-kaggle-dataset/Train.py
/kaggle/input/dl-kaggle-dataset/cleaned_test_x.csv
/kaggle/input/dl-kaggle-dataset/train_y.csv
/kaggle/input/dl-kaggle-dataset/train_x.csv
/kaggle/input/dl-kaggle-dataset/test_x.csv
/kaggle/input/dl-kaggle-dataset/glove.840B.300d.txt
/kaggle/input/dl-kaggle-dataset/DataPreprocessing.py
/kaggle/input/dl-kaggle-dataset/val_x.csv
/kaggle/input/dl-kaggle-dataset/val_y.csv
/kaggle/input/dl-kaggle-dataset/rnn_baseline.py
/kaggle/input/dl-kaggle-dataset/cleanwords.txt
/kaggle/input/dl-kaggle-dataset/DataLoader.py


In [4]:
import tensorflow as tf
# Ask TensorFlow for the visible GPU devices; an empty list is falsy and reports NOT AVAILABLE.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is", "available" if gpu_devices else "NOT AVAILABLE")

GPU is available


In [5]:
import torch
import pickle
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torchmetrics import AUROC, F1Score
from keras.preprocessing.text import Tokenizer

# import module we'll need to import our custom module
from shutil import copyfile
# copy our file into the working directory (make sure it has .py suffix)
# (source, destination) pairs for the project modules that are imported right below.
module_copies = [
    ("/kaggle/input/dl-kaggle-dataset/DataLoader.py", "../working/DataLoader.py"),
    ("/kaggle/input/dl-kaggle-dataset/rnn_baseline.py", "../working/rnn_baseline.py"),
    ("/kaggle/input/dl-kaggle-dataset/Train.py", "../working/Train.py"),
    ("/kaggle/input/dl-kaggle-dataset/DataPreprocessing.py", "../working/DataPreprocessing.py"),
]
for module_src, module_dst in module_copies:
    copyfile(src = module_src, dst = module_dst)
# import all our functions
from DataLoader import DataLoader
from rnn_baseline import get_av_rnn
from Train import Trainer
from DataPreprocessing import get_clean_word_dict, get_clean_data

In [6]:
# Hyper-parameters reused by the tokenizer, the embedding matrix and the model factory below.
MAX_SEQUENCE_LENGTH = 400
MAX_FEATURES = 100000
EMBEDDING_DIM = 300

# Resource files sit in the directory held by `dirname` (assigned in the first cell).
cl_path = os.path.join(dirname, 'cleanwords.txt')
glove_path = os.path.join(dirname, 'glove.840B.300d.txt')
embedding_path = [glove_path]  # load_embedding is handed a list of paths

clean_word_dict = get_clean_word_dict(cl_path)

# Seeds PyTorch's global RNG only.
# NOTE(review): the model summary printed by the training cell looks like Keras output, so a
# TensorFlow/NumPy seed may also be needed for reproducible training - confirm.
torch.manual_seed(0)
# `DataLoader` is the project class imported from DataLoader.py; that import shadows the
# torch.utils.data.DataLoader imported earlier in the same cell.
dataloader = DataLoader()
embedding_index = dataloader.load_embedding(embedding_path)

Total 2195884 word vectors


In [7]:
# Load the pre-cleaned text splits (train / validation / test), in that order.
train_x, val_x, test_x = (
    pd.read_csv(os.path.join(dirname, csv_name))
    for csv_name in ('cleaned_train_x.csv', 'cleaned_val_x.csv', 'cleaned_test_x.csv')
)

In [8]:
# Read the label/metadata tables for the train and validation splits.
train_y = pd.read_csv(os.path.join(dirname, 'train_y.csv'))
val_y = pd.read_csv(os.path.join(dirname, 'val_y.csv'))
# Columns handed to the loader as targets (presumably the label column names - confirm in DataLoader.py).
list_classes = ['y']
# load_dataset returns replacements for train_y / val_y; the cell output reports shapes (rows, 1).
# NOTE(review): the text frames (train_x, val_x, test_x) are passed in as well; presumably the
# loader keeps them for the later tokenize() call, which takes no data arguments - confirm.
train_y, val_y = dataloader.load_dataset(train_x, train_y, val_x, val_y, test_x, list_classes)

Shape of train_y : (269038, 1)
Shape of val_y : (45180, 1)


In [9]:
# Keras tokenizer limited to MAX_FEATURES words.
tokenizer = Tokenizer(num_words = MAX_FEATURES)
# tokenize() receives no data arguments, so it presumably works on frames stored by the earlier
# load_dataset() call - verify in DataLoader.py.
# train_x / test_x / val_x are rebound here; the cell output reports arrays of shape (rows, 400).
train_x, test_x, val_x, word_index = dataloader.tokenize(tokenizer, MAX_SEQUENCE_LENGTH)
# Build the embedding matrix for the tokenizer vocabulary from the GloVe index loaded earlier.
embedding_matrix = dataloader.create_embedding_matrix(word_index, EMBEDDING_DIM, embedding_index, MAX_FEATURES)

Shape of train_x tensor: (269038, 400)
Shape of test_data tensor: (133782, 400)
Shape of val_data tensor: (45180, 400)
Found 136016 unique tokens
Null word embeddings: 21362


In [10]:
# Sanity check: feature and label arrays should have matching row counts per split.
for split_array in (train_x, train_y, val_x, val_y):
    print(split_array.shape)

(269038, 400)
(269038, 1)
(45180, 400)
(45180, 1)


In [11]:
# Presumably checkpoint output locations.
# NOTE(review): neither path, nor MAX_SENTENCE_LENGTH below, is referenced in the visible cells.
MODEL_CHECKPOINT_FOLDER = "checkpoints/"
TEMPORARY_CHECKPOINTS_PATH = 'temporary_checkpoints/'
# NOTE(review): differs from MAX_SEQUENCE_LENGTH (400), which is the value get_model() passes on.
MAX_SENTENCE_LENGTH = 350
# Vocabulary size given to the model: the tokenizer cap or the number of distinct tokens, whichever is smaller.
nb_words = min(MAX_FEATURES, len(word_index))
def get_model():
    """Return the result of get_av_rnn() called with the notebook-level settings.

    Reads the globals nb_words, EMBEDDING_DIM, embedding_matrix and
    MAX_SEQUENCE_LENGTH at call time and requests a single output (out_size=1).
    Passed as ``get_model_func`` to ``trainer.train_folds`` in the training cell.
    """
    return get_av_rnn(nb_words, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH, out_size=1)

In [12]:
# Stack the validation rows underneath the training rows before the cross-validated training below.
# NOTE: train_x / train_y are rebound, so running this cell twice appends the validation rows twice.
train_x = np.concatenate([train_x, val_x])
train_y = np.concatenate([train_y, val_y])
for merged_array in (train_x, train_y):
    print(merged_array.shape)

(314218, 400)
(314218, 1)


In [13]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy. A group is one (category, flag) pair, where category is
        one of ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']
        and flag is the 0/1 value of that column, giving 16 groups in total.
        A row counts as correct when (pred > 0.5) equals its 'y' label.
        The accuracy of every flag == 1 group is printed as '<category>_1: <accuracy>'.
        arguments:
            prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred); its row index
                selects the rows of y that receive a score
            y [pandas.DataFrame]: dataframe containing the metadata (the 'y' label column plus one
                0/1 column per category); it is NOT modified
        returns:
            wga [float]: worst group accuracy (minimum over all 16 groups; NaN if any group is empty)
    """
    # Score a private copy: writing 'pred' into the caller's frame would leave stale scores
    # behind for any later call whose predictions cover fewer rows.
    scored = y.copy()
    scored.loc[prediction.index, 'pred'] = prediction.pred

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']

    # Row-wise correctness, computed once and then sliced per group.
    is_correct = scored['y'] == (scored['pred'] > 0.5)

    # Order is [cat0_flag0, cat0_flag1, cat1_flag0, ...]: flag == 1 groups sit at odd positions.
    accuracies = []
    for category in categories:
        for label in [0, 1]:
            accuracies.append(is_correct[scored[category] == label].mean())

    for category, group_accuracy in zip(categories, accuracies[1::2]):
        print(f'{category}_1: {group_accuracy}')

    wga = np.min(accuracies)
    return wga

In [14]:
###################################
### with CV
###################################
# Trainer comes from the project module Train.py; its hyper-parameters are passed explicitly here.
trainer = Trainer(model_stamp='av_rnn', epoch_num=6, learning_rate=1e-3)

# Train one model per fold. Of the four returned values only `models` is used in the visible cells.
# NOTE(review): an earlier cell concatenated the validation rows into train_x / train_y, so the
# "val" scores computed below are presumably measured on rows the fold models trained on - confirm
# whether a held-out evaluation is intended.
models,val_loss,total_auc,fold_predictions = trainer.train_folds(X=train_x, y=train_y, fold_count=3, batch_size=256, get_model_func=get_model)
print("Predicting val results...")
val_predicts_list = []

# Collect every fold model's predictions on the validation inputs.
for fold_id, model in enumerate(models):
    val_predicts = model.predict(val_x, batch_size=256, verbose=1)
    val_predicts_list.append(val_predicts)

# Unweighted mean of the fold predictions, accumulated into a zero-initialised array.
avg_val_predicts = np.zeros(val_predicts_list[0].shape)
for fold_predict in val_predicts_list:
    avg_val_predicts += fold_predict
avg_val_predicts /= len(val_predicts_list)

# Re-read the validation metadata from disk: `val_y` was rebound by load_dataset() earlier.
val_y_test = pd.read_csv(os.path.join(dirname, 'val_y.csv'))
pred_df = pd.DataFrame()
# Keep the last element of each prediction row as the score.
pred_df['pred'] = [val[-1] for val in avg_val_predicts]
# reset_index() adds the 0..n-1 row numbers as an 'index' column.
pred_df = pred_df.reset_index()
metric = worst_group_accuracy(pred_df, val_y_test)
print(f'AVG_WGA: {metric}')

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 400)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 400, 300)             3000000   ['input_1[0][0]']             
                                                          0                                       
                                                                                                  
 spatial_dropout1d (Spatial  (None, 400, 300)             0         ['embedding[0][0]']           
 Dropout1D)                                                                                       
                                                                                              

In [None]:
# Re-collect the per-fold validation predictions from scratch. Starting from an empty list keeps
# this cell idempotent: appending to the list already filled by the training cell would store
# every fold twice and double the per-fold report that follows.
val_predicts_list = []
for fold_id, model in enumerate(models):
    val_predicts = model.predict(val_x, batch_size=256, verbose=1)
    val_predicts_list.append(val_predicts)

# Unweighted mean of the fold predictions.
avg_val_predicts = np.zeros(val_predicts_list[0].shape)
for fold_predict in val_predicts_list:
    avg_val_predicts += fold_predict
avg_val_predicts /= len(val_predicts_list)

# Validation metadata (labels plus group columns) used by the scoring cells below.
val_y_test = pd.read_csv(os.path.join(dirname, 'val_y.csv'))

In [17]:
# Score each fold model on its own; the last element of every prediction row is the score.
for pred in val_predicts_list:
    fold_scores = [val[-1] for val in pred]
    pred_df = pd.DataFrame({'pred': fold_scores}).reset_index()
    metric = worst_group_accuracy(pred_df, val_y_test)
    print(f'WGA: {metric}', '----------------', sep='\n')

male_1: 0.910598111227702
female_1: 0.9149550161262944
LGBTQ_1: 0.8311599176389842
christian_1: 0.9371428571428572
muslim_1: 0.8459715639810427
other_religions_1: 0.8681541582150102
black_1: 0.7875302663438256
white_1: 0.807464248343216
WGA: 0.7875302663438256
----------------
male_1: 0.9118572927597062
female_1: 0.9164827703276184
LGBTQ_1: 0.8277282086479066
christian_1: 0.9391208791208792
muslim_1: 0.8454976303317535
other_religions_1: 0.8742393509127789
black_1: 0.801452784503632
white_1: 0.8099058249040809
WGA: 0.801452784503632
----------------
male_1: 0.8906610703043022
female_1: 0.8981497199117298
LGBTQ_1: 0.8016472203157172
christian_1: 0.9296703296703297
muslim_1: 0.8241706161137441
other_religions_1: 0.8519269776876268
black_1: 0.7524213075060533
white_1: 0.7854900592954308
WGA: 0.7524213075060533
----------------


In [33]:
# Same per-fold evaluation, with the scores turned into hard 0/1 labels first.
# worst_group_accuracy applies the same > 0.5 threshold itself, which is why the printed
# numbers are identical to the run on raw scores.
for pred in val_predicts_list:
    pred_df = pd.DataFrame({'pred': [val[-1] for val in pred]})
    pred_df['pred'] = (pred_df['pred'] > 0.5).astype('int64')
    pred_df = pred_df.reset_index()
    metric = worst_group_accuracy(pred_df, val_y_test)
    print(f'WGA: {metric}', '----------------', sep='\n')

male_1: 0.910598111227702
female_1: 0.9149550161262944
LGBTQ_1: 0.8311599176389842
christian_1: 0.9371428571428572
muslim_1: 0.8459715639810427
other_religions_1: 0.8681541582150102
black_1: 0.7875302663438256
white_1: 0.807464248343216
WGA: 0.7875302663438256
----------------
male_1: 0.9118572927597062
female_1: 0.9164827703276184
LGBTQ_1: 0.8277282086479066
christian_1: 0.9391208791208792
muslim_1: 0.8454976303317535
other_religions_1: 0.8742393509127789
black_1: 0.801452784503632
white_1: 0.8099058249040809
WGA: 0.801452784503632
----------------
male_1: 0.8906610703043022
female_1: 0.8981497199117298
LGBTQ_1: 0.8016472203157172
christian_1: 0.9296703296703297
muslim_1: 0.8241706161137441
other_religions_1: 0.8519269776876268
black_1: 0.7524213075060533
white_1: 0.7854900592954308
WGA: 0.7524213075060533
----------------


In [15]:
# Predict the test inputs with every fold model.
test_predicts_list = []
for model in models:
    test_predicts = model.predict(test_x, batch_size=256, verbose=1)
    test_predicts_list.append(test_predicts)

# Unweighted mean over the folds, accumulated into a zero-initialised array.
n_folds = len(test_predicts_list)
avg_test_predicts = np.zeros(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    avg_test_predicts += fold_predict
avg_test_predicts /= n_folds

# One score per row (last element of each prediction row); reset_index() adds the row
# numbers as an 'index' column.
pred_df = pd.DataFrame({'pred': [val[-1] for val in avg_test_predicts]}).reset_index()



In [16]:
# Write the row-id column as 'ID', matching the header used for GRU_prediction_best.csv;
# without the rename this file carries an 'index' header instead.
# rename() leaves the frame untouched when the column is already called 'ID'.
pred_df.rename(columns={'index': 'ID'}).to_csv('GRU_prediction.csv', index=False)

In [19]:
# Use a single fold model instead of the ensemble. Index 1 is the second fold, whose block in
# the per-fold evaluation output shows the highest WGA (about 0.801).
# NOTE(review): the index is hard-coded; re-training may change which fold scores best.
model =  models[1]
test_predicts = model.predict(test_x, batch_size=256, verbose=1)



In [21]:
# Turn the single-model test predictions into a two-column table: the row number as 'ID'
# and the last element of each prediction row as 'pred', then write it out.
pred_df = (
    pd.DataFrame({'pred': [val[-1] for val in test_predicts]})
    .reset_index()
    .rename(columns={'index': 'ID'})
)
pred_df.to_csv('GRU_prediction_best.csv', index=False)