In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
# NOTE: `dirname` stays bound after the loop to whichever directory was visited
# last; avoid relying on that leftover value in other cells.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dl-kaggle-dataset/cleaned_train_x.csv
/kaggle/input/dl-kaggle-dataset/cleaned_val_x.csv
/kaggle/input/dl-kaggle-dataset/Train.py
/kaggle/input/dl-kaggle-dataset/cleaned_test_x.csv
/kaggle/input/dl-kaggle-dataset/train_y.csv
/kaggle/input/dl-kaggle-dataset/train_x.csv
/kaggle/input/dl-kaggle-dataset/test_x.csv
/kaggle/input/dl-kaggle-dataset/glove.840B.300d.txt
/kaggle/input/dl-kaggle-dataset/DataPreprocessing.py
/kaggle/input/dl-kaggle-dataset/val_x.csv
/kaggle/input/dl-kaggle-dataset/val_y.csv
/kaggle/input/dl-kaggle-dataset/rnn_baseline.py
/kaggle/input/dl-kaggle-dataset/cleanwords.txt
/kaggle/input/dl-kaggle-dataset/DataLoader.py


In [3]:
import tensorflow as tf
# Ask TensorFlow which GPU devices it can see and report whether any were found.
gpu_status = "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE"
print("GPU is", gpu_status)



GPU is available


In [4]:
import torch
import pickle
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torchmetrics import AUROC, F1Score
from keras.preprocessing.text import Tokenizer

# copyfile lets us copy our custom modules to a location they can be imported from
from shutil import copyfile
# copy our file into the working directory (make sure it has .py suffix)
# Copy every helper module from the read-only dataset directory into ../working/
# so that the `from <module> import ...` statements that follow can use them.
# One loop replaces four copy-pasted calls; the source and destination paths
# produced are exactly the same as before.
for module_file in ("DataLoader.py", "rnn_baseline.py", "Train.py", "DataPreprocessing.py"):
    copyfile(src="/kaggle/input/dl-kaggle-dataset/" + module_file,
             dst="../working/" + module_file)
# import all our functions
from DataLoader import DataLoader
from rnn_baseline import get_av_rnn
from Train import Trainer
from DataPreprocessing import get_clean_word_dict, get_clean_data

In [5]:
# Point at the dataset file explicitly instead of going through `dirname`, which
# is only a leftover loop variable from the os.walk listing and refers to whichever
# directory happened to be visited last.
cl_path = '/kaggle/input/dl-kaggle-dataset/cleanwords.txt'
# get_clean_word_dict comes from the copied DataPreprocessing.py module.
clean_word_dict = get_clean_word_dict(cl_path)

In [6]:
# Explicit dataset path rather than the leftover os.walk loop variable `dirname`,
# whose value depends on which directory the listing loop visited last.
glove_path = '/kaggle/input/dl-kaggle-dataset/glove.840B.300d.txt'
embedding_path = [glove_path]

MAX_SEQUENCE_LENGTH = 400  # sequence length handed to dataloader.tokenize and to the model input
MAX_FEATURES = 100000      # vocabulary cap given to the Keras Tokenizer (num_words)
EMBEDDING_DIM = 300        # vector size, matching the 300d GloVe file loaded here

# Seed every random number generator in play. The classifier in this notebook is
# built with Keras/TensorFlow, so seeding torch alone does not make its weight
# initialisation or dropout reproducible.
torch.manual_seed(0)
np.random.seed(0)
tf.random.set_seed(0)

# DataLoader here is the class from the copied DataLoader.py module.
dataloader = DataLoader()
embedding_index = dataloader.load_embedding(embedding_path)

Total 2195884 word vectors


In [7]:
# Read the pre-cleaned text for each split. The paths are spelled out instead of
# being built from `dirname`, a leftover os.walk loop variable whose value depends
# on which directory the listing loop visited last.
train_x = pd.read_csv('/kaggle/input/dl-kaggle-dataset/cleaned_train_x.csv')
val_x = pd.read_csv('/kaggle/input/dl-kaggle-dataset/cleaned_val_x.csv')
test_x = pd.read_csv('/kaggle/input/dl-kaggle-dataset/cleaned_test_x.csv')

In [8]:
# Read the label/metadata frames for the train and validation splits. The paths are
# spelled out instead of being built from `dirname`, a leftover os.walk loop variable
# whose value depends on which directory the listing loop visited last.
train_y = pd.read_csv('/kaggle/input/dl-kaggle-dataset/train_y.csv')
val_y = pd.read_csv('/kaggle/input/dl-kaggle-dataset/val_y.csv')

In [9]:
# Column name(s) handed to the data loader — presumably the label column(s) to
# extract from the *_y frames; confirm in DataLoader.py.
list_classes = ['y']
# load_dataset is defined in the copied DataLoader.py module and is not visible here.
# train_y / val_y are rebound from the DataFrames read from CSV to whatever the call
# returns, so re-running this cell on its own would feed the already-converted
# values back in; re-run the CSV-reading cells first.
train_y, val_y = dataloader.load_dataset(train_x, train_y, val_x, val_y, test_x, list_classes)

Shape of train_y : (269038, 1)
Shape of val_y : (45180, 1)


In [10]:
# Keras tokenizer with its vocabulary capped at MAX_FEATURES words.
tokenizer = Tokenizer(num_words = MAX_FEATURES)
# tokenize() receives no data arguments, so it presumably works on text the
# dataloader kept from an earlier call — TODO confirm in DataLoader.py.
# train_x / test_x / val_x are rebound to its return values, so this cell is not
# safe to re-run in isolation.
train_x, test_x, val_x, word_index = dataloader.tokenize(tokenizer, MAX_SEQUENCE_LENGTH)
# Build the embedding matrix from the tokenizer's word index and the loaded vectors.
embedding_matrix = dataloader.create_embedding_matrix(word_index, EMBEDDING_DIM, embedding_index, MAX_FEATURES)

Shape of train_x tensor: (269038, 400)
Shape of test_data tensor: (133782, 400)
Shape of val_data tensor: (45180, 400)
Found 136016 unique tokens
Null word embeddings: 21362


In [11]:
# Sanity check: show the shape of each train/validation input and label array.
for split in (train_x, train_y, val_x, val_y):
    print(split.shape)

(269038, 400)
(269038, 1)
(45180, 400)
(45180, 1)


In [12]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.layers import Input, Embedding, SpatialDropout1D, Dense
from keras.models import Model

def get_baseline(nb_words, embedding_dim, embedding_matrix, max_sequence_length, out_size):
    """
        Build, compile and summarise the baseline Keras classifier.

        The network embeds the input token ids with a frozen embedding initialised
        from embedding_matrix, applies spatial dropout (rate 0.25) and a 128-unit
        ReLU Dense layer, keeps only the last position along axis 1, and maps it
        through a sigmoid Dense layer with out_size units.

        arguments:
            nb_words: number of rows of the embedding layer
            embedding_dim: size of each embedding vector
            embedding_matrix: initial (non-trainable) embedding weights
            max_sequence_length: length of every input sequence
            out_size: number of sigmoid output units
        returns:
            model: the compiled keras Model (its summary is printed as a side effect)
    """
    tokens = Input(shape=(max_sequence_length,))

    # Frozen embedding lookup followed by dropout of whole embedding channels.
    frozen_embedding = Embedding(
        nb_words,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=False,
    )
    embedded = SpatialDropout1D(0.25)(frozen_embedding(tokens))

    hidden = Dense(128, activation='relu')(embedded)

    # Only the final position along the sequence axis feeds the output layer.
    final_position = Lambda(lambda t: t[:, -1], name='last')(hidden)
    probabilities = Dense(out_size, activation='sigmoid')(final_position)

    model = Model(inputs=tokens, outputs=probabilities)
    model.compile(
        loss='binary_crossentropy',
        optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-3, decay=1e-6, clipvalue=5),
        metrics=['accuracy'],
    )
    model.summary()
    return model

In [13]:
# Checkpoint locations. NOTE(review): neither constant is referenced by any visible cell.
MODEL_CHECKPOINT_FOLDER = "checkpoints/"
TEMPORARY_CHECKPOINTS_PATH = 'temporary_checkpoints/'

# NOTE(review): not referenced by any visible cell; the model below is built with
# MAX_SEQUENCE_LENGTH, not this value.
MAX_SENTENCE_LENGTH = 350

# Size of the embedding layer: the tokenizer vocabulary, capped at MAX_FEATURES.
nb_words = min(MAX_FEATURES, len(word_index))


def get_model():
    """Return a freshly built baseline model with a single output unit."""
    return get_baseline(
        nb_words=nb_words,
        embedding_dim=EMBEDDING_DIM,
        embedding_matrix=embedding_matrix,
        max_sequence_length=MAX_SEQUENCE_LENGTH,
        out_size=1,
    )

In [14]:
def worst_group_accuracy(prediction, y):
    """
        Compute the worst group accuracy, with the groups being defined by ['male', 'female', 'LGBTQ',
        'christian', 'muslim', 'other_religions', 'black', 'white'] for positive and negative toxicity.
        A prediction counts as positive when its score is strictly greater than 0.5.
        arguments:
            prediction [pandas.DataFrame]: dataframe with 2 columns (index and pred); rows are
                matched to y through the dataframe index
            y [pandas.DataFrame]: dataframe containing the metadata (one column per group and
                the 'y' label); it is left unmodified
        returns:
            wga [float]: worst group accuracy over the non-empty groups (nan if every group is empty)
    """
    # Work on a copy so the caller's dataframe does not silently gain a 'pred' column.
    y = y.copy()
    y.loc[prediction.index, 'pred'] = prediction.pred

    categories = ['male', 'female', 'LGBTQ', 'christian', 'muslim', 'other_religions', 'black', 'white']

    # Row-wise correctness, computed once and reused for every group.
    # Rows without a prediction hold NaN, and NaN > 0.5 is False (predicted negative).
    correct = y['y'] == (y['pred'] > 0.5)

    accuracies = []
    for category in categories:
        for label in [0, 1]:
            in_group = y[category] == label
            # An empty group has no accuracy; its NaN mean would otherwise
            # propagate through the minimum and hide the real worst group.
            if in_group.any():
                accuracies.append(correct[in_group].mean())

    wga = np.min(accuracies) if accuracies else np.nan
    return wga

In [16]:
# Train the baseline model; Trainer comes from the copied Train.py module.
trainer = Trainer(model_stamp='baseline', epoch_num=5, learning_rate=1e-3)
models,val_loss,total_auc,fold_predictions = trainer._train_model_by_logloss(model=get_model(), batch_size=256, train_x=train_x, train_y=train_y, val_x=val_x, val_y=val_y, fold_id=0)

print("Predicting val results...")
val_predicts_list = []
val_predicts = models.predict(val_x, batch_size=256, verbose=1)
val_predicts_list.append(val_predicts)

# Reload the validation metadata from disk, since `val_y` was rebound by the
# load_dataset cell. The path is explicit rather than built from `dirname`, a
# leftover os.walk loop variable.
val_y_test = pd.read_csv('/kaggle/input/dl-kaggle-dataset/val_y.csv')

pred_df = pd.DataFrame()
# Fix: the predictions live in `val_predicts`; the previous code read the
# undefined name `val_pred` and raised NameError before the metric was computed.
pred_df['pred'] = [val[-1] for val in val_predicts]
pred_df = pred_df.reset_index()

metric = worst_group_accuracy(pred_df, val_y_test)
print(f'WGA: {metric}')

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 400)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 400, 300)          30000000  
                                                                 
 spatial_dropout1d_1 (Spati  (None, 400, 300)          0         
 alDropout1D)                                                    
                                                                 
 dense_2 (Dense)             (None, 400, 128)          38528     
                                                                 
 last (Lambda)               (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                           

In [17]:
# Score the tokenized test split with the object returned by the trainer.
# NOTE(review): despite its plural name, `models` is used as a single object
# exposing .predict.
test_predicts = models.predict(test_x, batch_size=256, verbose=1)




In [27]:
# Keep the first value of every prediction row; reset_index() then exposes the
# row number as an explicit 'index' column next to 'pred'.
pred_df = pd.DataFrame({'pred': [row[0] for row in test_predicts]}).reset_index()

In [29]:
# Write the prediction frame to a CSV in the current working directory, without
# the pandas row index.
# NOTE(review): the file name spells "predicion" — presumably a typo for
# "prediction"; confirm nothing depends on this exact name before renaming.
pred_df.to_csv('MLP_predicion.csv', index=False)