<a href="https://colab.research.google.com/github/vijayshankarrealdeal/GoogleColab/blob/main/ONEMORE_TIME.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Kaggle API token, then download and unpack the competition data.
# kaggle.json must already be present in the working directory.
# -p: do not fail when ~/.kaggle already exists, so the cell can be re-run.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write.
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
# -o: overwrite without prompting; a plain unzip asks for confirmation when the
# target already exists, which stalls a re-run of the cell.
!unzip -o "/content/train.csv.zip" -d "/content/d/"
!unzip -o "/content/test.csv.zip" -d "/content/d/"

In [7]:
# Unpack the sample submission into the same directory as the train/test CSVs.
# -o: overwrite without prompting so re-running the cell cannot stall on
# unzip's interactive "replace?" question.
!unzip -o '/content/sample_submission.csv.zip' -d '/content/d'

Archive:  /content/sample_submission.csv.zip
  inflating: /content/d/sample_submission.csv  


In [4]:
# Download the Kaggle dataset takuok/glove840b300dtxt (archive of the
# glove.840B.300d.txt word vectors) with the Kaggle CLI.
!kaggle datasets download -d takuok/glove840b300dtxt

Downloading glove840b300dtxt.zip to /content
100% 2.07G/2.08G [00:25<00:00, 58.5MB/s]
100% 2.08G/2.08G [00:25<00:00, 86.1MB/s]


In [5]:
# Download the Kaggle dataset yekenot/fasttext-crawl-300d-2m (archive of the
# crawl-300d-2M.vec word vectors) with the Kaggle CLI.
!kaggle datasets download -d yekenot/fasttext-crawl-300d-2m

Downloading fasttext-crawl-300d-2m.zip to /content
 99% 1.42G/1.44G [00:32<00:00, 33.6MB/s]
100% 1.44G/1.44G [00:32<00:00, 47.5MB/s]


In [None]:
import numpy as np
import pandas as pd
import os
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D, CuDNNGRU, Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import LearningRateScheduler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Record the TensorFlow version and whether a GPU is visible to it; the model
# built later in this notebook uses CuDNN-backed LSTM layers.
print(tf.__version__)
# tf.test.is_gpu_available() is deprecated; listing the physical GPU devices is
# the supported replacement. bool() keeps the cell output a plain True/False.
bool(tf.config.list_physical_devices('GPU'))

In [6]:
# Unpack both pretrained word-vector archives into /content/extra_d.
# -o: overwrite without prompting so re-running the cell cannot stall on
# unzip's interactive "replace?" question.
!unzip -o '/content/glove840b300dtxt.zip' -d '/content/extra_d'
!unzip -o '/content/fasttext-crawl-300d-2m.zip' -d '/content/extra_d'

Archive:  /content/glove840b300dtxt.zip
  inflating: /content/extra_d/glove.840B.300d.txt  
Archive:  /content/fasttext-crawl-300d-2m.zip
  inflating: /content/extra_d/crawl-300d-2M.vec  


In [9]:
# --- Pretrained word vectors -------------------------------------------------
# One embedding matrix is built per file; the matrices are joined feature-wise.
EMBEDDING_FILES = [
    '/content/extra_d/crawl-300d-2M.vec',
    '/content/extra_d/glove.840B.300d.txt',
]

# --- Input ---------------------------------------------------------------------
MAX_LEN = 220  # token sequences are padded/truncated to this length

# --- Network size ------------------------------------------------------------
LSTM_UNITS = 128
# Width of the dense blocks in build_model: 4 * LSTM_UNITS.
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4

# --- Training ------------------------------------------------------------------
# NOTE(review): the training cell passes batch_size=128 as a literal, so this
# constant is currently not read anywhere in the notebook.
BATCH_SIZE = 512
# NOTE(review): EPOCHS is reassigned (to 5) in a later cell before training.
EPOCHS = 4

In [10]:
# The six binary label columns predicted by the model, in output order.
target_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Characters passed to the Keras Tokenizer as `filters`.
# The backslash in front of "×" is spelled "\\": a bare backslash there is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
# The resulting string value is unchanged.
special_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—'

In [12]:
# Competition CSVs, as extracted into /content/d by the setup cells.
train_df = pd.read_csv('/content/d/train.csv')
test_df = pd.read_csv('/content/d/test.csv')
submission = pd.read_csv("/content/d/sample_submission.csv")

# Model inputs: the raw comment text, cast to str.
x_train = train_df['comment_text'].astype(str)
x_test = test_df['comment_text'].astype(str)

# Targets: one column per entry of `target_labels`, as a NumPy array.
# `y` and `y_train` are two names for the same array.
y_train = train_df[target_labels].values
y = y_train

In [14]:
def get_coefs(word, *arr):
    """Turn the fields of one embedding-file line into a ``(word, vector)`` pair.

    Args:
        word: The token (first field of the line).
        *arr: The remaining fields, each a numeric string.

    Returns:
        A tuple of ``word`` and a 1-D float32 NumPy array built from ``arr``.
    """
    return word, np.asarray(arr, dtype='float32')


def load_embeddings(path):
    """Read a text-format word-vector file into a ``{word: vector}`` dict.

    Each line is expected to be ``word v1 v2 ... vn`` separated by single
    spaces. Fields are split on the literal space character (not on arbitrary
    whitespace), exactly as before.

    fastText ``.vec`` files begin with a ``<vocab_size> <dim>`` header line.
    That line is skipped; parsing it as data would create a bogus entry whose
    1-element "vector" silently broadcasts across a whole embedding row if the
    same token occurs in the vocabulary.

    Args:
        path: Path of the embedding file (read as UTF-8).

    Returns:
        dict mapping each word to its float32 NumPy vector. If a word occurs
        more than once, the last occurrence wins.
    """
    embeddings = {}
    # Explicit encoding: do not depend on the platform's default locale.
    with open(path, encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            fields = line.strip().split(' ')
            is_header = (
                line_number == 0
                and len(fields) == 2
                and all(field.isdigit() for field in fields)
            )
            if is_header:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

In [17]:
def build_matrix(word_index, path):
    """Build an embedding matrix for a vocabulary from one word-vector file.

    Args:
        word_index: Mapping of word -> integer row index.
        path: Embedding file, read with ``load_embeddings``.

    Returns:
        NumPy array of shape ``(len(word_index) + 1, 300)``. Row ``i`` holds the
        vector of the word whose index is ``i``; rows of words missing from the
        embedding file (and any row no word maps to) stay all-zero.
    """
    vectors_by_word = load_embeddings(path)
    # One row more than there are words, so index len(word_index) is valid
    # (presumably because tokenizer indices start at 1 — verify against caller).
    matrix = np.zeros((len(word_index) + 1, 300))
    for token, row in word_index.items():
        vector = vectors_by_word.get(token)
        if vector is not None:
            matrix[row] = vector
    return matrix

def build_model(embedding_matrix):
    """Create and compile the bidirectional-LSTM multi-label classifier.

    Architecture, in order: frozen embedding initialised from
    ``embedding_matrix`` -> spatial dropout (0.2) -> two stacked bidirectional
    CuDNN LSTMs returning full sequences -> global max pooling and global
    average pooling, concatenated -> two dense ReLU blocks, each added back to
    its input -> 6 sigmoid outputs.

    Args:
        embedding_matrix: 2-D array ``(vocab_rows, embedding_dim)`` used as the
            fixed (non-trainable) embedding weights.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss and the Adam
        optimizer.
    """
    vocab_rows, embedding_dim = embedding_matrix.shape

    token_ids = Input(shape=(None,))
    sequence_features = Embedding(
        vocab_rows, embedding_dim, weights=[embedding_matrix], trainable=False
    )(token_ids)
    sequence_features = SpatialDropout1D(0.2)(sequence_features)
    for _ in range(2):
        sequence_features = Bidirectional(
            CuDNNLSTM(LSTM_UNITS, return_sequences=True)
        )(sequence_features)

    # Summarise the sequence two ways; max first, then average.
    max_pooled = GlobalMaxPooling1D()(sequence_features)
    avg_pooled = GlobalAveragePooling1D()(sequence_features)
    pooled = concatenate([max_pooled, avg_pooled])

    # Two skip-connected dense blocks: output = input + relu(dense(input)).
    for _ in range(2):
        dense_out = Dense(DENSE_HIDDEN_UNITS, activation='relu')(pooled)
        pooled = add([pooled, dense_out])

    probabilities = Dense(6, activation='sigmoid')(pooled)

    classifier = Model(inputs=token_ids, outputs=probabilities)
    classifier.compile(loss='binary_crossentropy', optimizer='adam')
    return classifier

In [15]:
# Tokenize the comment text and pad it to fixed-length id sequences.
#
# The raw text is taken from the data frames rather than from x_train/x_test:
# this cell rebinds those two names to padded id arrays, so reading them back
# here would feed integers to the tokenizer whenever the cell is run again.
train_texts = train_df['comment_text'].astype(str)
test_texts = test_df['comment_text'].astype(str)

# The vocabulary is fitted on train and test text together.
tokenizer = text.Tokenizer(filters=special_chars)
tokenizer.fit_on_texts(list(train_texts) + list(test_texts))

# Text -> lists of word ids -> arrays with MAX_LEN columns.
x_train = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=MAX_LEN)
x_test = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=MAX_LEN)

In [18]:
# Build one matrix per pretrained embedding file and join them column-wise, so
# each vocabulary row carries the vectors from every file side by side.
embedding_matrix = np.hstack(
    [build_matrix(tokenizer.word_index, embedding_path) for embedding_path in EMBEDDING_FILES]
)

In [19]:
# Hold out 10% of the training rows for the per-epoch validation AUC.
# random_state pins the shuffle so the split is identical on every run.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train, y_train, test_size=0.1, random_state=42)

In [20]:
# Settings for the training run.
# EPOCHS: training epochs per model (this replaces the value from the config cell).
# SEEDS: number of models trained and averaged.
EPOCHS, SEEDS = 5, 10
# Initial value of the ensemble's test-set prediction.
pred = 0

In [21]:
# Train SEEDS models and average their test-set predictions.
#
# The running sum and the number of finished models are tracked separately and
# `pred` is refreshed after every model, so `pred` is always the mean over the
# models completed so far. This holds even if the loop is interrupted early or
# the cell is executed a second time (both used to leave `pred` wrongly scaled).
pred_sum = 0
models_done = 0
for ii in range(SEEDS):
    # Give every ensemble member its own fixed seed (TF2 API) so a re-run
    # starts from the same initial weights and shuffling.
    # NOTE(review): CuDNN kernels may still introduce run-to-run differences.
    np.random.seed(ii)
    tf.random.set_seed(ii)

    model = build_model(embedding_matrix)
    for global_epoch in range(EPOCHS):
        print(global_epoch)
        # One epoch per fit() call so the validation AUC can be reported after
        # each epoch. The scheduler ignores its own epoch argument (always 0
        # here) and decays the learning rate by 0.55 per global epoch instead.
        model.fit(
            X_train,
            Y_train,
            validation_data=(X_valid, Y_valid),
            batch_size=128,
            epochs=1,
            verbose=2,
            callbacks=[
                LearningRateScheduler(lambda _: 1e-3 * (0.55 ** global_epoch))
            ]
        )
        # Large inference batch: same predictions, far fewer steps than the default.
        val_preds = model.predict(X_valid, batch_size=1024)
        # Unweighted mean of the per-label ROC AUCs.
        AUC = np.mean([
            roc_auc_score(Y_valid[:, i], val_preds[:, i])
            for i in range(Y_valid.shape[1])
        ])
        print(AUC)

    pred_sum = pred_sum + model.predict(x_test, batch_size=1024, verbose=1)
    models_done += 1
    pred = pred_sum / models_done

    weights_path = 'model_weights_' + str(ii) + '.h5'
    model.save_weights(weights_path)
    # -f: replace a .gz left over from an earlier run; without it gzip refuses
    # to overwrite and the fresh weights stay uncompressed.
    os.system('gzip -f ' + weights_path)


0
1122/1122 - 128s - loss: 0.0501 - val_loss: 0.0405
0.986545434635814
1
1122/1122 - 122s - loss: 0.0391 - val_loss: 0.0386
0.9897863994269387
2
1122/1122 - 123s - loss: 0.0358 - val_loss: 0.0378
0.9896304125593662
3
1122/1122 - 123s - loss: 0.0334 - val_loss: 0.0383
0.9900989259026562
4
1122/1122 - 123s - loss: 0.0317 - val_loss: 0.0380
0.9898513166825671
0
1122/1122 - 124s - loss: 0.0506 - val_loss: 0.0401
0.9871825924595783
1
1122/1122 - 124s - loss: 0.0390 - val_loss: 0.0379
0.9903141125362994
2
1122/1122 - 123s - loss: 0.0357 - val_loss: 0.0371
0.9906148951982034
3
1122/1122 - 122s - loss: 0.0334 - val_loss: 0.0374
0.9903393262747885
4
1122/1122 - 122s - loss: 0.0317 - val_loss: 0.0376
0.9901061536712232
0
1122/1122 - 124s - loss: 0.0504 - val_loss: 0.0408
0.9884082276748174
1
1122/1122 - 123s - loss: 0.0388 - val_loss: 0.0381
0.9902785269105762
2
1122/1122 - 125s - loss: 0.0356 - val_loss: 0.0382
0.9906513271882563
3
1122/1122 - 125s - loss: 0.0335 - val_loss: 0.0373
0.9905433852

In [23]:
# Copy the averaged predictions into the sample-submission frame and save it.
# `pred` is initialised to the scalar 0 in an earlier cell; check its shape so
# that a missing training run cannot silently produce a constant submission.
expected_shape = (len(submission), len(target_labels))
assert np.shape(pred) == expected_shape, (
    "pred has shape {}, expected {}; run the training cell first".format(
        np.shape(pred), expected_shape))
submission[target_labels] = pred
submission.to_csv("submission.csv", index=False)