In [None]:
import importlib
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score

from sotoxic.config import dataset_config
from sotoxic.data_helper.data_loader import DataLoader
from sotoxic.data_helper.data_transformer import DataTransformer
from sotoxic.train.trainer import PyTorchModelTrainer

import sotoxic.models.pytorch.rhn as rhn
import sotoxic.train.trainer as trn
importlib.reload(rhn)
importlib.reload(trn)

In [None]:
# Pre-trained word vectors to load. Alternatives kept for reference:
#   'features/glove.840B.300d.txt'
#   'features/glove.twitter.27B.200d.txt'  (presumably needs EMBEDDING_SIZE = 200)
EMBEDDING_FILE = 'features/crawl-300d-2M.vec'

# Passed to DataTransformer as max_num_words; also the row count of the nn.Embedding table.
VOCAB_SIZE = 100000
# Passed to DataTransformer as max_sequence_length; also embedded in the result file names.
MAX_SEQUENCE_LENGTH = 300
# Width of the nn.Embedding table and input_size of the classifier.
EMBEDDING_SIZE = 300

## Prepare training and testing data

In [None]:
print("Loading the dataset")

# Word-level (char_level=False) transformer bounded by the config-cell limits.
data_transformer = DataTransformer(
    max_num_words=VOCAB_SIZE,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    char_level=False,
)
# Used further down to read the pre-trained embedding file.
data_loader = DataLoader()

# prepare_data() returns the training inputs, their labels and the test inputs, in that order.
(train_sequences,
 training_labels,
 test_sequences) = data_transformer.prepare_data()

In [None]:
# Sanity check: display the first 20 prepared training sequences.
train_sequences[:20]

In [None]:
print("Loading pre-trained word embedding.")
# Read the pre-trained vectors from EMBEDDING_FILE via the project DataLoader.
embeddings_index = data_loader.load_embedding(EMBEDDING_FILE)
# Turn the loaded vectors into the matrix that the model factory copies into nn.Embedding.
# NOTE(review): the factory copies this into a (VOCAB_SIZE, EMBEDDING_SIZE) weight, so the
# matrix presumably has exactly that shape — confirm against build_embedding_matrix.
embedding_matrix = data_transformer.build_embedding_matrix(embeddings_index)
print("Loaded")

## Build model

In [None]:
def get_recurrent_higway_classifier():
    """Build a new recurrent-highway classifier on top of frozen pre-trained embeddings.

    Takes no arguments so it can be passed to the trainer as ``get_model_func``;
    it reads the notebook-level ``VOCAB_SIZE``, ``EMBEDDING_SIZE`` and
    ``embedding_matrix`` instead.

    Returns:
        rhn.RecurrentHighwayClassifier: a freshly constructed model whose embedding
        weights are copied from ``embedding_matrix`` and have ``requires_grad``
        switched off.
    """
    frozen_embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_SIZE)
    pretrained_weights = torch.from_numpy(embedding_matrix)
    # copy_ writes into the existing weight tensor, so the layer keeps its own dtype.
    frozen_embedding.weight.data.copy_(pretrained_weights)
    # Keep the pre-trained vectors fixed during training.
    frozen_embedding.weight.requires_grad = False

    classifier_kwargs = dict(
        input_size=EMBEDDING_SIZE,
        hidden_size=60,
        embedding=frozen_embedding,
        recurrence_length=2,
        recurrent_dropout=0.25,
    )
    return rhn.RecurrentHighwayClassifier(**classifier_kwargs)


## Training

In [None]:
# Trainer shared by both training cells below.
# NOTE(review): the stamp reads "64_64" while the model factory uses hidden_size=60 —
# confirm the label still describes the model being trained.
trainer = trn.PyTorchModelTrainer(
    model_stamp="FASTTEXT_RHN_64_64",
    epoch_num=300,
    learning_rate=1e-3,
    verbose_round=80,
    shuffle_inputs=False,
    early_stopping_round=12,
)

In [None]:
# 10-fold training; the factory is passed (not called) so the trainer can build models itself.
models, best_logloss, best_auc, best_val_pred = trainer.train_folds(
    X=train_sequences,
    y=training_labels,
    fold_count=10,
    batch_size=256,
    get_model_func=get_recurrent_higway_classifier,
)

In [None]:
# Continue training the models from the previous cell. This overwrites best_logloss,
# best_auc and best_val_pred with the values returned by this second call.
new_models, best_logloss, best_auc, best_val_pred = trainer.keep_train_folds(
    X=train_sequences,
    y=training_labels,
    fold_count=10,
    batch_size=256,
    old_models=models,
)

In [None]:
# Display the log-loss returned by the most recent training call
# (keep_train_folds overwrites the train_folds value once that cell has run).
best_logloss

## Make predictions

In [None]:
# Join the per-fold validation predictions along the row axis (concatenate's default axis).
# NOTE(review): scoring against training_labels assumes the folds come back in the same
# row order as the training data — confirm against the trainer's fold split.
train_fold_preditcions = np.concatenate(best_val_pred)
training_auc = roc_auc_score(
    y_true=training_labels,
    y_score=train_fold_preditcions,
)
print("Training AUC", training_auc)

In [None]:
# Raw competition files; only their "id" columns are used when writing the result CSVs.
path = 'Dataset/'
TRAIN_DATA_FILE = path + 'train.csv'
TEST_DATA_FILE = path + 'test.csv'

test_df = pd.read_csv(TEST_DATA_FILE)
train_df = pd.read_csv(TRAIN_DATA_FILE)

In [None]:
# Output columns, in the order they are written to the result files.
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submit_path_prefix = "results/rhn/Fasttext-tunedRHN-" + str(MAX_SEQUENCE_LENGTH)

print("Predicting testing results...")
# One prediction matrix per fold model.
test_predicts_list = [
    fold_model.predict(test_sequences, batch_size=256, verbose=1)
    for fold_model in new_models
]

# Average the folds. The running sum lives in an np.zeros array (float64 by default),
# so the accumulation does not depend on the precision of the individual predictions.
test_predicts = np.zeros(test_predicts_list[0].shape)
for fold_predict in test_predicts_list:
    test_predicts += fold_predict
test_predicts /= len(test_predicts_list)

# Column vector of test ids, one per prediction row.
test_ids = test_df["id"].values.reshape((-1, 1))

# Assemble the submission frame with "id" as the leading column.
test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
test_predicts["id"] = test_ids
test_predicts = test_predicts[["id", *CLASSES]]

In [None]:
# The file name records the log-loss and AUC returned by the last training call.
# NOTE(review): "{:4f}" means minimum width 4 with the default six decimals; "{:.4f}"
# (four decimals) was probably intended. Left unchanged so existing file names stay stable.
submit_path = submit_path_prefix + "-L{:4f}-A{:4f}.csv".format(best_logloss, best_auc)

# DataFrame.to_csv does not create missing directories. Make sure the target folder exists
# so a finished training run is not lost on the final write.
Path(submit_path).parent.mkdir(parents=True, exist_ok=True)
test_predicts.to_csv(submit_path, index=False)

In [None]:
# Column vector of training ids, one per out-of-fold prediction row.
# NOTE(review): pairing these ids with train_fold_preditcions assumes the concatenated
# fold predictions are in the same row order as train.csv — confirm against the trainer.
train_ids = train_df["id"].values
train_ids = train_ids.reshape((len(train_ids), 1))

# Assemble the out-of-fold prediction frame with "id" as the leading column.
train_predicts = pd.DataFrame(data=train_fold_preditcions, columns=CLASSES)
train_predicts["id"] = train_ids
train_predicts = train_predicts[["id"] + CLASSES]

# NOTE(review): "{:4f}" means minimum width 4 with the default six decimals; "{:.4f}"
# was probably intended. Left unchanged so existing file names stay stable.
submit_path = submit_path_prefix + "-Train-L{:4f}-A{:4f}.csv".format(best_logloss, best_auc)

# DataFrame.to_csv does not create missing directories. Make sure the target folder exists
# so a finished training run is not lost on the final write.
Path(submit_path).parent.mkdir(parents=True, exist_ok=True)
train_predicts.to_csv(submit_path, index=False)