## Approach 2: Recurrent Neural Networks

This approach builds an RNN model to predict the safety of a product.

In [None]:
# Show which python3 is on the PATH, then install the packages listed in
# ../../requirements.txt (the leading `!` runs the line as a shell command).
! which python3
! pip3 install -r ../../requirements.txt

In [None]:
import re, time
import numpy as np
import torch, torchtext
import boto3
import os
import pandas as pd

from os import path
from collections import Counter
from torch import nn, optim
from torch.nn import BCEWithLogitsLoss
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from torchtext.vocab import GloVe

## 1. Reading the dataset

Use the __pandas__ library to read the dataset.

#### __Training data:__

In [None]:
# Load the labelled training data; the first row holds the column names.
train_df = pd.read_csv(
    "../data/training.csv",
    header=0,
    encoding="utf-8",
)
# Preview the first rows.
train_df.head()

#### __Test data:__

In [None]:
# Load the data that predictions will be made on and preview the first rows.
# NOTE(review): this reads the same '../data/training.csv' file as the training
# cell — confirm whether a separate test file was intended here.
test_df = pd.read_csv('../data/training.csv', encoding='utf-8', header=0)
test_df.head()

## 2. Train an RNN model

In [None]:
# Show the class balance of the label column. A bare expression is only
# displayed when it is the last statement of a cell, so print it explicitly.
print(train_df["human_tag"].value_counts())

# Convert the label column to float64.
train_df["human_tag"] = train_df["human_tag"].astype(np.float64)

In [None]:
# Count the missing values in each column and print the totals
missing_per_column = train_df.isna().sum()
print(missing_per_column)

In [None]:
# Replace missing entries of the "text" column with the empty string
text_column = train_df["text"]
train_df["text"] = text_column.fillna("")

In [None]:
# Hold out 10% of the labelled rows for validation (shuffled, fixed seed)
all_texts = train_df["text"].tolist()
all_labels = train_df["human_tag"].tolist()
train_text, val_text, train_label, val_label = train_test_split(
    all_texts, all_labels, test_size=0.10, shuffle=True, random_state=324
)

In [None]:
# Build the vocabulary from every token that occurs in the training text
tokenizer = get_tokenizer("basic_english")
counter = Counter(token for line in train_text for token in tokenizer(line))
# min_freq=1 keeps every token that was seen at least once
vocab = Vocab(counter, min_freq=1)

In [None]:
# Mapper that turns raw text into a list of vocabulary indices
def text_transform_pipeline(x):
    """Tokenize `x` and look up the vocabulary index of every token."""
    return [vocab[token] for token in tokenizer(x)]

In [3]:
# Transformation helper: convert each text to token indices, truncate it to
# max_len tokens and right-pad shorter texts with zeros so every row has the
# same length.
def transformText(text_list, max_len):
    """Convert texts to a fixed-width tensor of token indices.

    Args:
        text_list: Iterable of strings to transform.
        max_len: Number of token indices kept per text. Longer texts are
            truncated; shorter ones are padded with 0 at the end.

    Returns:
        A torch.int64 tensor of shape (number of texts, max_len).
    """
    # NOTE(review): the padding value 0 is whatever index 0 means in `vocab`;
    # confirm it is the intended padding token.
    padded_rows = []
    for text in text_list:
        token_ids = text_transform_pipeline(text)[:max_len]
        # Pad with integer zeros (not float np.zeros) so rows stay pure ints.
        token_ids = token_ids + [0] * (max_len - len(token_ids))
        padded_rows.append(token_ids)

    # The explicit reshape keeps the result 2-D even when text_list is empty.
    return torch.tensor(padded_rows, dtype=torch.int64).reshape(
        len(padded_rows), max_len
    )

In [None]:
# Build the training and validation data loaders from the padded token tensors.
# max_len=100 keeps the first 100 tokens of every text.
max_len = 100
batch_size = 16

# Training split: features are padded token indices, targets are the labels
train_features = transformText(train_text, max_len)
train_targets = torch.tensor(train_label)
train_dataset = TensorDataset(train_features, train_targets)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

# Validation split, prepared the same way
val_features = transformText(val_text, max_len)
val_targets = torch.tensor(val_label)
val_dataset = TensorDataset(val_features, val_targets)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Load the GloVe word vectors (name="6B", 300 dimensions) and look up one
# vector per vocabulary entry, in the order of vocab.itos.
# NOTE(review): tokens that are not in GloVe presumably get a default vector —
# confirm against the torchtext documentation.
glove = GloVe(name="6B", dim=300)
embedding_matrix = glove.get_vecs_by_tokens(vocab.itos)

In [None]:
# Size of the RNN hidden state vectors
hidden_size = 8

# General NN training parameters: optimizer step size and number of passes
# over the training data
learning_rate = 0.001
epochs = 25

# Embedding vector and vocabulary sizes.
# embed_size matches the 300-dimensional GloVe vectors; vocab_size is the
# number of entries in the vocabulary.
embed_size = 300  # glove.6B.300d.txt
vocab_size = len(vocab.itos)

In [None]:
# Model is made of these layers
# Embedding layer: maps token indices to word vectors.
# RNN layer: a simple RNN with `num_layers` stacked layers.
# Linear layer: a single output neuron fed with the hidden states of all time steps.
class Net(nn.Module):
    """Binary text classifier: embedding -> RNN -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_size: Dimension of each word vector.
        hidden_size: Dimension of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
        seq_len: Number of tokens per input sequence. Defaults to the
            notebook-level ``max_len`` so existing callers keep working.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1, seq_len=None):
        super().__init__()
        if seq_len is None:
            seq_len = max_len
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: the inputs arrive as (batch, seq_len, embed_size).
        # With the default (False) the RNN would treat the batch dimension as
        # time and run the recurrence across different samples.
        self.rnn = nn.RNN(
            embed_size, hidden_size, num_layers=num_layers, batch_first=True
        )

        # The linear layer consumes the hidden states of all time steps.
        self.linear = nn.Linear(hidden_size * seq_len, 1)
        self.act = nn.Sigmoid()

    def forward(self, inputs):
        """Return a (batch, 1) tensor of probabilities.

        Args:
            inputs: Integer tensor of token indices, shape (batch, seq_len).
        """
        embeddings = self.embedding(inputs)
        # outputs: (batch, seq_len, hidden_size)
        outputs, _ = self.rnn(embeddings)
        # Flatten the outputs of every time step and send them all together
        # to the linear layer.
        outs = self.linear(outputs.reshape(outputs.shape[0], -1))
        return self.act(outs)
    
# Instantiate the classifier with a 2-layer RNN
model = Net(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=2,
)

# Initialize the weights
def init_weights(m):
    """Xavier-uniform initialise the weight matrices of Linear and RNN modules.

    Intended to be passed to ``nn.Module.apply``. Biases and all other module
    types are left untouched.

    Args:
        m: The module to initialise in place.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
    elif isinstance(m, nn.RNN):
        # Use the public parameter iterator rather than the private
        # _flat_weights_names / _parameters attributes.
        for name, param in m.named_parameters():
            if "weight" in name:
                nn.init.xavier_uniform_(param)

In [None]:
# Copy the GloVe vectors into the embedding layer without tracking gradients
with torch.no_grad():
    model.embedding.weight.copy_(embedding_matrix)

# Freeze the embedding layer so training leaves the GloVe vectors unchanged
model.embedding.weight.requires_grad = False

In [None]:
# Binary cross-entropy loss on the model's probabilities.
# reduction="sum" adds up the losses of all samples in a batch.
cross_ent_loss = nn.BCELoss(reduction="sum")

# Plain SGD optimizer over the model's parameters
trainer = optim.SGD(model.parameters(), lr=learning_rate)

In [None]:
# Get the compute device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# (Re-)initialise the Linear/RNN weights, then move the model to the device
model.apply(init_weights)
model.to(device)

for epoch in range(epochs):
    start = time.time()
    training_loss = 0
    val_loss = 0

    # Training loop, train the network
    model.train()
    for data, target in train_loader:
        trainer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        # The model outputs shape (batch, 1), so give the targets the same shape
        loss = cross_ent_loss(output, target.unsqueeze(1))
        training_loss += loss.item()
        loss.backward()
        trainer.step()

    # Validate the network: no weight update, so switch to eval mode and
    # disable gradient tracking to avoid building an unused autograd graph.
    model.eval()
    with torch.no_grad():
        for data, target in val_loader:
            val_predictions = model(data.to(device))
            loss = cross_ent_loss(val_predictions, target.to(device).unsqueeze(1))
            val_loss += loss.item()

    # The loss uses reduction="sum", so divide by the number of samples to
    # get the average loss per sample.
    training_loss = training_loss / len(train_label)
    val_loss = val_loss / len(val_label)

    end = time.time()
    print(
        f"Epoch {epoch}. Train_loss {training_loss}. Val_loss {val_loss}. Seconds {end-start}"
    )

## 3. Make predictions on your test dataset

In [None]:
# Fill in missing values with the empty string.
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place update is not guaranteed to modify
# test_df itself. This also matches how the training data is handled.
test_df["text"] = test_df["text"].fillna("")

In [None]:
# Select the "text" column of the test data; it is the input for prediction
test_text = test_df["text"]

In [None]:
# Predict a label for every test text, batch by batch
test_prediction = []
test_dataset = TensorDataset(transformText(test_text, max_len))
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Inference only: use eval mode and skip gradient tracking
model.eval()
with torch.no_grad():
    # Each batch is a one-element list because the dataset holds one tensor
    for (batch,) in test_loader:
        test_preds = model(batch.to(device))
        # Round the (batch, 1) probabilities to the nearest integer and keep
        # the single output column as the predicted labels
        test_prediction.extend(np.rint(test_preds.cpu().numpy())[:, 0])

## 4. Write predictions to a CSV file

In [None]:
import pandas as pd

# Pair every test ID with its predicted label
result_df = pd.DataFrame(
    {
        "ID": test_df["ID"],
        "human_tag": test_prediction,
    }
)

# Write the predictions to a CSV file without the index column
result_df.to_csv("../../data/approach_2_result.csv", encoding='utf-8', index=False)