## Approach 1: Logistic Regression Model

This approach builds a Logistic Regression model to predict the safety of a product.

In [None]:
# Upgrade dependencies
! which python3
! pip3 install -r ../data/requirements.txt


In [None]:
import boto3
import os
import numpy as np
import pandas as pd
import nltk, re
import time
import torch
import torch.nn as nn

from os import path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch.nn import BCELoss
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("stopwords")

%matplotlib inline
import matplotlib.pyplot as plt

## 1. Reading the dataset

This approach uses the __pandas__ library to read our dataset.

#### __Training data:__

In [None]:
# Load the training set; the first row of the CSV holds the column names
train_df = pd.read_csv("../data/training.csv", header=0, encoding="utf-8")
# Preview the first ten rows
train_df.head(n=10)

#### __Test data:__

In [None]:
# Load the test set; the first row of the CSV holds the column names
test_df = pd.read_csv("../data/test.csv", header=0, encoding="utf-8")
# Preview the first ten rows
test_df.head(n=10)

## 2. Train a Logistic Regression Model
Apply pre-processing and vectorization operations and train the model.

In [None]:
# Check the number of missing values for each column.
# The training frame is the one cleaned in the next step, so report it as well
# as the test frame.
print("Training data:")
print(train_df.isna().sum())
print("Test data:")
print(test_df.isna().sum())

In [None]:
# Fill in the missing values of the text column with the empty string.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update
# train_df when pandas Copy-on-Write is active.
train_df["text"] = train_df["text"].fillna("")

In [None]:
# Start from the English stop-word list shipped with NLTK
stop = stopwords.words("english")

# Words that must NOT be treated as stop words (mostly negations)
excluding = [
    "against", "not",
    "don", "don't",
    "ain",
    "aren", "aren't",
    "couldn", "couldn't",
    "didn", "didn't",
    "doesn", "doesn't",
    "hadn", "hadn't",
    "hasn", "hasn't",
    "haven", "haven't",
    "isn", "isn't",
    "mightn", "mightn't",
    "mustn", "mustn't",
    "needn", "needn't",
    "shouldn", "shouldn't",
    "wasn", "wasn't",
    "weren", "weren't",
    "won", "won't",
    "wouldn", "wouldn't",
]

# Final stop-word list: NLTK's list minus the words above, original order kept
stop_words = list(filter(lambda candidate: candidate not in excluding, stop))

# Snowball stemmer for English, used when cleaning the text
snow = SnowballStemmer("english")

# Patterns used by process_text, compiled once instead of on every sentence.
# Raw strings are required: "\s" in a normal string is an invalid escape sequence.
_WHITESPACE_RE = re.compile(r"\s+")
_HTML_TAG_RE = re.compile(r"<.*?>")


# Process the text for cleaning
def process_text(texts):
    """Clean, tokenize, filter and stem every text in ``texts``.

    Each item is lower-cased, stripped, has runs of whitespace collapsed to a
    single space and HTML-like tags removed. It is then tokenized with
    ``word_tokenize``; tokens that are numeric, have two characters or fewer,
    or appear in ``stop_words`` are dropped, and the remaining tokens are
    stemmed with ``snow``.

    Args:
        texts: Iterable of raw texts. Items that are not ``str`` (for example
            missing values) are treated as the empty string.

    Returns:
        list[str]: One cleaned string per input item, stems joined by a
        single space, in the same order as ``texts``.
    """
    # stop_words is a list; a set gives constant-time membership checks
    stop_set = set(stop_words)

    final_text_list = []
    for sent in texts:
        # Treat missing / non-string values as empty text
        if not isinstance(sent, str):
            sent = ""

        # String cleaning: lowercase and trim leading/trailing whitespace
        sent = sent.lower().strip()
        # Collapse extra spaces, tabs and newlines
        sent = _WHITESPACE_RE.sub(" ", sent)
        # Remove HTML tags/markups
        sent = _HTML_TAG_RE.sub("", sent)

        # Keep non-numeric tokens longer than 2 characters that are not stop
        # words, and stem them
        filtered_sentence = [
            snow.stem(w)
            for w in word_tokenize(sent)
            if not w.isnumeric() and len(w) > 2 and w not in stop_set
        ]

        # Final string of cleaned words
        final_text_list.append(" ".join(filtered_sentence))

    return final_text_list

In [None]:
# Hold out 10% of the labelled rows for validation; the fixed random_state
# makes the shuffled split repeatable
text_features = train_df[["text"]]
tag_labels = train_df["human_tag"].values
X_train, X_val, y_train, y_val = train_test_split(
    text_features, tag_labels, test_size=0.10, shuffle=True, random_state=324
)

In [None]:
# Train and Val data clean
# X_train / X_val come out of train_test_split as selections of train_df, so
# writing a column into them in place can raise SettingWithCopyWarning.
# assign() returns new frames with the cleaned column instead.
print("Processing the text fields...")
X_train = X_train.assign(text=process_text(X_train["text"].tolist()))
X_val = X_val.assign(text=process_text(X_val["text"].tolist()))

In [None]:
# Use TF-IDF to turn each text into a vector with at most 750 features
tf_idf_vectorizer = TfidfVectorizer(max_features=750)

# Raw (cleaned) text of both splits
train_text = X_train["text"].values
val_text = X_val["text"].values

# Fit the vectorizer on the training text only
tf_idf_vectorizer.fit(train_text)

# Replace the text frames with their dense TF-IDF matrices
X_train = tf_idf_vectorizer.transform(train_text).toarray()
X_val = tf_idf_vectorizer.transform(val_text).toarray()

# Check data size
print("Shapes of features: Training and Validation")
print(X_train.shape, X_val.shape)

In [None]:
# Keep untouched copies of the vectorized splits so later cells can
# overwrite X_train / X_val and still restore them
X_train_store, X_val_store = X_train.copy(), X_val.copy()

In [None]:
# Restore X_train and X_val from the stored copies; run this before the second
# and later rounds of model optimization
X_train, X_val = X_train_store.copy(), X_val_store.copy()

In [None]:
# Check vectorization results.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() (available since 1.0) and fall back only on
# older releases.
if hasattr(tf_idf_vectorizer, "get_feature_names_out"):
    feature_names = tf_idf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_idf_vectorizer.get_feature_names()
list(feature_names[:10])

In [None]:
# Number of samples per weight update
batch_size = 16
# Number of full passes over the training data
epochs = 30
# Learning rate for SGD
lr = 0.005

# Seed PyTorch's RNG so the random weight initialization below is repeatable
torch.manual_seed(324)

# Run the training on the GPU if supported, else on the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Take the input width from the data instead of hard-coding 750: max_features
# is only an upper bound on the number of TF-IDF columns
n_features = X_train.shape[1]

# Build the two-layer network
net = nn.Sequential(
    # Hidden linear layer: n_features inputs -> 10 units
    nn.Linear(in_features=n_features, out_features=10),
    # ReLU activation is applied
    nn.ReLU(),
    # Output layer with a single unit
    nn.Linear(10, 1),
    # Sigmoid at the end turns the output into a probability
    nn.Sigmoid(),
)
net.to(device)


# Initialize the network
def init_weights(m):
    """Initialize a linear layer: weights ~ N(0, 1), biases zero.

    Intended for ``net.apply``; modules that are not ``nn.Linear`` are left
    unchanged.
    """
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, std=1)
        nn.init.zeros_(m.bias)


net.apply(init_weights)

# Define the loss. For binary classification the appropriate choice is Binary Cross Entropy.
# For sigmoid in the last layer, use nn.BCELoss.
# reduction="none" keeps per-sample losses; they are summed where the loss is used.
loss = BCELoss(reduction="none")

# Define the optimizer, SGD (Stochastic Gradient Descent) with learning rate
optimizer = torch.optim.SGD(net.parameters(), lr=lr)

# Use PyTorch DataLoaders to load the data in batches
train_dataset = TensorDataset(
    torch.as_tensor(X_train, dtype=torch.float32),
    torch.as_tensor(y_train, dtype=torch.float32),
)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

# Move the validation dataset to the CPU/GPU device.
# as_tensor accepts both NumPy arrays and tensors, so re-running this cell
# after X_val / y_val are already tensors does not trigger a copy warning.
X_val = torch.as_tensor(X_val, dtype=torch.float32).to(device)
y_val = torch.as_tensor(y_val, dtype=torch.float32).to(device)

In [None]:
# Lists to store the per-epoch average losses as the training progresses
train_losses = []
val_losses = []

# Loop over epochs
for epoch in range(epochs):
    start = time.time()
    training_loss = 0
    # Training loop: one optimizer step per mini-batch
    for data, target in train_loader:
        # Zero the parameter gradients
        optimizer.zero_grad()

        data = data.to(device)
        # Reshape targets to (batch, 1) to match the network output
        target = target.to(device).view(-1, 1)

        # Forward pass - compute the predictions of the NN on the batch
        output = net(data)
        # Sum the per-sample losses of the batch
        batch_loss = loss(output, target).sum()
        training_loss += batch_loss.item()
        # Calculate gradients
        batch_loss.backward()
        # Update weights with gradient descent
        optimizer.step()

    # Validation only needs the forward pass: disable autograd so no
    # computation graph is built for the whole validation set
    with torch.no_grad():
        # Get validation predictions
        val_predictions = net(X_val)
        # Calculate the validation loss
        val_loss = torch.sum(loss(val_predictions, y_val.view(-1, 1))).item()

    # Take the average losses
    training_loss = training_loss / len(y_train)
    val_loss = val_loss / len(y_val)

    train_losses.append(training_loss)
    val_losses.append(val_loss)

    end = time.time()
    print(
        f"Epoch {epoch}. Train_loss {training_loss}, Validation_loss {val_loss}, Seconds {end-start}"
    )

In [None]:
# Plot the per-epoch training and validation losses on one set of axes
fig, ax = plt.subplots()
ax.plot(train_losses, label="Training Loss")
ax.plot(val_losses, label="Validation Loss")
ax.set_title("Loss values")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
# Restrict the y-axis to the 0.25-0.5 range
ax.set_ylim([0.25, 0.5])
plt.show()

## 3. Predictions

Make Predictions with test data
1. Fill-in missing values
2. Clean and normalize text
3. Vectorization
4. Convert to Torch tensor
5. Get predictions
6. Round up to 1 or down to 0

You will save your predictions (with __test_predictions__ variable) to a CSV file later in section 4.

In [None]:
# Fill in missing values of the text column with the empty string.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not update
# test_df when pandas Copy-on-Write is active.
test_df["text"] = test_df["text"].fillna("")

In [None]:
# Clean and normalize the test text with the same pipeline used for training
test_text_clean = process_text(test_df["text"].tolist())
test_df["text"] = test_text_clean

In [None]:
# Vectorization: reuse the vectorizer already fitted on the training text
X_test = tf_idf_vectorizer.transform(test_df["text"].values).toarray()

# Check vectorization results (the label previously said "Training and Validation")
print("Shape of features: Test")
print(X_test.shape)

In [None]:
# Convert to a Torch tensor on the training device.
# as_tensor accepts both NumPy arrays and tensors, so re-running this cell
# after X_test is already a tensor does not trigger a copy warning.
X_test = torch.as_tensor(X_test, dtype=torch.float32).to(device)

In [None]:
# Get predictions. Inference needs no gradients, so disable autograd to avoid
# building a computation graph over the whole test set.
with torch.no_grad():
    test_probabilities = net(X_test)
# Round each probability to the nearest integer, 1 or 0
# (np.rint rounds exactly 0.5 down to 0)
test_predictions = np.rint(test_probabilities.cpu().numpy())

## 4. Write predictions to a CSV file

In [None]:
import pandas as pd

# Assemble the submission frame: the test IDs next to their predicted labels
result_df = pd.DataFrame().assign(
    ID=test_df["ID"],
    human_tag=test_predictions,
)

# Write the predictions without the index column
result_df.to_csv("../../data/approach_1_result.csv", index=False, encoding="utf-8")