<a href="https://colab.research.google.com/github/tysonbarreto/NLP/blob/main/Quora_Insincere_Question_Classification_Deep_Learning_%26_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
import nltk

import torch
from torch.utils.data import TensorDataset,DataLoader

import torch.nn as nn
import torch.nn.functional as F

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Download Data from Kaggle

In [None]:
# Tell the Kaggle CLI where to look for its credentials. Keep any value already
# provided by the environment so the notebook is not tied to Colab's /content.
os.environ.setdefault('KAGGLE_CONFIG_DIR', '/content')

In [None]:
! kaggle competitions download -c quora-insincere-questions-classification -f train.csv
! kaggle competitions download -c quora-insincere-questions-classification -f test.csv
! kaggle competitions download -c quora-insincere-questions-classification -f sample_submission.csv


train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
train_fname = './train.csv.zip'
test_fname = './test.csv.zip'
sub_fname = './sample_submission.csv.zip'


In [None]:
raw_df = pd.read_csv(train_fname)
test_df = pd.read_csv(test_fname)
sub_df = pd.read_csv(sub_fname)


In [None]:
# Seed the subsample so the 100k rows (and every metric derived from them) are
# identical on each Restart & Run All.
sample_df = raw_df.sample(100_000, random_state=42)

# Prepare the Data for Training

* Convert text to TF-IDF
* Convert vectors to PyTorch Tensors
* Create PyTorch Data Loaders

In [None]:
stemmer = SnowballStemmer(language='english')

In [None]:
english_stopwords = stopwords.words('english')

In [None]:
def tokenize(text):
  """Split `text` with `word_tokenize` and return the stem of every token, in order."""
  stems = []
  for token in word_tokenize(text):
    stems.append(stemmer.stem(token))
  return stems

In [None]:
# The tokenizer emits stems, so the stop list must be in the same tokenized + stemmed
# form; otherwise stop words whose stem differs from the word itself survive as features.
stemmed_stopwords = sorted({stem for word in english_stopwords for stem in tokenize(word)})

vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    token_pattern=None,  # ignored when a tokenizer is supplied; None silences the warning
    stop_words=stemmed_stopwords,
    max_features=1000,
)

In [None]:
vectorizer.fit(sample_df['question_text'])



In [None]:
len(vectorizer.get_feature_names_out())

1000

# Transform questions into Vectors

In [None]:
inputs = vectorizer.transform(sample_df.question_text)

In [None]:
inputs.shape

(100000, 1000)

In [None]:
test_inputs = vectorizer.transform(test_df.question_text)

In [None]:
test_inputs.shape

(375806, 1000)

In [None]:
targets = sample_df.target

# Split the Training and Validation set

In [None]:
# Stratify on the label so the minority class keeps the same share in both splits.
# NOTE(review): train_size=0.3 trains on 30% of the sample and validates on 70% —
# confirm this is intended rather than test_size=0.3.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, train_size=0.3, random_state=42, stratify=targets
)

In [None]:
train_inputs.shape, val_inputs.shape

((30000, 1000), (70000, 1000))

In [None]:
train_targets.shape, val_targets.shape

((30000,), (70000,))

# Convert to PyTorch Tensors

In [None]:
# Cast to float32 while the matrices are still sparse so the dense array is built once,
# directly in the dtype the model consumes (no float64 dense intermediate).
train_input_tensors = torch.from_numpy(train_inputs.astype('float32').toarray())
val_input_tensors = torch.from_numpy(val_inputs.astype('float32').toarray())

In [None]:
train_input_tensors.shape,val_input_tensors.shape

(torch.Size([30000, 1000]), torch.Size([70000, 1000]))

In [None]:
train_target_tensors = torch.tensor(train_targets.values).float()
val_target_tensors = torch.tensor(val_targets.values).float()

In [None]:
train_target_tensors.shape,val_target_tensors.shape

(torch.Size([30000]), torch.Size([70000]))

In [None]:
# The test matrix is 375806 x 1000: densifying it as float64 needs about 3 GB before the
# float32 copy is even made. Casting while sparse halves the peak memory.
test_inputs_tensors = torch.from_numpy(test_inputs.astype('float32').toarray())

# Create PyTorch Dataloader

In [None]:
train_ds = TensorDataset(train_input_tensors,train_target_tensors)
val_ds = TensorDataset(val_input_tensors,val_target_tensors)

In [None]:
test_ds = TensorDataset(test_inputs_tensors)

In [None]:
train_dl = DataLoader(train_ds,batch_size=128,shuffle=True)

In [None]:
val_dl = DataLoader(val_ds,batch_size=128)
test_dl = DataLoader(test_ds,batch_size=128)

In [None]:
for batch in train_dl:
  batch_inputs = batch[0]
  batch_targets = batch[1]
  print(batch_inputs.shape, batch_targets.shape)
  break

torch.Size([128, 1000]) torch.Size([128])


In [None]:
len(train_dl)

235

# Training a Deep Learning Model

In [None]:
class QuoraNet(nn.Module):
  """Feed-forward binary classifier over fixed-length feature vectors.

  Five linear layers with a ReLU after each of the first four. The last layer
  emits one raw logit per example (no sigmoid is applied here), so callers pass
  the output through `torch.sigmoid` or a logits-based loss.

  Args:
    input_dim: Number of input features. Defaults to 1000, the width that was
      previously hard-coded.
    hidden_dims: Widths of the four hidden layers, in order. Defaults to the
      previously hard-coded (512, 256, 128, 64).

  Raises:
    ValueError: If `hidden_dims` does not contain exactly four widths.
  """

  def __init__(self, input_dim=1000, hidden_dims=(512, 256, 128, 64)):
    super().__init__()
    if len(hidden_dims) != 4:
      raise ValueError('hidden_dims must contain exactly four layer widths')
    h1, h2, h3, h4 = hidden_dims
    # Attribute names and creation order are unchanged so state_dict keys and
    # seeded initialisation match the fixed-width version of this network.
    self.layer1 = nn.Linear(input_dim, h1)
    self.layer2 = nn.Linear(h1, h2)
    self.layer3 = nn.Linear(h2, h3)
    self.layer4 = nn.Linear(h3, h4)
    self.layer5 = nn.Linear(h4, 1)

  def forward(self, inputs):
    """Return logits of shape (batch, 1) for `inputs` of shape (batch, input_dim)."""
    out = F.relu(self.layer1(inputs))
    out = F.relu(self.layer2(out))
    out = F.relu(self.layer3(out))
    out = F.relu(self.layer4(out))
    return self.layer5(out)


In [None]:
model = QuoraNet()

In [None]:
# Smoke-test the untrained model on a single training batch.
for batch in train_dl:
  # Named `batch_in` rather than `bin`, which would shadow the Python builtin.
  batch_in, bt = batch
  print(batch_in.shape,bt.shape)

  batch_out = model(batch_in)

  # The model returns (batch, 1) logits; take column 0 and squash to probabilities.
  probs = torch.sigmoid(batch_out[:,0])
  preds = (probs>0.5).float()

  print('Accuracy Score',accuracy_score(bt,preds))
  print('F1 Score',f1_score(bt,preds))

  # Cross-entropy is defined on probabilities. Fed hard 0/1 predictions, every
  # mistake lands on the clamped log term and the number carries no information.
  print('Binary Cross Entropy Loss',F.binary_cross_entropy(probs,bt).detach())

  break

torch.Size([128, 1000]) torch.Size([128])
Accuracy Score 0.9453125
F1 Score 0.0
Binary Cross Entropy Loss tensor(5.4688)


In [None]:
bt[:5],probs[:10],preds[:10]

(tensor([0., 0., 1., 0., 0.]),
 tensor([0.4975, 0.4976, 0.4976, 0.4977, 0.4976, 0.4974, 0.4973, 0.4976, 0.4976,
         0.4975], grad_fn=<SliceBackward0>),
 tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))

In [None]:
# Evaluate model performance

def evaluate(model, dl, loss_weight=20.0):
  """Score `model` on every batch of `dl` without tracking gradients.

  Metrics are accumulated over all examples rather than averaged per batch:
  F1 is not additive across batches, and a batch with no positive example
  would otherwise contribute a score of 0 and drag the mean down.

  Args:
    model: Module mapping a (batch, features) tensor to (batch, 1) logits.
    dl: Iterable of `(inputs, targets)` batches with binary 0/1 targets.
    loss_weight: Constant factor applied to the mean binary cross-entropy.
      Defaults to 20, the value that was previously hard-coded.

  Returns:
    Tuple `(loss, accuracy, f1)` of Python floats. `f1` is for the positive
    class and is 0.0 when there are no true or predicted positives. All three
    are NaN when `dl` yields no examples.
  """
  was_training = model.training
  model.eval()

  loss_sum = 0.0
  n_seen = n_correct = 0
  tp = fp = fn = 0
  try:
    with torch.no_grad():
      for inputs, targets in dl:
        logits = model(inputs)[:, 0]
        targets = targets.float()

        # The logits form is numerically stabler than sigmoid followed by BCE.
        loss_sum += F.binary_cross_entropy_with_logits(
            logits, targets, reduction='sum').item()

        preds = torch.sigmoid(logits) > 0.5
        truth = targets == 1

        n_seen += targets.numel()
        n_correct += (preds == truth).sum().item()
        tp += (preds & truth).sum().item()
        fp += (preds & ~truth).sum().item()
        fn += (~preds & truth).sum().item()
  finally:
    # Leave the model in whatever mode the caller had it in.
    model.train(was_training)

  if n_seen == 0:
    nan = float('nan')
    return nan, nan, nan

  f1_denominator = 2 * tp + fp + fn
  f1 = 2 * tp / f1_denominator if f1_denominator else 0.0
  return loss_weight * loss_sum / n_seen, n_correct / n_seen, f1

In [None]:
evaluate(model,train_dl)

(13.77531623840332, 0.9388408687943262, 0.0)

In [None]:
# Train the model
def fit(epochs, lr, model, train_dl, val_dl):
  """Train `model` with Adam and report validation metrics after every epoch.

  Args:
    epochs: Number of full passes over `train_dl`.
    lr: Learning rate passed to Adam.
    model: Module mapping a batch of inputs to one logit per example.
    train_dl: Iterable of `(inputs, targets)` training batches; targets are
      float 0/1 values.
    val_dl: Iterable of validation batches, passed to `evaluate`.

  Returns:
    List with one `[loss, accuracy, f1]` entry per epoch, as returned by
    `evaluate(model, val_dl)`.
  """
  history = []
  optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=1e-5)

  for epoch in range(epochs):
    # Make sure train-time behaviour is active, whatever mode the model was left in.
    model.train()

    for inputs, targets in train_dl:
      # Clear gradients first so anything accumulated before this step
      # (e.g. from an earlier backward pass) cannot leak into the update.
      optimizer.zero_grad()

      logits = model(inputs).flatten()

      # Fused sigmoid + cross-entropy: numerically stabler than applying them separately.
      loss = F.binary_cross_entropy_with_logits(logits, targets)
      loss.backward()
      optimizer.step()

    loss, acc, f1 = evaluate(model, val_dl)
    history.append([loss, acc, f1])

    print(f'Epoch {epoch}; Loss {loss:.4f}; Accuracy {acc:.4f}; F1 Score {f1:.4f}')
  return history

In [None]:
fit(10,0.001,model, train_dl, val_dl)

Epoch 0; Loss 3.2642; Accuracy 0.9383; F1 Score 0.0000
Epoch 1; Loss 3.2794; Accuracy 0.9450; F1 Score 0.3430
Epoch 2; Loss 3.2776; Accuracy 0.9454; F1 Score 0.3800
Epoch 3; Loss 3.6345; Accuracy 0.9422; F1 Score 0.4287
Epoch 4; Loss 4.6007; Accuracy 0.9279; F1 Score 0.4305
Epoch 5; Loss 5.7497; Accuracy 0.9412; F1 Score 0.3848
Epoch 6; Loss 6.0131; Accuracy 0.9325; F1 Score 0.4263
Epoch 7; Loss 6.4648; Accuracy 0.9339; F1 Score 0.4114
Epoch 8; Loss 7.3291; Accuracy 0.9414; F1 Score 0.3964
Epoch 9; Loss 7.6302; Accuracy 0.9389; F1 Score 0.4042


In [None]:
model= QuoraNet()
fit(10,0.0001,model, train_dl, val_dl)

Epoch 0; Loss 4.1717; Accuracy 0.9383; F1 Score 0.0000
Epoch 1; Loss 3.5773; Accuracy 0.9383; F1 Score 0.0000
Epoch 2; Loss 3.3307; Accuracy 0.9383; F1 Score 0.0000
Epoch 3; Loss 3.2724; Accuracy 0.9432; F1 Score 0.2291
Epoch 4; Loss 3.2832; Accuracy 0.9432; F1 Score 0.3539
Epoch 5; Loss 3.3542; Accuracy 0.9405; F1 Score 0.4195
Epoch 6; Loss 3.3910; Accuracy 0.9424; F1 Score 0.4024
Epoch 7; Loss 3.5675; Accuracy 0.9431; F1 Score 0.3877
Epoch 8; Loss 3.8661; Accuracy 0.9439; F1 Score 0.3594
Epoch 9; Loss 4.1053; Accuracy 0.9394; F1 Score 0.3929


[[4.171741962432861, 0.9383467452337424, 0.0],
 [3.5773234367370605, 0.9383467452337424, 0.0],
 [3.330695390701294, 0.9383467452337424, 0.0],
 [3.272437572479248, 0.9432191009401933, 0.22911721290095063],
 [3.2832157611846924, 0.9432333833899191, 0.3538921668571937],
 [3.3541836738586426, 0.9405462424915121, 0.4194528109278877],
 [3.3909831047058105, 0.9423886785061373, 0.4024005691127017],
 [3.567502498626709, 0.9430885185427004, 0.3877420702816257],
 [3.8660829067230225, 0.9438903760773049, 0.3593992327533162],
 [4.105301856994629, 0.9393730412640375, 0.39294959550632]]

# Predictions

In [None]:
def predict_df(df):
  """Predict a 0/1 label for every row of `df`.

  Uses the notebook-level `vectorizer` and `model`.

  Args:
    df: DataFrame with a `question_text` column.

  Returns:
    1-D float tensor with one entry per row: 1.0 where the sigmoid of the
    model output exceeds 0.5, otherwise 0.0.
  """
  features = vectorizer.transform(df.question_text)
  input_tensors = torch.tensor(features.toarray()).float()
  # Inference only: skip autograd bookkeeping for the forward pass.
  with torch.no_grad():
    outputs = model(input_tensors)
  probs = torch.sigmoid(outputs.flatten())
  return (probs > 0.5).float()

In [None]:
predict_df(sample_df.iloc[15:20]),sample_df.iloc[15:20].question_text.values ,sample_df.iloc[15:20].target

(tensor([0., 0., 1., 0., 0.]),
 array(["Is there any honest woman out there that don't cheat and lie that would like a long-term relationship and try to start a family?",
        "What do girls think of boys who wears girls bra's for fantasy and they are not gay?",
        'In Genesis, after the flood, God says that He put the rainbow in the sky as a sign of His covenant. Do Christians believe that rainbows were not created until this point in history? Or did rainbows exist before the great flood?',
        'How did ground zero smell like?',
        'How much does buttered chicken cost?'], dtype=object),
 1019962    0
 939168     0
 879400     0
 1300124    0
 1289159    0
 Name: target, dtype: int64)