In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv
/kaggle/input/fasttext-embedding/wiki-news-300d-1M.vec


In [9]:
import os
import torch
import io
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch.nn as nn
from typing import Dict
import tensorflow as tf
from sklearn import metrics
import logging

In [10]:
import psutil

def check_memory():
    memory_info = psutil.virtual_memory()
    print(f"Total Memory: {memory_info.total / (1024 ** 3):.2f} GB")
    print(f"Available Memory: {memory_info.available / (1024 ** 3):.2f} GB")
    print(f"Used Memory: {memory_info.used / (1024 ** 3):.2f} GB")
    print(f"Percentage Used: {memory_info.percent}%")

check_memory()

Total Memory: 31.36 GB
Available Memory: 29.63 GB
Used Memory: 1.28 GB
Percentage Used: 5.5%


In [11]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 10

In [12]:
class LSTM(nn.Module):
    def __init__(self, embedding_matrix: np.ndarray):
        """
        :param embedding_matrix: numpy array with vectors for all words
        """
        super(LSTM, self).__init__()
        # number of words = number of rows in embedding matrix
        num_words = embedding_matrix.shape[0]
        # dimension of embedding is num of columns in the matrix
        embed_dim = embedding_matrix.shape[1]

        # we define an input embedding layer
        self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=embed_dim)

        # embedding matrix is used as weights of the embedding layer
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32)
        )

        # we don't want to train the pre-trained embeddings
        self.embedding.weight.requires_grad = False

        # a simple bidirectional LSTM with hidden size of 128
        self.lstm = nn.LSTM(
            embed_dim,
            128,
            bidirectional=True,
            batch_first=True,
        )

        # output layer which is a linear layer, we have only one output
        # input (512) = 128 + 128 for mean and same for max pooling
        self.out = nn.Linear(512, 1)

    def forward(self, x):
        # pass data through embedding layer
        # the input is just the tokens
        x = self.embedding(x)

        # move embedding output to lstm
        x, _ = self.lstm(x)

        # apply mean and max pooling on lstm output
        avg_pool = torch.mean(x, 1)
        max_pool, _ = torch.max(x, 1)

        # concatenate mean and max pooling
        # this is why size is 512
        # 128 for each direction = 256
        # avg_pool = 256 and max_pool = 256
        out = torch.cat((avg_pool, max_pool), 1)

        # pass through the output layer and return the output
        out = self.out(out)

        # return linear output
        return out

In [13]:
class IMDBDataset:
    def __init__(self, reviews: np.array, targets: np.array):
        """
        :param reviews: this is a numpy array
        :param targets: a vector, numpy array
        """
        self.reviews = reviews
        self.targets = targets

    def __len__(self) -> int:
        # returns length of the dataset
        return len(self.reviews)

    def __getitem__(self, item) -> Dict[str, torch.tensor]:
        # for any given item, which is an int,
        # return review and targets as torch tensor
        # item is the index of the item in concern
        review = self.reviews[item, :]
        target = self.targets[item]

        return {
            "review": torch.tensor(review, dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.float),
        }

In [14]:
def train(data_loader, model, optimizer, device):
    """
    This is the main training function that trains model for one epoch
    :param data_loader: this is the torch dataloader
    :param model: model (lstm model)
    :param optimizer: torch optimizer, e.g. adam, sgd, etc.
    :param device: this can be "cuda" or "cpu"
    """
    # set model to training mode
    model.train()

    # go through batches of data in data loader
    for data in data_loader:
        # fetch review and target from the dict
        reviews = data["review"]
        targets = data["target"]

        # move the data to device that we want to use
        reviews = reviews.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        # clear the gradients
        optimizer.zero_grad()

        # make predictions from the model
        predictions = model(reviews)

        # calculate the loss
        loss = nn.BCEWithLogitsLoss()(predictions, targets.view(-1, 1))

        # compute gradient of loss w.r.t.
        # all parameters of the model that are trainable
        loss.backward()

        # single optimization step
        optimizer.step()


def evaluate(data_loader, model, device):
    # initialize empty lists to store predictions and targets
    final_predictions = []
    final_targets = []

    # put the model in eval mode
    model.eval()

    # disable gradient calculation
    with torch.no_grad():
        for data in data_loader:
            reviews = data["review"]
            targets = data["target"]
            reviews = reviews.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            # make predictions
            predictions = model(reviews)

            # move predictions and targets to list
            # we need to move predictions and targets to cpu too
            predictions = predictions.cpu().numpy().tolist()
            targets = data["target"].cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(targets)

    # return final predictions and targets
    return final_predictions, final_targets


In [15]:
logging.basicConfig(level=logging.INFO)


def load_vectors(fname: str) -> Dict[str, np.ndarray]:
    fin = io.open(fname, "r", encoding="utf-8", newline="\n", errors="ignore")
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(" ")
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data


def create_embedding_matrix(word_index: Dict[str, int], embedding_dict: Dict[str, np.ndarray]) -> np.ndarray:
    """
    This function creates the embedding matrix.
    :param word_index: a dictionary with word:index_value
    :param embedding_dict: a dictionary with word:embedding_vector
    :return: a numpy array with embedding vectors for all known words
    """
    # initialize matrix with zeros
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    # loop over all the words with tqdm progress bar
    for word, i in tqdm(word_index.items(), desc="Creating Embedding Matrix"):
        # if word is found in pre-trained embeddings,
        # update the matrix. if the word is not found,
        # the vector is zeros!
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]
        if i % 10000 == 0:  # Log progress every 10,000 iterations
            logging.info(f'Processed {i} words')
    # return embedding matrix
    return embedding_matrix

In [17]:
def run(df: pd.DataFrame, fold: int):
    """
    Run training and validation for a given fold
    and dataset
    :param df: pandas dataframe with kfold column
    :param fold: current fold, int
    """
    # fetch training dataframe
    train_df = df[df.kfold != fold].reset_index(drop=True)
    # fetch validation dataframe
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    print("Fitting tokenizer")
    # we use tf.keras for tokenization
    # you can use your own tokenizer and then you can
    # get rid of tensorflow
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    # convert training data to sequences
    # for example : "bad movie" gets converted to
    # [24, 27] where 24 is the index for bad and 27 is the
    # index for movie
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)

    # similarly convert validation data to
    # sequences
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

    # zero pad the training sequences given the maximum length
    # this padding is done on left hand side
    # if sequence is > MAX_LEN, it is truncated on left hand side too
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(
        xtrain, maxlen=MAX_LEN
    )

    # zero pad the validation sequences
    xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest, maxlen=MAX_LEN)

    # initialize dataset class for training
    train_dataset = IMDBDataset(
        reviews=xtrain, targets=train_df.sentiment.values
    )

    # create torch dataloader for training
    # torch dataloader loads the data using dataset
    # class in batches specified by batch size
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=2
    )

    # initialize dataset class for validation
    valid_dataset = IMDBDataset(
        reviews=xtest, targets=valid_df.sentiment.values
    )

    # create torch dataloader for validation
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=2
    )

    valid_dataset = IMDBDataset(
        reviews=xtest, targets=valid_df.sentiment.values
    )

    # create torch dataloader for validation
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
    )
    print("Loading embeddings")

    # load embeddings as shown previously
    embedding_dict = load_vectors("/kaggle/input/fasttext-embedding/wiki-news-300d-1M.vec")
    embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict)
    # create torch device, since we use gpu, we are using cuda
    device = torch.device("cuda")
    # fetch our LSTM model
    model = LSTM(embedding_matrix)
    # send model to device
    model.to(device)

    # initialize Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    print("Training Model")
    # set best accuracy to zero
    best_accuracy = 0
    # set early stopping counter to zero
    early_stopping_counter = 0

    # train and validate for all epochs
    for epoch in range(EPOCHS):
        # train one epoch
        train(train_data_loader, model, optimizer, device)
        # validate
        outputs, targets = evaluate(valid_data_loader, model, device)
        # use threshold of 0.5
        # please note we are using linear layer and no sigmoid
        # you should do this 0.5 threshold after sigmoid
        outputs = np.array(outputs) >= 0.5
        # calculate accuracy
        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"FOLD:{fold}, Epoch: {epoch}, Accuracy Score = {accuracy}")
        # simple early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1
        if early_stopping_counter > 2:
            break

In [18]:
df = pd.read_csv("/kaggle/input/notebook630e1ab092/imdb_folds.csv")
# train for all folds
run(df, fold=0)
run(df, fold=1)
run(df, fold=2)
run(df, fold=3)
run(df, fold=4)

Fitting tokenizer
Loading embeddings


Creating Embedding Matrix: 100%|██████████| 124252/124252 [00:01<00:00, 67615.04it/s] 


Training Model


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 0, Accuracy Score = 0.8678


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 1, Accuracy Score = 0.8855


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 2, Accuracy Score = 0.89


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 3, Accuracy Score = 0.8937


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 4, Accuracy Score = 0.8942


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 5, Accuracy Score = 0.8969


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 6, Accuracy Score = 0.8971


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 7, Accuracy Score = 0.8965


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 8, Accuracy Score = 0.8918


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:0, Epoch: 9, Accuracy Score = 0.8799
Fitting tokenizer
Loading embeddings


Creating Embedding Matrix: 100%|██████████| 124252/124252 [00:01<00:00, 71274.48it/s] 


Training Model


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:1, Epoch: 0, Accuracy Score = 0.8673


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:1, Epoch: 1, Accuracy Score = 0.8841


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:1, Epoch: 2, Accuracy Score = 0.8894


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:1, Epoch: 3, Accuracy Score = 0.8907


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:1, Epoch: 4, Accuracy Score = 0.8914


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:1, Epoch: 5, Accuracy Score = 0.8872


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:1, Epoch: 6, Accuracy Score = 0.8912


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:1, Epoch: 7, Accuracy Score = 0.8911
Fitting tokenizer
Loading embeddings


Creating Embedding Matrix: 100%|██████████| 124252/124252 [00:01<00:00, 66992.06it/s]


Training Model


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:2, Epoch: 0, Accuracy Score = 0.8734


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:2, Epoch: 1, Accuracy Score = 0.8873


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:2, Epoch: 2, Accuracy Score = 0.8912


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:2, Epoch: 3, Accuracy Score = 0.8909


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:2, Epoch: 4, Accuracy Score = 0.8911


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:2, Epoch: 5, Accuracy Score = 0.889
Fitting tokenizer
Loading embeddings


Creating Embedding Matrix: 100%|██████████| 124252/124252 [00:01<00:00, 73743.72it/s] 


Training Model


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:3, Epoch: 0, Accuracy Score = 0.8743


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:3, Epoch: 1, Accuracy Score = 0.8867


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:3, Epoch: 2, Accuracy Score = 0.8941


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:3, Epoch: 3, Accuracy Score = 0.8932


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:3, Epoch: 4, Accuracy Score = 0.8923


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:3, Epoch: 5, Accuracy Score = 0.892
Fitting tokenizer
Loading embeddings


Creating Embedding Matrix: 100%|██████████| 124252/124252 [00:01<00:00, 74427.11it/s] 


Training Model


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:4, Epoch: 0, Accuracy Score = 0.8658


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:4, Epoch: 1, Accuracy Score = 0.8867


  self.pid = os.fork()
  self.pid = os.fork()


FOLD:4, Epoch: 2, Accuracy Score = 0.8932


  self.pid = os.fork()
  self.pid = os.fork()


KeyboardInterrupt: 