In [1]:
# Fetch the pre-trained fastText crawl vectors (zip archive, about 1.4 GB).
# -nc (no-clobber) skips the download when the archive is already present, so
# re-running this cell does not fetch a second copy saved as "*.zip.1".
# The archive is requested over https rather than plain http.
!wget -nc https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip

--2020-11-15 06:16:04--  http://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1523785255 (1.4G) [application/zip]
Saving to: ‘crawl-300d-2M.vec.zip’


2020-11-15 06:16:39 (42.3 MB/s) - ‘crawl-300d-2M.vec.zip’ saved [1523785255/1523785255]



In [2]:
import zipfile
# Unpack the downloaded fastText archive next to it so the training cell
# can read the extracted .vec file.
archive_path = '/content/crawl-300d-2M.vec.zip'
target_dir = '/content/'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path=target_dir)

In [3]:
# Copy the IMDB reviews CSV from Google Drive into the working directory,
# where the fold-creation cell reads it as "./IMDB Dataset.csv".
# NOTE(review): assumes Drive is already mounted at /content/drive -- no mount
# cell is visible in this notebook; confirm.
!cp /content/drive/My\ Drive/colab/Approaching_Any_ML_Problem/chapter11_text/input/IMDB\ Dataset.csv .

In [4]:
# Download NLTK's "punkt" tokenizer data.
# NOTE(review): nltk is not referenced by any other cell visible here; it is
# presumably needed by one of the copied helper modules -- confirm or remove.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# create_folds.py
# import pandas and model_selection module of scikit-learn
import pandas as pd
from sklearn import model_selection

if __name__ == "__main__":
    # Read the raw IMDB reviews; the code below relies on a "sentiment" column.
    df = pd.read_csv("./IMDB Dataset.csv")

    # Encode the label: "positive" -> 1, every other value -> 0.
    # Vectorised comparison instead of a Python-level apply over every row.
    df["sentiment"] = (df["sentiment"] == "positive").astype(int)

    # Placeholder fold id; every row gets its real fold number below.
    df["kfold"] = -1

    # Shuffle the rows. The fixed random_state makes the shuffle, and therefore
    # the fold assignment written to disk, identical on every run so that
    # results from the training cell are reproducible.
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Labels used for stratification.
    y = df["sentiment"].values

    # Stratified 5-fold split.
    kf = model_selection.StratifiedKFold(n_splits=5)

    # Mark each row with the fold in which it is part of the validation split;
    # the training indices of each split are not needed here.
    for fold, (_, valid_idx) in enumerate(kf.split(X=df, y=y)):
        df.loc[valid_idx, "kfold"] = fold

    # Save the dataframe, now including the kfold column, for the training cell.
    df.to_csv("./imdb_folds.csv", index=False)


In [8]:
# Copy the project's helper modules from Google Drive into the working
# directory so the training cell can import them (config, dataset, engine, lstm).
# NOTE(review): assumes Drive is already mounted at /content/drive -- confirm.
!cp /content/drive/My\ Drive/colab/Approaching_Any_ML_Problem/chapter11_text/config.py .
!cp /content/drive/My\ Drive/colab/Approaching_Any_ML_Problem/chapter11_text/dataset.py .
!cp /content/drive/My\ Drive/colab/Approaching_Any_ML_Problem/chapter11_text/engine.py .
!cp /content/drive/My\ Drive/colab/Approaching_Any_ML_Problem/chapter11_text/lstm.py .

In [10]:
# train.py
import io
import torch

import numpy as np
import pandas as pd

# yes, we use tensorflow
# but not for training the model !

import tensorflow as tf

from sklearn import metrics
import torch.utils.data.dataloader

import config as config
import dataset as dataset
import engine as engine
import lstm as lstm


def load_vectors(fname):
    """
    Load word vectors from a fastText ``.vec`` text file.

    The first line of the file is a header made of two integers. Every
    following non-blank line holds a word followed by its space-separated
    vector components.

    :param fname: path to the ``.vec`` file
    :return: a dictionary with word:list_of_floats
    :raises ValueError: if the header is not exactly two integers or a
        vector component cannot be parsed as a float
    """
    # adapted from: https://fasttext.cc/docs/en/english-vectors.html
    data = {}
    # the context manager closes the file even if parsing fails half way
    with io.open(
        fname,
        'r',
        encoding='utf-8',
        newline='\n',
        errors='ignore'
    ) as fin:
        # the header values are not needed; parsing them validates the
        # format and moves the cursor to the first vector line
        _n_words, _dim = map(int, fin.readline().split())

        for line in fin:
            stripped = line.rstrip()
            # skip blank lines (e.g. a trailing newline at the end of the
            # file) instead of storing an empty vector under the key ''
            if not stripped:
                continue
            tokens = stripped.split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))

    return data


def create_embedding_matrix(word_index, embedding_dict, embedding_dim=300):
    """
    This function creates embedding matrix.
    :param word_index: a dictionary with word:index_value
    :param embedding_dict: a dictionary with word:embedding_vector
    :param embedding_dim: length of every embedding vector; defaults to 300
    :return: a numpy array of shape (len(word_index) + 1, embedding_dim)
        with embedding vectors for all known words
    """
    # initialize matrix with zeros; one extra row is allocated so that index
    # values up to len(word_index) are valid rows
    # (presumably row 0 is left for the padding index -- confirm with tokenizer)
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    # loop over all the words
    for word, i in word_index.items():
        # if word is found in pre-trained embeddings,
        # update the matrix. if the word is not found,
        # the vector is zero!
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]

    # return embedding matrix
    return embedding_matrix


def run(df, fold):
    """
    Run training and validation for a given fold and dataset
    :param df: pandas dataframe with kfold, review and sentiment columns
    :param fold: current fold, int
    :return: best validation accuracy seen across the epochs of this fold
    """
    # fetch training dataframe
    train_df = df[df.kfold != fold].reset_index(drop=True)

    # fetch validation dataframe
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    print("Fitting tokenizer")
    # we use tf.keras for tokenization
    # you can use your own tokenizer and then you can get rid of tensorflow
    # note: the tokenizer is fitted on the reviews of the full dataframe,
    # i.e. training and validation rows together
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())

    # convert training data to sequences
    # for example: "bad movie" gets converted to
    # [24, 27] where 24 is the index for bad and 27 is the
    # index for movie
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)

    # similarly convert validation data to
    # sequences
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)

    # zero pad the training sequences given the maximum length
    # this padding is done on left hand side
    # if sequence is > MAX_LEN, it is truncated on left hand side too
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(
        xtrain, maxlen=config.MAX_LEN
    )

    # zero pad the validation sequences
    xtest = tf.keras.preprocessing.sequence.pad_sequences(
        xtest, maxlen=config.MAX_LEN
    )

    # initialize dataset class for training
    train_dataset = dataset.IMDBDataset(
        reviews=xtrain,
        targets=train_df.sentiment.values
    )

    # create torch dataloader for training
    # torch dataloader loads the data using dataset
    # class in batches specified by batch size
    train_data_loader = torch.utils.data.dataloader.DataLoader(
        dataset=train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        num_workers=2
    )

    # initialize dataset class for validation
    valid_dataset = dataset.IMDBDataset(
        reviews=xtest,
        targets=valid_df.sentiment.values
    )

    # create torch dataloader for validation
    valid_data_loader = torch.utils.data.dataloader.DataLoader(
        dataset=valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=1
    )

    print("Loading embeddings")
    # load embeddings as shown previously
    embedding_dict = load_vectors("./crawl-300d-2M.vec")
    embedding_matrix = create_embedding_matrix(
        tokenizer.word_index, embedding_dict
    )
    # the full word->vector dictionary is only needed to build the matrix;
    # drop the reference so its memory can be reclaimed before training
    del embedding_dict

    # create torch device: use the gpu when one is available and fall back
    # to the cpu otherwise instead of failing on cpu-only runtimes
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # fetch our LSTM model
    model = lstm.LSTM(embedding_matrix)

    # send model to device
    model.to(device)

    # initialize Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    print("Training Model")
    # set best accuracy to zero
    best_accuracy = 0
    # set early stopping counter to zero
    early_stopping_counter = 0
    # train and validate for all epochs
    for epoch in range(config.EPOCHS):
        # train one epoch
        engine.train(train_data_loader, model, optimizer, device)
        # validate
        outputs, targets = engine.evaluate(
            valid_data_loader, model, device
        )

        # use threshold of 0.5
        # please note we are using linear layer and no sigmoid
        # you should do this 0.5 threshold after sigmoid
        # NOTE(review): engine.evaluate is not visible here; if it returns
        # raw linear outputs, 0.5 is not the 50% probability cut-off -- confirm
        outputs = np.array(outputs) >= 0.5

        # calculate accuracy
        accuracy = metrics.accuracy_score(targets, outputs)
        print(
            f"FOLD:{fold}, Epoch: {epoch}, Accuracy Score = {accuracy}"
        )
        # simple early stopping: the counter is never reset, so training
        # stops after the third epoch in total that fails to beat the best
        # accuracy, not after three such epochs in a row
        if accuracy > best_accuracy:
            best_accuracy = accuracy
        else:
            early_stopping_counter += 1

        if early_stopping_counter > 2:
            break

    return best_accuracy


if __name__ == "__main__":

    # read the dataframe that already carries the kfold column
    df = pd.read_csv("./imdb_folds.csv")

    # train and validate on each of the five folds in order (0 through 4)
    for fold in range(5):
        run(df, fold=fold)


Fitting tokenizer
Loading embeddings
Training Model
FOLD:0, Epoch: 0, Accuracy Score = 0.8843
FOLD:0, Epoch: 1, Accuracy Score = 0.8919
FOLD:0, Epoch: 2, Accuracy Score = 0.9005
FOLD:0, Epoch: 3, Accuracy Score = 0.9008
FOLD:0, Epoch: 4, Accuracy Score = 0.8933
FOLD:0, Epoch: 5, Accuracy Score = 0.886
FOLD:0, Epoch: 6, Accuracy Score = 0.8876
Fitting tokenizer
Loading embeddings
Training Model
FOLD:1, Epoch: 0, Accuracy Score = 0.8831
FOLD:1, Epoch: 1, Accuracy Score = 0.8998
FOLD:1, Epoch: 2, Accuracy Score = 0.903
FOLD:1, Epoch: 3, Accuracy Score = 0.9058
FOLD:1, Epoch: 4, Accuracy Score = 0.9033
FOLD:1, Epoch: 5, Accuracy Score = 0.9023
FOLD:1, Epoch: 6, Accuracy Score = 0.9025
Fitting tokenizer
Loading embeddings
Training Model
FOLD:2, Epoch: 0, Accuracy Score = 0.8732
FOLD:2, Epoch: 1, Accuracy Score = 0.8922
FOLD:2, Epoch: 2, Accuracy Score = 0.8996
FOLD:2, Epoch: 3, Accuracy Score = 0.8976
FOLD:2, Epoch: 4, Accuracy Score = 0.9009
FOLD:2, Epoch: 5, Accuracy Score = 0.9002
FOLD:2