# NLP with BERT for Sentiment Analysis

### Importing the libraries

In [1]:
!pip3 install ktrain

Collecting ktrain
[?25l  Downloading https://files.pythonhosted.org/packages/99/67/31cab9d7c0e23333aebc28b082659c1528f9ab7e22d00e7237efe4fc14f6/ktrain-0.26.2.tar.gz (25.3MB)
[K     |████████████████████████████████| 25.3MB 129kB/s 
[?25hCollecting scikit-learn==0.23.2
[?25l  Downloading https://files.pythonhosted.org/packages/f4/cb/64623369f348e9bfb29ff898a57ac7c91ed4921f228e9726546614d63ccb/scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 49.8MB/s 
Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 52.4MB/s 
Collecting cchardet
[?25l  Downloading https://files.pythonhosted.org/packages/80/72/a4fba7559978de00cf44081c548c5d294bf00ac7dcda2db405d2baa8c67a/cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263kB)
[K     |██████████████████████████

In [11]:
import os.path
import numpy as np
import tensorflow as tf
import ktrain # ktrain is a wrapper for tensorflow.keras that lets us build and train some models in only a few lines of code
from ktrain import text

## Part 1: Data Preprocessing

### Loading the IMDB dataset

In [7]:
# Download the IMDB review archive from its URL via get_file; extract=True
# also asks Keras to unpack the tarball after the download.
dataset = tf.keras.utils.get_file(
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    fname="aclImdb_v1.tar.gz",
    extract=True,
)

# Take the directory holding the downloaded archive and point at the
# 'aclImdb' folder inside it, which is where the extracted reviews are
# expected to live.
IMDB_DATADIR = os.path.join(
    os.path.dirname(dataset),
    'aclImdb',
)

In [6]:
# First line: directory that holds all downloaded datasets.
# Second line: directory of the IMDB dataset itself.
print(os.path.dirname(dataset), IMDB_DATADIR, sep="\n")

/root/.keras/datasets
/root/.keras/datasets/aclImdb


### Creating the training and test sets

In [10]:
# texts_from_folder returns a 3-tuple (trn, val, preproc). The 'test' folder
# plays the role of the validation split here, so the tuple is unpacked
# straight into train/test variables plus the preprocessor.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    datadir=IMDB_DATADIR,
    classes=['pos', 'neg'],
    train_test_names=['train', 'test'],
    maxlen=500,  # upper bound on the sequence length kept per review
    preprocess_mode='bert',
)

detected encoding: utf-8
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


## Part 2: Building the BERT model

In [12]:
# Build the 'bert' text classifier from the training arrays and the
# preprocessor returned by texts_from_folder.
model = text.text_classifier(
    name='bert',
    preproc=preproc,
    train_data=(x_train, y_train),
)

Is Multi-Label? False
maxlen is 500
done.


## Part 3: Training the BERT model

In [15]:
# Wrap the BERT model together with its training and validation data in a
# ktrain learner, which is the object used to run training below.
learner = ktrain.get_learner(
    model=model,
    batch_size=6,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
)


### Choosing the batch size

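The sequence length corresponds to the `maxlen` used when creating the training and test sets. With `maxlen=500`, the closest entry below is a sequence length of 512, which is why `batch_size=6` is used: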

- sequence len=64, max_batch_size=64
- sequence len=128, max_batch_size=32
- sequence len=256, max_batch_size=16
- sequence len=320, max_batch_size=14
- sequence len=384, max_batch_size=12
- sequence len=512, max_batch_size=6

In [16]:
# Train for one epoch using the one-cycle policy with a maximum learning
# rate of 2e-5. Left as the cell's last expression so the returned History
# object is displayed.
learner.fit_onecycle(epochs=1, lr=2e-5)



begin training using onecycle policy with max lr of 2e-05...


<tensorflow.python.keras.callbacks.History at 0x7fe0b60c5a10>