In [1]:
# Enable intellisense-style completion: switch on the `greedy` option of
# IPython's completer (IPCompleter) for this kernel session.
%config IPCompleter.greedy=True

In [2]:
# Import modules
import pandas as pd
import numpy as np
import bert
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import  Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tqdm import tqdm
import matplotlib.pyplot as plt

# Report the library versions in use and whether TensorFlow can see a GPU
print("TensorFlow Version:", tf.__version__)
print("Hub version: ", hub.__version__)
print(
    "GPU is",
    "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE",
)

# Pandas display settings: show up to 1000 characters per cell and up to 10000 rows
pd.options.display.max_colwidth = 1000
pd.set_option('display.max_rows', 10000)

TensorFlow Version: 2.2.0
Hub version:  0.9.0
GPU is available


In [3]:
# Import (local) helper scripts
# The helpers directory is not an installed package, so it is added to the
# module search path first. The path is relative, so it is resolved against
# the kernel's current working directory.
import sys
sys.path.append("../helpers")
import imdb_preprocess_functions as nist_imdb

In [4]:
# List the names exposed by the helper module (exploratory): shows the
# available helper functions and the column-name constants used below.
dir(nist_imdb)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'file_column',
 'get_fit_data',
 'get_imdb_df_data',
 'get_model_performance',
 'hub',
 'label_column',
 'np',
 'output_result',
 'pd',
 'probability_column',
 'text_column',
 'to_categorical']

In [5]:
# Load the Stanford IMDB data. The helper returns the training frame and the
# test frame, and prints a shape / missing-value / class-balance summary for each.
df_train, df_test = nist_imdb.get_imdb_df_data('../data/imdb_master.csv')

The number of rows and columns in the training dataset is: (25000, 5)
Missing values in train dataset:
Unnamed: 0    0
type          0
review        0
label         0
file          0
dtype: int64
Check train class balance
1.0    12500
0.0    12500
Name: label, dtype: int64
The number of rows and columns in the test dataset is: (25000, 5)
Missing values in test dataset:
Unnamed: 0    0
type          0
review        0
label         0
file          0
dtype: int64
Check test class balance
1.0    12500
0.0    12500
Name: label, dtype: int64


In [6]:
# Shuffle the dataset
# frac=1 samples every row (i.e. a full permutation of df_train) and
# random_state=0 fixes the permutation so the split below is reproducible.
# The index is not reset, so rows keep their original index labels.
df = df_train.sample(frac=1, random_state=0)
df.shape

(25000, 5)

In [7]:
# Number of shuffled rows assigned to the training subset; the rows after
# that (7500 of the 25000 here) form the testing subset.
SPLIT_TRAIN_SIZE = 17500

In [8]:
# Partition the shuffled frame by row position: the first SPLIT_TRAIN_SIZE
# rows become the training subset, every row after them the testing subset.
df_train_train = df.iloc[:SPLIT_TRAIN_SIZE]
df_train_test = df.iloc[SPLIT_TRAIN_SIZE:]

In [9]:
# Sanity check: the training subset should hold SPLIT_TRAIN_SIZE rows.
df_train_train.shape

(17500, 5)

In [10]:
# Sanity check: the testing subset should hold the remaining rows.
df_train_test.shape

(7500, 5)

In [11]:
# Check the target class balance of the training subset
# (counts of each label value after the shuffle and split).
df_train_train[nist_imdb.label_column].value_counts()

0.0    8786
1.0    8714
Name: label, dtype: int64

In [12]:
# Check the target class balance of the testing subset
# (counts of each label value after the shuffle and split).
df_train_test[nist_imdb.label_column].value_counts()

1.0    3786
0.0    3714
Name: label, dtype: int64

In [13]:
# Build file names/paths from the sizes of the training and testing subsets.
# The sizes are taken from the frames themselves rather than from a hard-coded
# total of 25000, so the names stay correct if the dataset or split changes.
file_train_train = 'imdb_train_split_' + str(len(df_train_train)) + '.csv'
file_train_test = 'imdb_train_split_' + str(len(df_train_test)) + '.csv'

# Both names share one prefix and differ only by the row count, so an even
# split would make the second write silently overwrite the first.
if file_train_train == file_train_test:
    raise ValueError(
        "Training and testing subsets have the same size; both would be "
        "written to " + file_train_train
    )

# Generate CSV files containing the processed data (text, label and source
# file columns only; the DataFrame index is not written).
export_columns = [nist_imdb.text_column, nist_imdb.label_column, nist_imdb.file_column]
df_train_train.to_csv(file_train_train, index=False, columns=export_columns)
df_train_test.to_csv(file_train_test, index=False, columns=export_columns)