In [1]:
import warnings  
with warnings.catch_warnings():  
    warnings.filterwarnings("ignore",category=FutureWarning)
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Embedding
    from absl import logging
    import tensorflow as tf
    import matplotlib.pyplot as plt
    import numpy as np
    import os
    import pandas as pd
    import re
    import seaborn as sns
    from tensorflow.keras.preprocessing import sequence
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense, Embedding
    from tensorflow.keras.layers import LSTM
    from tensorflow.keras.datasets import imdb

In [2]:
def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

def download_and_load_datasets(force_download=False):
  dataset = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz", 
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
      extract=True)
  
  train_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                       "aclImdb", "train"))
  test_df = load_dataset(os.path.join(os.path.dirname(dataset), 
                                      "aclImdb", "test"))
  
  return train_df, test_df
logging.set_verbosity(logging.ERROR)

train_df, test_df = download_and_load_datasets()

In [3]:
data_x=list(train_df.get("sentence"))
data_y=list(train_df.get("polarity"))
datate_x=list(test_df.get("sentence"))
datate_y=list(test_df.get("polarity"))

In [4]:
def delp(string): 
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    for x in string.lower(): 
        if x in punctuations: 
            string = string.replace(x, " ") 
    return string

In [5]:
import string
pr_train=[]
pr_test=[]
for x in data_x:
    x.replace("<br /><br />","")
    x.replace(r"\n", " ")
    pr_train.append(delp(x))
for x in datate_x:
    x.replace("<br /><br />","")
    x.replace(r"\n", "")
    pr_test.append(delp(x))
sentence = pr_train + pr_test
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(sentence)
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_x = pad_sequences(tokenizer.texts_to_sequences(pr_train), maxlen=80)
test_x = pad_sequences(tokenizer.texts_to_sequences(pr_test), maxlen=80)

In [None]:
model = Sequential()
model.add(Embedding(20000, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())
model.fit(train_x, np.array(data_y),
          batch_size=64,
          epochs=8,
          validation_data=(test_x, datate_y))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         2560000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________
None
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 25000 samples, validate on 25000 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8

In [None]:
model.save("Model_bin.h5")

In [None]:
model.predict(pad_sequences(tokenizer.texts_to_sequences(["Today is sunny"]), maxlen=80))[0][0]

In [None]:
import pickle
with open("tokenizer_bin.pickle", "wb") as handle:
    pickle.dump(tokenizer,handle )