<a href="https://colab.research.google.com/github/ugurklc/Deep_Learning/blob/Master/Natural_Language_Processing/IMDB_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [11]:
# Load IMDB dataset

imdb_sentences = []

train_data = tfds.as_numpy(tfds.load("imdb_reviews", split='train'))

for item in train_data:
  imdb_sentences.append(str(item['text']))

In [13]:
imdb_sentences[0]

'b"This was an absolutely terrible movie. Don\'t be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie\'s ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor\'s like Christopher Walken\'s good name. I could barely sit through it."'

In [16]:
# Create a tokenizer and create a set of sequences

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)

In [21]:
print(list(tokenizer.word_index.items())[:20])

[('the', 1), ('and', 2), ('a', 3), ('of', 4), ('to', 5), ('is', 6), ('br', 7), ('in', 8), ('it', 9), ('i', 10), ('this', 11), ('that', 12), ('was', 13), ('as', 14), ('for', 15), ('with', 16), ('movie', 17), ('but', 18), ('film', 19), ("'s", 20)]


In [23]:
# 1. Remove HTML tags
# 2. Remove stopwords
# 3. Strip punctuation

from bs4 import BeautifulSoup
import string

stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
             "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
             "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having",
             "he", "hed", "hes", "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how",
             "hows", "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself",
             "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
             "our", "ours", "ourselves", "out", "over", "own", "same", "she", "shed", "shell", "shes", "should",
             "so", "some", "such", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then",
             "there", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through",
             "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were", "weve", "were",
             "what", "whats", "when", "whens", "where", "wheres", "which", "while", "who", "whos", "whom", "why",
             "whys", "with", "would", "you", "youd", "youll", "youre", "youve", "your", "yours", "yourself",
             "yourselves"]

table = str.maketrans('', '', string.punctuation)

imdb_sentences = []

for item in train_data:
    sentence = str(item['text'].decode('UTF-8').lower())
    sentence = sentence.replace(",", " , ")
    sentence = sentence.replace(".", " . ")
    sentence = sentence.replace("-", " - ")
    sentence = sentence.replace("/", " / ")
    soup = BeautifulSoup(sentence)
    sentence = soup.get_text()
    words = sentence.split()
    filtered_sentence = ""
    for word in words:
        word = word.translate(table)
        if word not in stopwords:
            filtered_sentence = filtered_sentence + word + " "
    imdb_sentences.append(filtered_sentence)

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=25000)
tokenizer.fit_on_texts(imdb_sentences)
sequences = tokenizer.texts_to_sequences(imdb_sentences)


['movie', 'film', 'not', 'one', 'like', 'just', 'good', 'no', 'time', 'even', 'story', 'really', 'see', 'can', 'much', 'bad', 'get', 'will', 'also', 'people']


In [25]:
print(list(tokenizer.word_index.items())[:20])

[('movie', 1), ('film', 2), ('not', 3), ('one', 4), ('like', 5), ('just', 6), ('good', 7), ('no', 8), ('time', 9), ('even', 10), ('story', 11), ('really', 12), ('see', 13), ('can', 14), ('much', 15), ('bad', 16), ('get', 17), ('will', 18), ('also', 19), ('people', 20)]


In [26]:
# See how the tokenizer worked on simple examples

examples = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?'
]

In [27]:
example_sequences = tokenizer.texts_to_sequences(examples)
example_sequences

[[516, 5229, 147], [516, 6489, 147], [5229, 516]]

In [28]:
# See the original sentences but the stopwords removed

tokenizer.sequences_to_texts(example_sequences)

['today sunny day', 'today rainy day', 'sunny today']

* See that "is", "a" and "it" are removed from the sentences.