In [32]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from pathlib import Path
import os
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/tylergehbauer/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [33]:
#tokenization for wordcloud, ect..
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
import re

In [34]:
#used for LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [3]:
#reads in the dataset
imdb_df = pd.read_csv("Data/IMDB Dataset.csv")

In [4]:
imdb_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
imdb_df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [6]:
#This loops creates a vader sentiment score for each review
sentiments = []#creates empty list

#Going to use VADER’s SentimentIntensityAnalyzer() to see how many 'Neutral (neu)' , 'Positive (pos)', and 'Negatvie(neg)'
# words there are for each review. 
for review in imdb_df['review']: #goes through each review in dataframe and applies polarity
    try:
        text = review #accessing each review in dataframe
        results = analyzer.polarity_scores(text) #this is how get compound, pos, neg, and nue polarity scores
        #during imports we set analyzer equal to SentimentIntensityAnalyzer() 
        #'results' uses this function to find polarity score for each article in bitcoin_news
        compound = results["compound"] #gets compound polarity score
        pos = results["pos"] #gets postive polarity score
        neu = results["neu"] #gets neutral polarity score
        neg = results["neg"] #gets negative polarity score

        sentiments.append({
            "text": text,
            "Compound": compound,
            "Positive": pos,
            "Negative": neg,
            "Neutral": neu,
        }) #appends a dictionary inside the list-(sentiments) so a dataframe can be created
        #this appends all polarity score we got above
    except AttributeError:
        pass
#rinse and repeat for each article    
imdb = pd.DataFrame(sentiments) #creates a dataframe using the list of dicts
imdb.head()

Unnamed: 0,text,Compound,Positive,Negative,Neutral
0,One of the other reviewers has mentioned that ...,-0.9951,0.048,0.203,0.748
1,A wonderful little production. <br /><br />The...,0.9641,0.172,0.053,0.776
2,I thought this was a wonderful way to spend ti...,0.9605,0.192,0.094,0.714
3,Basically there's a family where a little boy ...,-0.9213,0.065,0.138,0.797
4,"Petter Mattei's ""Love in the Time of Money"" is...",0.9744,0.147,0.052,0.801


In [7]:
imdb.tail(5)

Unnamed: 0,text,Compound,Positive,Negative,Neutral
49995,I thought this movie did a down right good job...,0.989,0.199,0.047,0.753
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",-0.6693,0.114,0.166,0.72
49997,I am a Catholic taught in parochial elementary...,-0.9851,0.108,0.208,0.683
49998,I'm going to have to disagree with the previou...,-0.7648,0.082,0.105,0.813
49999,No one expects the Star Trek movies to be high...,0.4329,0.141,0.135,0.723


In [8]:
# Instantiate the lemmatizer
wnl = WordNetLemmatizer() 

# Create a list of stopwords
stop = stopwords.words('english') # contains all stop words for english

In [9]:
# Expand the default stopwords list if necessary
#will append each word to the list 'stop' created in cell above.
stop.append("u")
stop.append("it'")
stop.append("'s")
stop.append("n't")
stop.append('…')
stop.append("\`")
stop.append('``')
stop.append('char')
stop.append('chars')
stop.append("''")
stop.append('’')
stop.append('arent')
stop.append('Mr.')
stop.append(',')
stop.append('br')
stop.append('film')
stop.append('movie')
stop.append('say')
stop.append('get')
stop.append('would')
stop.append('could')
stop.append('people')
stop.append('guy')
stop.append('put')
stop.append('given')
stop.append('go')
stop.append('one')
stop.append('even')
stop.append('also')
stop.append("'m")

stop = set(stop)

In [10]:
# Complete the tokenizer function
#function that creates the 'tokens' column
def tokenizer(text):
    """Tokenizes text."""
    
    # Create a list of the words
    words = word_tokenize(text) #tokenizes each word in text

    # Convert the words to lowercase
    words = list(filter(lambda w: w.lower(), words))
    
    # Remove the punctuation
    words = list(filter(lambda t: t not in punctuation, words))
    
    # Remove the stopwords
    words = list(filter(lambda t: t.lower() not in stop, words)) #uses the stop list we created
    
    # Lemmatize Words into root words
    #makes Ran and Running into same word for exmaple. 
    tokens = [wnl.lemmatize(word) for word in words]
    
    return tokens

In [11]:
# Create a new tokens column for each words in review
imdb["tokens"] = imdb.text.apply(tokenizer)
imdb.head()

Unnamed: 0,text,Compound,Positive,Negative,Neutral,tokens
0,One of the other reviewers has mentioned that ...,-0.9951,0.048,0.203,0.748,"[reviewer, mentioned, watching, 1, Oz, episode..."
1,A wonderful little production. <br /><br />The...,0.9641,0.172,0.053,0.776,"[wonderful, little, production, filming, techn..."
2,I thought this was a wonderful way to spend ti...,0.9605,0.192,0.094,0.714,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,-0.9213,0.065,0.138,0.797,"[Basically, family, little, boy, Jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",0.9744,0.147,0.052,0.801,"[Petter, Mattei, Love, Time, Money, visually, ..."


In [12]:
#looks at mean score for each column
imdb.describe()

Unnamed: 0,Compound,Positive,Negative,Neutral
count,50000.0,50000.0,50000.0,50000.0
mean,0.299203,0.140736,0.09463,0.764636
std,0.806077,0.066568,0.058763,0.069129
min,-0.9998,0.0,0.0,0.336
25%,-0.6901,0.094,0.052,0.722
50%,0.82015,0.131,0.087,0.768
75%,0.9705,0.177,0.13,0.812
max,0.9999,0.548,0.58,1.0


## LSTM Algoirthm

In [16]:
#Joins the original 'sentiiment' column to the updated dataframe (for algorithm)
machine_imdb = imdb.join(imdb_df['sentiment'])
machine_imdb.head()

Unnamed: 0,text,Compound,Positive,Negative,Neutral,tokens,sentiment
0,One of the other reviewers has mentioned that ...,-0.9951,0.048,0.203,0.748,"[reviewer, mentioned, watching, 1, Oz, episode...",positive
1,A wonderful little production. <br /><br />The...,0.9641,0.172,0.053,0.776,"[wonderful, little, production, filming, techn...",positive
2,I thought this was a wonderful way to spend ti...,0.9605,0.192,0.094,0.714,"[thought, wonderful, way, spend, time, hot, su...",positive
3,Basically there's a family where a little boy ...,-0.9213,0.065,0.138,0.797,"[Basically, family, little, boy, Jake, think, ...",negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",0.9744,0.147,0.052,0.801,"[Petter, Mattei, Love, Time, Money, visually, ...",positive


In [14]:
#this will change 'postive' and 'negative' into integer, necessary for LSTM 
sentiments_dict = {
    'positive': 1,
    'negative': 0
}

In [35]:
#this applies the dict just created to the dataframe
machine_imdb['sentiment'] = machine_imdb['sentiment'].apply(lambda x: sentiments_dict[x])
machine_imdb.head(5)

Unnamed: 0,text,Compound,Positive,Negative,Neutral,tokens,sentiment
0,One of the other reviewers has mentioned that ...,-0.9951,0.048,0.203,0.748,"[reviewer, mentioned, watching, 1, Oz, episode...",1
1,A wonderful little production. <br /><br />The...,0.9641,0.172,0.053,0.776,"[wonderful, little, production, filming, techn...",1
2,I thought this was a wonderful way to spend ti...,0.9605,0.192,0.094,0.714,"[thought, wonderful, way, spend, time, hot, su...",1
3,Basically there's a family where a little boy ...,-0.9213,0.065,0.138,0.797,"[Basically, family, little, boy, Jake, think, ...",0
4,"Petter Mattei's ""Love in the Time of Money"" is...",0.9744,0.147,0.052,0.801,"[Petter, Mattei, Love, Time, Money, visually, ...",1


In [36]:
y = machine_imdb['sentiment'].values #y equals this specific column (target)/ what you want to predict
X = machine_imdb['tokens'].values #x is all columnes without Outcome (features)/ what you will use to predict
#seperates the columns

In [37]:
#makes all tokens same size so can train model
tokenizer = Tokenizer(lower=True)
tokenizer.fit_on_texts(machine_imdb['tokens'])

In [21]:
#We replace the words in our sentences with whichever index they are included in the 'tokenizer'
X_seq = tokenizer.texts_to_sequences(X)
#transform the text data to numerical sequences and save the sequences in a variable called X_seq

In [22]:
# Set the pad size
max_words = 150 #max a (whole comment) segment will be less than 140 to normalize

# Pad the sequences using the pad_sequences() method
X_pad = pad_sequences(X_seq, maxlen=max_words, padding="post", truncating='post') #truncating='post'

pad_sequences: This function transforms a list (of length num_samples) of sequences (lists of integers) into a 2D Numpy array of shape (num_samples, num_timesteps)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, random_state=78)

Verify Shape

In [24]:
X_test.shape

(12500, 150)

In [25]:
X_train.shape

(37500, 150)

In [27]:
vocabulary_size = len(tokenizer.word_counts.keys()) + 1
embedding_size = 64

In [28]:
# Define the LSTM RNN model
model = Sequential()

# Layer 1
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))

# Layer 2
model.add(LSTM(units=128))

# Output layer
model.add(Dense(units=1, activation="sigmoid"))

2022-03-22 19:02:45.776659: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [29]:
model.compile(optimizer = 'adam', 
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

In [30]:
# Show the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 64)           10109568  
                                                                 
 lstm (LSTM)                 (None, 128)               98816     
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 10,208,513
Trainable params: 10,208,513
Non-trainable params: 0
_________________________________________________________________


In [31]:
model.fit(X_train, y_train, batch_size = 1000, epochs = 10, verbose = 1) #epochs 

Epoch 1/10


2022-03-22 19:03:22.903225: W tensorflow/core/framework/op_kernel.cc:1722] OP_REQUIRES failed at cast_op.cc:121 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node 'binary_crossentropy/Cast' defined at (most recent call last):
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/traitlets/config/application.py", line 846, in launch_instance
      app.start()
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 677, in start
      self.io_loop.start()
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/asyncio/base_events.py", line 541, in run_forever
      self._run_once()
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/asyncio/base_events.py", line 1786, in _run_once
      handle._run()
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/asyncio/events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 353, in dispatch_shell
      await result
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2902, in run_cell
      raw_cell, store_history, silent, shell_futures)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2947, in _run_cell
      return runner(coro)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3173, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3364, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/d3/thkbqmp55bsgp57w077cr09h0000gn/T/ipykernel_32112/3592142334.py", line 1, in <module>
      model.fit(X_train, y_train, batch_size = 1000, epochs = 10, verbose = 1) #epochs
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/engine/training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/engine/training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/engine/training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/engine/training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/engine/training.py", line 860, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/engine/training.py", line 919, in compute_loss
      y, y_pred, sample_weight, regularization_losses=self.losses)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/engine/compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/losses.py", line 141, in __call__
      losses = call_fn(y_true, y_pred)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/losses.py", line 245, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/tylergehbauer/opt/anaconda3/envs/pyvizenv/lib/python3.7/site-packages/keras/losses.py", line 1922, in binary_crossentropy
      y_true = tf.cast(y_true, y_pred.dtype)
Node: 'binary_crossentropy/Cast'
Cast string to float is not supported
	 [[{{node binary_crossentropy/Cast}}]] [Op:__inference_train_function_3152]