# Preparing Data


## Importing Libraries

In [None]:
# Import the pandas library.
import pandas as pd
# Import the numpy library.
import numpy as np

# Import the string library.
import string
# Import the punctuation constant from the string module.
from string import punctuation
# Import the nltk library.
import nltk
# Import the stopwords.
from nltk.corpus import stopwords
# Download the stopwords
nltk.download("stopwords")

# Import the TensorFlow library.
import tensorflow as tf

# Import the Input, Dense, Dropout layers.
from tensorflow.keras.layers import Input, Dense, Dropout
# Import the Sequential model.
from tensorflow.keras.models import Sequential

# Import the sklearn library.
import sklearn
# Import the train_test_split model selection.
from sklearn.model_selection import train_test_split
# Import the CountVectorizer, TfidfTransformer, TfidfVectorizer.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# Import the joblib library.
import joblib

# Cleaning the Data


## Importing the Data

In [None]:
# Load the raw review records from "Reviews.csv" into a DataFrame.
data = pd.read_csv("Reviews.csv")

# Preview the first few rows to confirm the file was read as expected.
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Cleaning Up Data


In [None]:
# Remove the "UserId", "Id", and "Time" columns, then discard every row
# that still contains a missing value.
data = data.drop(columns=["UserId", "Id", "Time"]).dropna()

# Preview the first few rows of the trimmed data.
data.head()

Unnamed: 0,ProductId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text
0,B001E4KFG0,delmartian,1,1,5,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,B00813GRG4,dll pa,0,0,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,"Natalia Corres ""Natalia Corres""",1,1,4,"""Delight"" says it all",This is a confection that has been around a fe...
3,B000UA0QIQ,Karl,3,3,2,Cough Medicine,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,"Michael D. Bigham ""M. Wassir""",0,0,5,Great taffy,Great taffy at a great price. There was a wid...


## Adding a Polarity Column


In [None]:
# Translate a numeric score into a sentiment label:
# above 3 is "Positive", exactly 3 is "Neutral", anything else is "Negative".
def _score_to_polarity(score):
    if score > 3:
        return "Positive"
    if score == 3:
        return "Neutral"
    return "Negative"


# Store the label for every review in a new "Polarity_Rating" column.
data["Polarity_Rating"] = data["Score"].map(_score_to_polarity)

# Preview the first few rows, now including the label column.
data.head()

Unnamed: 0,ProductId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text,Polarity_Rating
0,B001E4KFG0,delmartian,1,1,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Positive
1,B00813GRG4,dll pa,0,0,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Negative
2,B000LQOCH0,"Natalia Corres ""Natalia Corres""",1,1,4,"""Delight"" says it all",This is a confection that has been around a fe...,Positive
3,B000UA0QIQ,Karl,3,3,2,Cough Medicine,If you are looking for the secret ingredient i...,Negative
4,B006K2ZZ7K,"Michael D. Bigham ""M. Wassir""",0,0,5,Great taffy,Great taffy at a great price. There was a wid...,Positive


## Sampling the Data


In [None]:
# Split the reviews into one DataFrame per sentiment label.
polarity = data["Polarity_Rating"]
data_positive = data[polarity.eq("Positive")]
data_negative = data[polarity.eq("Negative")]
data_neutral = data[polarity.eq("Neutral")]

# Report the (rows, columns) shape of each group: positive, negative, neutral.
for subset in (data_positive, data_negative, data_neutral):
    print(subset.shape)



(443756, 8)
(82007, 8)
(42638, 8)


In [None]:
# Draw a random sample of 8000 reviews from each sentiment group so that
# the three classes end up the same size.
data_positive = data_positive.sample(n=8000)
data_negative = data_negative.sample(n=8000)
data_neutral = data_neutral.sample(n=8000)

# Report the shape of each sampled group: positive, negative, neutral.
for sampled in (data_positive, data_negative, data_neutral):
    print(sampled.shape)



(8000, 8)
(8000, 8)
(8000, 8)


In [None]:
# Stack the positive, negative, and neutral samples into a single DataFrame.
sampled_groups = [data_positive, data_negative, data_neutral]
data = pd.concat(sampled_groups)

# Report the shape of the combined dataset.
print(data.shape)

(24000, 8)


## Cleaning Up the Review Text


In [None]:
# Create a function called text_cleanup that returns the text without the stopwords and punctuation.
def text_cleanup(text):
    """Return *text* with punctuation and English stop words removed.

    Every character found in ``string.punctuation`` is deleted first. The
    remaining text is split on whitespace, and each word whose lowercase
    form appears in NLTK's English stop-word list is dropped. The surviving
    words keep their original casing and are joined with single spaces.

    Because punctuation is stripped before the stop-word check, words are
    compared in their punctuation-free form.

    Args:
        text: The raw review text (a string).

    Returns:
        The cleaned text as a single space-separated string.
    """
    # A set gives constant-time membership checks for the filter below.
    stop_words = set(stopwords.words("english"))

    # Delete all punctuation characters in a single pass.
    no_punc = text.translate(str.maketrans("", "", string.punctuation))

    # word.lower() must be *called*: comparing the bound method itself
    # against the stop-word list never matches, so nothing was filtered.
    return " ".join(
        word for word in no_punc.split() if word.lower() not in stop_words
    )

In [None]:
# Run text_cleanup over every entry in "Text" and keep the result in a
# new "reviews" column.
data["reviews"] = data["Text"].map(text_cleanup)

# Preview the first few rows, now including the cleaned text.
data.head()

Unnamed: 0,ProductId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text,Polarity_Rating,reviews
498934,B000LKU3Y2,Grandma,0,0,5,School Milk Time,"Althought a bit expensive, this is a great alt...",Positive,Althought a bit expensive this is a great alte...
65087,B000EM8UFG,"John B. Goode ""JBG""",14,14,5,Excellent!,I drank Lipton's regular black tea for years b...,Positive,I drank Liptons regular black tea for years be...
431222,B000Q3CGBM,Jennifer,0,0,5,Great food,My dogs love it. I own two Yorkies. The kibble...,Positive,My dogs love it I own two Yorkies The kibble i...
403757,B003172KB6,H. Poirier,3,4,5,Tasty,Got this for my sister and had a bite. As some...,Positive,Got this for my sister and had a bite As some ...
398347,B0049PCCIA,Big Grown-Up Mommy,1,1,4,Nice Little Crusts,Our grocery store sells these for $5/package. ...,Positive,Our grocery store sells these for 5package The...


In [None]:
# Keep only the two columns needed from here on: the cleaned review text
# and its sentiment label.
needed_columns = ["reviews", "Polarity_Rating"]
data = data[needed_columns]

# Preview the first few rows of the reduced dataset.
data.head()

Unnamed: 0,reviews,Polarity_Rating
498934,Althought a bit expensive this is a great alte...,Positive
65087,I drank Liptons regular black tea for years be...,Positive
431222,My dogs love it I own two Yorkies The kibble i...,Positive
403757,Got this for my sister and had a bite As some ...,Positive
398347,Our grocery store sells these for 5package The...,Positive


## One-Hot Encoding


In [None]:
# One-hot encode the sentiment labels with pandas' get_dummies, producing
# one indicator column per distinct label.
polarity_labels = data["Polarity_Rating"]
one_hot = pd.get_dummies(polarity_labels)

# Preview the first few rows of the encoded labels.
one_hot.head()

Unnamed: 0,Negative,Neutral,Positive
498934,False,False,True
65087,False,False,True
431222,False,False,True
403757,False,False,True
398347,False,False,True


In [None]:
# Attach the one-hot indicator columns to the right of the existing data.
data = pd.concat([data, one_hot], axis="columns")

# Preview the first few rows of the widened dataset.
data.head()

Unnamed: 0,reviews,Polarity_Rating,Negative,Neutral,Positive
498934,Althought a bit expensive this is a great alte...,Positive,False,False,True
65087,I drank Liptons regular black tea for years be...,Positive,False,False,True
431222,My dogs love it I own two Yorkies The kibble i...,Positive,False,False,True
403757,Got this for my sister and had a bite As some ...,Positive,False,False,True
398347,Our grocery store sells these for 5package The...,Positive,False,False,True


In [None]:
# The text label is now redundant with the one-hot columns, so remove
# the "Polarity_Rating" column.
data = data.drop(columns="Polarity_Rating")

# Preview the first few rows of the final dataset.
data.head()

Unnamed: 0,reviews,Negative,Neutral,Positive
498934,Althought a bit expensive this is a great alte...,False,False,True
65087,I drank Liptons regular black tea for years be...,False,False,True
431222,My dogs love it I own two Yorkies The kibble i...,False,False,True
403757,Got this for my sister and had a bite As some ...,False,False,True
398347,Our grocery store sells these for 5package The...,False,False,True


## Train and Test Split


In [None]:
# Features: the underlying values of the "reviews" column.
x_rev = data["reviews"].values

# Targets: everything that is left once the "reviews" column is removed.
y_pol = data.drop(columns="reviews")

# Shuffle the rows and hold out 30% of them as the test set.
split_parts = train_test_split(x_rev, y_pol, test_size=0.3, shuffle=True)
x_rev_train, x_rev_test, y_pol_train, y_pol_test = split_parts

# Vectorizing


## Fit Stage


In [None]:
# Create a count vectorizer object whose vocabulary is limited to at most
# 15,000 features.
vect = CountVectorizer(max_features=15000)

# Learn the vocabulary from the training reviews only, so that no
# information from the held-out test reviews leaks into the features.
vect.fit(x_rev_train)

# Get the vectorized vocabulary.
vocab = vect.vocabulary_

# Print out the vocab that has been saved to a variable.
print(vocab)



In [None]:
# Persist the fitted vocabulary to "vocab.pkl" so it can be reloaded later.
joblib.dump(value=vocab, filename="vocab.pkl")

['vocab.pkl']

## Transform


In [None]:
# Convert the training and test reviews into count vectors using the
# fitted vectorizer (training split first, then test split).
x_rev_train_v, x_rev_test_v = (
    vect.transform(split) for split in (x_rev_train, x_rev_test)
)

In [None]:
# Convert the vectorized training reviews to an array.
x_rev_train_v = x_rev_train_v.toarray()

# Convert the vectorized test reviews to an array.
x_rev_test_v = x_rev_test_v.toarray()

In [None]:
# Report the shape of the training array, then of the test array.
for vectorized in (x_rev_train_v, x_rev_test_v):
    print(vectorized.shape)

(16800, 15000)
(7200, 15000)


# Creating the Network


## Create the Model

In [None]:
# Start with an empty Sequential model; layers are added one at a time below.
model = Sequential()

## Input Layer


In [None]:
# Add the first Dense layer: 4000 units with a "relu" activation.
model.add(Dense(4000, activation="relu"))

## Dropout Layers


In [None]:
# Add a dropout layer with a rate of 0.7.
# NOTE(review): the other dropout layers in this network use a rate of 0.5 — confirm 0.7 is intended here.
model.add(Dropout(rate=0.7))

## Calculation Layers


In [None]:
# Add three hidden stages of decreasing width (2000, 500, 250 units).
# Each stage is a "relu" Dense layer followed by dropout at a rate of 0.5.
for hidden_units in (2000, 500, 250):
    model.add(Dense(units=hidden_units, activation="relu"))
    model.add(Dropout(rate=0.5))

## Output Layer


In [None]:
# Finish with the output layer: 3 units with a "softmax" activation.
model.add(Dense(units=3, activation="softmax"))

## Compiling the Network


In [None]:
# Configure training: Adam optimizer with a learning rate of 0.001,
# categorical cross-entropy as the loss, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

## Fit Data to the Network


In [None]:
# Train the network on the vectorized training reviews for 10 epochs in
# batches of 256, using the test split as validation data.
# shuffle=True is passed explicitly so the training data is shuffled and the
# network doesn't rely on a pattern in the row order to learn.
model.fit(
    x=x_rev_train_v,
    y=y_pol_train,
    batch_size=256,
    epochs=10,
    shuffle=True,
    validation_data=(x_rev_test_v, y_pol_test),
)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1edba988940>

## Evaluating the Network


In [None]:
# Evaluate the trained model on the test split; the returned scores hold
# the loss followed by the accuracy metric.
scores = model.evaluate(x=x_rev_test_v, y=y_pol_test, verbose=1)

# Report the accuracy, which is the second entry of the scores.
test_accuracy = scores[1]
print("Test Accuracy: ", test_accuracy)

Test Accuracy:  0.7124999761581421


## Save the Model


In [None]:
# Write the trained model to disk as "sentiments.h5".
model.save(filepath="sentiments.h5")