In [20]:
import re, unicodedata, os, time, math
import numpy as np
import pandas as pd

import tensorflow as tf

import nltk

---

### Data pre-processing

---

In [3]:
# Load the labelled reviews. header=None tells pandas the first row is data, so
# the column labels are supplied through `names`. The file is decoded as
# ISO-8859-1 (Latin-1), which maps every byte to a character, so reading cannot
# fail with a decode error.
df = pd.read_csv(
    './data/E-Commerce Reviews Cleaned.csv',
    names=["Classification", "Review"],
    header=None,
    encoding="ISO-8859-1",
)



In [4]:
df.shape

(23486, 2)

In [5]:
df.head()

Unnamed: 0,Classification,Review
0,1,Absolutely wonderful - silky and sexy and comf...
1,1,Love this dress! it's sooo pretty. i happene...
2,0,I had such high hopes for this dress and reall...
3,1,"I love, love, love this jumpsuit. it's fun, fl..."
4,1,This shirt is very flattering to all due to th...


In [6]:
df.isnull().sum()

Classification      0
Review            845
dtype: int64

In [7]:
# Keep only the rows with no missing value (the null counts above show 845 in
# Review). dropna() returns a new frame, so `df` itself is left unchanged.
m_df = df.dropna()

In [8]:
m_df.isnull().sum()

Classification    0
Review            0
dtype: int64

In [25]:
m_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22641 entries, 0 to 23485
Data columns (total 2 columns):
Classification    22641 non-null int64
Review            22641 non-null object
dtypes: int64(1), object(1)
memory usage: 530.6+ KB


In [26]:
m_df.Classification.value_counts() #Overwhelming majority of reviews are classified as positive.

1    18540
0     4101
Name: Classification, dtype: int64

In [14]:
def unicode_to_ascii(s):
    """Return ``s`` with accents and other combining marks removed.

    The text is decomposed with Unicode NFD normalisation, which splits an
    accented letter into its base letter plus combining marks; every character
    in category ``Mn`` (nonspacing mark) is then dropped, so "é" becomes "e".

    NOTE(review): preprocess_sentence calls this helper, but no definition is
    visible in the notebook, so a fresh kernel would raise NameError. This is
    the usual accent-stripping implementation; confirm it matches the one the
    recorded outputs were produced with.
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')


def preprocess_sentence(s):
    """Normalise one review into a space-separated token string.

    Steps, in order:
      1. lower-case, trim, and strip accents;
      2. put a space on both sides of each of ``? . ! ; , : ( ) "`` so that
         punctuation becomes its own token ("hello," -> "hello ,");
      3. collapse runs of spaces *and double quotes* into one space;
      4. replace every run of characters other than ASCII letters and the
         punctuation above (digits, apostrophes, whitespace, ...) with one space;
      5. trim and wrap the result in ``<start>`` / ``<end>`` markers.

    Args:
        s: the raw review text (a ``str``).

    Returns:
        The cleaned text as ``'<start> ... <end>'``. An empty input yields
        ``'<start>  <end>'``.
    """
    s = unicode_to_ascii(s.lower().strip())

    # Separate punctuation from the neighbouring word so "hello," and "hello"
    # end up as the same word token.
    s = re.sub(r"([?.!;,:()\"])", r" \1 ", s)

    # NOTE(review): the character class contains `"` as well as the space, so
    # this step also deletes the double quotes that were just spaced out. Kept
    # as-is so the vocabulary and sequence lengths recorded in the notebook
    # stay valid; use r" +" here if quote tokens are actually wanted.
    s = re.sub(r'[" "]+', " ", s)

    # Anything that is not a letter or one of the kept punctuation marks
    # becomes a single space.
    s = re.sub(r"[^a-zA-Z?.!;:,()\"]+", " ", s)

    # strip() already removes both leading and trailing whitespace.
    s = s.strip()

    # Boundary markers give the model explicit start/end tokens; everything
    # after <end> in a padded sequence is zero padding.
    return '<start> ' + s + ' <end>'

def tokenize(lang):
    """Turn a collection of cleaned sentences into a padded integer matrix.

    Args:
        lang: the full list of preprocessed sentences (the whole dataset).

    Returns:
        A ``(tensor, tokenizer)`` pair: ``tensor`` holds one row of word ids
        per sentence, zero-padded on the right to the length of the longest
        sentence; ``tokenizer`` is the fitted Keras ``Tokenizer`` that maps
        words to those ids.
    """
    # filters='' disables Keras' own punctuation stripping, so the tokens
    # already separated during preprocessing are kept as they are.
    vocab = tf.keras.preprocessing.text.Tokenizer(filters='')

    # Build the word -> id vocabulary from the data set.
    vocab.fit_on_texts(lang)

    # Replace every word by its id, then pad the shorter rows with trailing
    # zeros ("post") so all rows have the same length.
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        vocab.texts_to_sequences(lang),
        padding="post",
    )

    return padded, vocab

def load_dataset(dataframe):
    """Build model inputs and targets from the reviews data frame.

    Args:
        dataframe: a frame with a text column ``"Review"`` and a label column
            ``"Classification"``.

    Returns:
        ``(input_tensor, target_tensor, input_tokenizer)`` where
        ``input_tensor`` is the padded word-id matrix from ``tokenize``,
        ``target_tensor`` is an int32 column vector of class indices (one row
        per review), and ``input_tokenizer`` is the fitted tokenizer.
    """
    # Clean every review, then convert the cleaned text to padded id rows.
    reviews = list(map(preprocess_sentence, dataframe["Review"]))
    input_tensor, input_tokenizer = tokenize(reviews)

    # return_inverse gives, for each label, its position among the sorted
    # distinct labels, i.e. an integer class index; the distinct values
    # themselves are not needed.
    labels = dataframe["Classification"].tolist()
    class_ids = np.unique(labels, return_inverse=True)[1].astype(np.int32)

    # Shape (n,) -> (n, 1): one target row per input row.
    target_tensor = class_ids[:, np.newaxis]

    return input_tensor, target_tensor, input_tokenizer

#Dtypes for tensors need to match.


In [15]:
# Run the whole preprocessing pipeline on the null-free frame: padded word-id
# matrix, integer label column, and the fitted tokenizer (reused at predict time).
input_tensor, target_tensor, input_tokenizer = load_dataset(m_df)

In [16]:
np.array(input_tensor) #words are numbers with our padded zeros.

array([[ 11, 256, 521, ...,   0,   0,   0],
       [ 11,  27,   9, ...,   0,   0,   0],
       [ 11,   3,  74, ...,   0,   0,   0],
       ...,
       [ 11,   9,  33, ...,   0,   0,   0],
       [ 11,   3,  79, ...,   0,   0,   0],
       [ 11,   9,  23, ...,   0,   0,   0]], dtype=int32)

In [17]:
np.array(target_tensor) # labels as int32 class indices, one row per review; the encoding is done inside load_dataset.

array([[1],
       [1],
       [0],
       ...,
       [0],
       [1],
       [1]], dtype=int32)

In [18]:
np.array(input_tensor).shape #largest review is 137 so all arrays are 137

(22641, 137)

In [19]:
np.array(target_tensor).shape # 22,641 labels, the same number of rows as input_tensor,
# so every classification label was encoded and lines up with a review.

(22641, 1)

In [21]:
# Keras building blocks, imported from the public `tensorflow.keras` namespace
# (the same Keras the tokenizer above comes from via `tf.keras`).
# `tensorflow.python.keras` is a private module path: it is not part of the
# supported API and is not guaranteed to exist or behave the same across
# TensorFlow releases.
# Sequential = a plain stack of layers, each feeding the next; it cannot express
# branching architectures such as residual networks.
from tensorflow.keras.models import Sequential

# Embedding maps each word id to a trainable vector; Flatten and Dense are used
# for the classifier head.
from tensorflow.keras.layers import Dense, Flatten, Embedding

EMBEDDING_DIM = 2  # how many numbers represent one word; larger values cost more compute
# Number of rows in the embedding table: one per known word, plus one because
# id 0 is used for padding rather than for a word.
VOCAB_SIZE = len(input_tokenizer.word_index) + 1
INPUT_LENGTH = input_tensor.shape[1]  # padded sequence length = second dimension of input_tensor


In [23]:
# Three-layer classifier, listed in the order the data flows through it.
model = Sequential([
    # Word id -> trainable vector of EMBEDDING_DIM numbers. The layer is named
    # so it is easy to find among the model's layers.
    Embedding(VOCAB_SIZE,
              EMBEDDING_DIM,
              input_length=INPUT_LENGTH,
              name='embedding'),
    # The embedding yields one vector per word position; Flatten lays them
    # side by side into a single vector of INPUT_LENGTH * EMBEDDING_DIM values
    # (it concatenates, it does not add them).
    Flatten(),
    # One sigmoid output: close to 0 means negative, close to 1 means positive.
    Dense(1, activation='sigmoid'),
])

# The model can be made more expressive by adding further Dense layers before
# the output, e.g. with more units and activation='relu'.
# TODO: look up word2vec.

In [None]:
# Training setup: Adam (a gradient-descent optimizer), binary cross-entropy
# because the target is a single 0/1 class, and accuracy reported while training.
model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=["accuracy"],
)

# epochs = number of full passes over the data set; verbose=1 prints progress.
model.fit(x=input_tensor, y=target_tensor, epochs=2, verbose=1)


In [None]:
#automatically take a sentence and tokenize it then feeds it into the system. then do predict("This sucks")

def predict(sentence):
    """Print the predicted sentiment of ``sentence`` and the model's confidence.

    The sentence goes through the same steps as the training data: it is
    cleaned with ``preprocess_sentence``, converted to word ids with the fitted
    ``input_tokenizer``, and padded to the width of ``input_tensor``. The
    trained ``model`` then returns the probability of the positive class.

    Args:
        sentence: raw review text, e.g. ``"This sucks"``.

    Returns:
        None. The result is printed.
    """
    cleaned = preprocess_sentence(sentence)

    # Words the tokenizer has never seen are dropped by texts_to_sequences.
    token_ids = input_tokenizer.texts_to_sequences([cleaned])

    # The model only accepts rows of exactly the training width. Shorter input
    # is zero-padded on the right, as in tokenize(); longer input is cut at the
    # end. Without truncation a long sentence would reach the model with the
    # wrong shape and fail.
    max_len = input_tensor.shape[1]
    model_input = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids,
        maxlen=max_len,
        padding="post",
        truncating="post",
    )

    # One row in, one sigmoid output out: the probability of class 1 (positive).
    positive_prob = model.predict(model_input)[0][0]

    if positive_prob < 0.5:
        print("'{0}' has a NEGATIVE sentiment with confidence {1}".format(sentence, 1-positive_prob))
    else:
        print("'{0}' has a POSITIVE sentiment with confidence {1}".format(sentence, positive_prob))