# dmarketing.ai

## Deep Learning

## Project: Build a reviews' classifier based on Amazon's reviews dataset

*__dmarketing.ai__* (*Digital Marketing AI*) is a deep learning project focused on building several neural network models for a wide variety of uses. <br /><br /> 
In this _Jupyter Notebook_ you will find a step-by-step build of a Recurrent Neural Network that performs review sentiment classification and decides whether a customer review is *'negative'*, *'neutral'* or *'positive'*. <br /> 
The dataset used to build the classifier was downloaded from [link](https://registry.opendata.aws/amazon-reviews/#usageexamples) and consists of the *train.csv* and *test.csv* files, which contain the training and testing data respectively.

## Step 1: Load & Explore Dataset

In [None]:
import os

# Locations of the dataset files; the folder is given relative to the current working directory.
DATA_FOLDER_PATH = "./data"
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(DATA_FOLDER_PATH, csv_name) for csv_name in ('train.csv', 'test.csv')
)

#### Counting the number of samples available in the csv_file.

I constructed a generator with which I will iterate through the CSV files, because their large size makes them impossible to load into RAM.

In [None]:
def count_samples(csv_file_path):
    '''Returns how many lines the given csv file contains.

            Every line of the file is counted as one sample. The file is streamed
            line by line, so it is never held in memory as a whole.

            Parameters:
            csv_file_path (str): file system path to a csv file with data samples.

            Returns:
            int: number of lines (samples) found in the file.
    '''
    # errors='ignore' drops undecodable bytes instead of aborting the count
    with open(csv_file_path, 'r', errors='ignore') as csv_file:
        return sum(1 for _ in csv_file)

In [None]:
# Number of lines in each dataset file; every call streams through its whole file once.
train_samples = count_samples(TRAIN_DATA_PATH)
test_samples = count_samples(TEST_DATA_PATH)

In [None]:
# Report the sizes of both datasets
print(f"Number of train samples : {train_samples}\nNumber of test samples : {test_samples}")

#### Constructing a function that allows iterating over a chosen column in the CSV file.
The data inside the CSV file is organized in three columns containing the following features:

- `'rating'` is an integer that represents the rating of the corresponding review.
- `'title'` is a string that represents the title of the corresponding review.
- `'review'` is a string that contains the text of the review. 

In [None]:
import csv, string

# Positions of the columns inside every csv row
RATING_IDX = 0
TITLE_IDX = 1
REVIEW_IDX = 2

def flow_from_csv(path=None, col_idx=REVIEW_IDX):
    '''Produces generator that iterates through one column of a csv file containing data.

            Every yielded value is lower-cased and stripped of the characters
            listed in string.punctuation. The generator ends when the file is
            exhausted, so it works for a csv file of any length.

            Parameters:
            path (str): file system path to a csv file with data samples (required).
            col_idx (int): index of the column to read, REVIEW_IDX by default.

            Returns:
            generator: generator that returns the cleaned text found in column col_idx of each row.
     '''
    # Build the punctuation-removal table once instead of once per row
    punctuation_table = str.maketrans('', '', string.punctuation)

    with open(path, 'r', errors='ignore') as csv_file:
        # Iterating the reader directly stops cleanly at the end of the file.
        # Calling next() on an exhausted reader inside a generator would
        # surface as RuntimeError instead.
        for row in csv.reader(csv_file):
            if not row:
                # csv.reader returns an empty list for a blank line
                continue
            yield row[col_idx].lower().translate(punctuation_table)

#### Creating a Tokenizer class object and fitting it on the reviews in the train dataset.

The Tokenizer object will then be used to create sequences out of the review strings; those sequences are later padded to a given length. <br/>
For a more detailed description visit the [keras.preprocessing.text.Tokenizer documentation](https://keras.io/preprocessing/text/)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary limit handed to the Tokenizer as num_words
max_words = 10_000

tokenizer = Tokenizer(num_words=max_words)

# Stream the cleaned review texts of the train file straight into the tokenizer
review_gen = flow_from_csv(path=TRAIN_DATA_PATH, col_idx=REVIEW_IDX)
tokenizer.fit_on_texts(review_gen)

#### Determining the most frequently occurring words based on the tokenizer.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Word -> occurrence count, as collected by the fitted tokenizer
word_cnt = tokenizer.word_counts

# Number of top-ranked words shown on the chart
most_freq_words = 30

# Rank every (word, count) pair from the most to the least frequent
sorted_items = sorted(word_cnt.items(), key=lambda pair: pair[-1], reverse=True)

most_freq_keys = [word for word, _ in sorted_items[:most_freq_words]]
most_freq_values = [count for _, count in sorted_items[:most_freq_words]]

# Bar chart of the top-ranked words
plt.figure(figsize=(10, 10))
plt.title(f"{most_freq_words} most frequent words")
plt.xlabel("Word")
plt.xticks(rotation=-90)
plt.ylabel("Occurance")
plt.bar(most_freq_keys, most_freq_values)

### Step 2: Design and Validate a Model Architecture 


#### Creating data pipeline.

Creating a data pipeline: a generator that returns *tuple(inputs, targets)* batches, which will be used to train the neural network model.

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Sentiment labels; the position of a label in this list is its class index
classes = ["negative", "neutral", "positive"]

# Maps a rating (as the string read from the csv file) to the index of its
# sentiment class in `classes`: 1-2 -> negative, 3 -> neutral, 4-5 -> positive
rating2class = {
    '1': classes.index("negative"),
    '2': classes.index("negative"),
    '3': classes.index("neutral"),
    '4': classes.index("positive"),
    '5': classes.index("positive"),
}

# Number of sentiment classes, i.e. the number of entries in `classes`
NUMBER_OF_CLASSES = len(classes)

In [None]:
def skip_rows(gen, num):
    '''Skips rows of csv file read by generator.

            Advances gen in place by num items and discards them. A num of
            zero or less skips nothing.

            Parameters:
            gen (generator): csv file reader generator.
            num (int): number of rows to be skipped.

            Raises:
            StopIteration: if gen runs out of rows before num rows were skipped.
    '''
    # range() bounds the loop; a `!=` counter comparison would never terminate
    # for a negative num and would drain the whole reader.
    for _ in range(num):
        next(gen)

In [None]:
def data_pipeline(path, maxlen, batch_size=1, start_idx=0):
    '''Produces an endless generator of batches used to train or evaluate the neural network.

            Rows are read from the csv file after skipping the first start_idx rows.
            When the end of the file is reached, the file is reopened and reading
            restarts after start_idx. A trailing group of rows that is too short to
            fill a whole batch is dropped.

            Parameters:
            path (str): file system path to a csv file with data samples.
            maxlen (int): sequence length, passed to pad_sequences as maxlen.
            batch_size (int): number of rows in every yielded batch.
            start_idx (int): number of leading rows skipped on every pass over the file.

            Returns:
            generator: generator that yields tuple(reviews, ratings, [None]), where
            reviews is the result of pad_sequences and ratings the result of to_categorical.

            Raises:
            ValueError: if the file holds fewer than start_idx rows, or no full batch after them.
    '''
    # Build the punctuation-removal table once instead of once per row
    punctuation_table = str.maketrans('', '', string.punctuation)

    # Every iteration of this loop is one full pass over the file
    while True:
        batches_in_pass = 0

        # The context manager closes the file at the end of each pass and also
        # when the generator is closed or garbage collected.
        with open(path, 'r', errors='ignore') as csv_file:
            reader = csv.reader(csv_file)

            try:
                skip_rows(reader, start_idx)
            except StopIteration:
                # A StopIteration escaping a generator body turns into RuntimeError,
                # so report the real cause instead.
                raise ValueError(
                    "'{}' contains fewer than start_idx={} rows".format(path, start_idx)) from None

            while True:
                reviews = []
                ratings = []

                for _ in range(batch_size):
                    row = next(reader, None)
                    if row is None:
                        # End of file reached in the middle of a batch
                        break

                    # Extract ratings
                    ratings.append(rating2class[row[RATING_IDX]])

                    # Extract and clear reviews
                    reviews.append(row[REVIEW_IDX].lower().translate(punctuation_table))

                if len(reviews) < batch_size:
                    # Incomplete tail: drop it and start the next pass
                    break

                # Tokenize and pad sequences
                reviews = tokenizer.texts_to_sequences(reviews)
                reviews = pad_sequences(reviews, maxlen=maxlen)

                # Converting class indices to binary class matrix
                ratings = to_categorical(ratings, num_classes=NUMBER_OF_CLASSES, dtype='uint8')

                # NOTE(review): the third element sits in the position Keras reads as
                # sample weights; kept exactly as before - presumably a workaround
                # for a specific TensorFlow version, confirm it is still needed.
                yield reviews, ratings, [None]
                batches_in_pass += 1

        if batches_in_pass == 0:
            # Without this guard the outer loop would reopen the file forever
            raise ValueError(
                "'{}' does not contain a full batch of {} rows after start_idx={}".format(
                    path, batch_size, start_idx))

#### Creating and training model.

Creating a Recurrent Neural Network model and training it.

In [None]:
# Maximal length of a sequence that can be fed to the neural network
max_len = 80

# Layers are stacked in order: token embedding -> recurrent layer -> class probabilities
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(max_words, 128, input_length=max_len))
model.add(tf.keras.layers.GRU(64, recurrent_dropout=0.2, dropout=0.2))
model.add(tf.keras.layers.Dense(NUMBER_OF_CLASSES, activation='softmax'))

# Targets are one-hot class matrices, hence the categorical cross-entropy loss
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
# Print the layer-by-layer overview of the compiled model
model.summary()

In [None]:
# Callback list with a single TensorBoard logger that writes into the
# 'best_sequential_model' directory; histograms and embeddings are requested
# with a frequency of 1.
tensorboard_cb = [
    tf.keras.callbacks.TensorBoard(
        log_dir='best_sequential_model', histogram_freq=1, embeddings_freq=1,
    ),
]

In [None]:
# Training batches are read from the beginning of the train file
data_gen = data_pipeline(TRAIN_DATA_PATH, maxlen=max_len, batch_size=256)

# Validation batches start after the first 2,000,000 rows of the same file.
# `**` is the power operator; `2 * 10^6` is parsed as (2 * 10) XOR 6 == 18,
# which made the validation rows overlap the training rows.
# NOTE(review): assumes train.csv holds more than 2,000,000 + 500 * 256 rows - confirm.
# NOTE(review): 10 epochs * 1000 steps * 256 rows = 2,560,000 training rows, so
# late epochs still reach into the validation range - confirm this is acceptable.
val_gen = data_pipeline(TRAIN_DATA_PATH, maxlen=max_len, batch_size=256, start_idx=2 * 10**6)

# NOTE(review): tensorboard_cb is defined in an earlier cell but is not passed
# here via callbacks= - confirm whether TensorBoard logging was intended.
history = model.fit(data_gen, steps_per_epoch=1000,
                    epochs=10, 
                    validation_data=val_gen,
                    validation_steps=500)

#### Plots of loss function and accuracy parameters with respect to epoch.

In [None]:
# Plot accuracy
# Per-epoch metrics recorded by model.fit
hist_dict = history.history
train_acc = hist_dict['acc']
val_acc = hist_dict['val_acc']

# One x value per recorded epoch. A hard-coded range of 40 epochs does not match
# the length of the recorded metrics, which makes plt.plot fail.
epochs = np.arange(1, len(train_acc) + 1)

plt.plot(epochs, train_acc, 'bo', label='Train accuracy')
plt.plot(epochs, val_acc, 'r-', label='Validation accuracy')
plt.grid()
plt.legend(loc='best')

In [None]:
# Plot loss
# Per-epoch loss values recorded by model.fit
train_loss = hist_dict['loss']
val_loss = hist_dict['val_loss']

# x axis derived from the recorded values, so it always matches their length
loss_epochs = np.arange(1, len(train_loss) + 1)

plt.plot(loss_epochs, train_loss, 'bo', label='Train loss')
plt.plot(loss_epochs, val_loss, 'r-', label='Validation loss')
plt.grid()
plt.legend(loc='best')

#### Saving the best Sequential RNN model obtained.

In [None]:
# Store the trained model in the file 'best_sequential_model.h5'
model.save('best_sequential_model.h5')

### Step 3: Test a Model on New Reviews

In [None]:
# Evaluate on the test file, not on the data the model was trained on.
# The batch size is chosen so that 1000 steps cover (almost) the whole test set.
test_gen = data_pipeline(TEST_DATA_PATH, maxlen=max_len, batch_size=test_samples // 1000)

model.evaluate(test_gen, steps=1000)

### Step 4: Summary

<font color=red>__*Final conclusions*__</font>