# dmarketing.ai

### Deep Learning

### Project: Build a review classifier based on the Amazon reviews dataset

<font color=red>__*Provide project's description here*__</font>

### Step 0: Load The Data 

In [None]:
import csv
import os

# Folder holding the dataset files; the path is relative, so it is resolved
# against the working directory the notebook is run from.
DATA_FOLDER_PATH = "./data"
# Csv files with the training and the test split, both inside DATA_FOLDER_PATH.
TRAIN_DATA_PATH = os.path.join(DATA_FOLDER_PATH, 'train.csv')
TEST_DATA_PATH = os.path.join(DATA_FOLDER_PATH, 'test.csv')

In [None]:
def count_samples(csv_file_path):
    '''Counts the data samples (csv records) stored in a csv file.

    The file is parsed with csv.reader instead of being counted line by line:
    a quoted field may contain line breaks, in which case a single sample
    spans several physical lines and a plain line count would be too high.
    Blank lines are not counted.

    Parameters:
    csv_file_path (str): file system path to a csv file with data samples.

    Returns:
    int: number of non-empty csv records in the file (0 for an empty file).
    '''
    # newline='' hands raw line endings to the csv module, as its documentation
    # requires for correct handling of newlines embedded in quoted fields.
    with open(csv_file_path, 'r', errors='ignore', newline='') as csv_file:
        return sum(1 for row in csv.reader(csv_file) if row)

In [None]:
# Per-split counts returned by count_samples() for the training and test csv files.
train_samples = count_samples(TRAIN_DATA_PATH)
test_samples = count_samples(TEST_DATA_PATH)

In [None]:
import csv, string

# Positions of the columns inside every csv row.
RATING_IDX = 0
TITLE_IDX = 1
REVIEW_IDX = 2

def flow_from_csv(path=None, col_idx=REVIEW_IDX):
    '''Produces generator that iterates through one column of a csv file containing data.

    Every non-empty row of the file is visited exactly once, in file order.
    The number of rows is taken from the file itself, so the generator works
    for any csv file (training or test split) and simply stops at end of file.

    Parameters:
    path (str): file system path to a csv file with data samples.
    col_idx (int): index of the column to extract (e.g. RATING_IDX, TITLE_IDX
        or REVIEW_IDX).

    Returns:
    generator: generator that yields, for each row, the text found in column
        col_idx, lower-cased and with all string.punctuation characters removed.
    '''
    # The translation table never changes, so build it once rather than per row.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # newline='' lets the csv module deal with line endings itself, which is
    # needed for quoted fields that contain line breaks.
    with open(path, 'r', errors='ignore', newline='') as csv_file:
        # Iterating over the reader ends cleanly at end of file. Calling
        # next(reader) by hand would leak StopIteration out of the generator,
        # which Python turns into a RuntimeError (PEP 479).
        for row in csv.reader(csv_file):
            if not row:
                # Blank line: there is no column to extract.
                continue

            yield row[col_idx].lower().translate(strip_punctuation)

### Step 1: Dataset Summary & Exploration

<font color=red>__*Feature descriptions*__</font>

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary size limit handed to the Keras Tokenizer as num_words.
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
# Lazily streams the cleaned review column of the training file; being a
# generator, it can be consumed only once.
review_gen = flow_from_csv(TRAIN_DATA_PATH, REVIEW_IDX)

# Fit the tokenizer on the streamed reviews (this consumes review_gen).
tokenizer.fit_on_texts(review_gen)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook

In [None]:
# Per-word occurrence counts taken from the tokenizer.
word_cnt = tokenizer.word_counts

# Extracting the most frequent words
most_freq_words = 30

# (word, count) pairs ordered by count, most frequent first.
sorted_items = sorted(word_cnt.items(), key=lambda item: item[-1], reverse=True)

most_freq_keys = [k for k, v in sorted_items[:most_freq_words]]
most_freq_values = [v for k, v in sorted_items[:most_freq_words]]

# Drawing bar chart of most frequent words.
# A dedicated figure/axes pair keeps a re-run of this cell from drawing into
# whichever figure happens to be current in the pyplot state machine.
fig, ax = plt.subplots()
ax.bar(most_freq_keys, most_freq_values)
# Report the number of bars actually drawn (the vocabulary may be smaller
# than most_freq_words).
ax.set_title(str(len(most_freq_keys)) + " most frequent words")
ax.set_xlabel("Word")
ax.set_ylabel("Occurrences")
# Rotate the word labels so they do not overlap; tick_params applies
# regardless of when the bars were added.
ax.tick_params(axis='x', labelrotation=-90)

### Step 2: Design and Validate a Model Architecture 

<font color=red>__*Neural Net architecture descriptions tested*__</font>

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def data_pipeline(path, maxlen, batch_size=1):
    '''Produces generator that will be used to train neural network.

    Rows are read from the csv file in order and grouped into batches of
    exactly batch_size samples. The generator stops at end of file; trailing
    rows that do not fill a complete batch are dropped, and blank lines are
    skipped. It makes a single pass over the file and is exhausted afterwards.

    Relies on the notebook-level `tokenizer` having been fitted beforehand.

    Parameters:
    path (str): file system path to a csv file with data samples.
    maxlen (int): length of every tokenized review, forwarded to pad_sequences.
    batch_size (int): number of samples per yielded batch; must be at least 1.

    Returns:
    generator: generator that returns tuple(reviews, ratings), where reviews
        is the padded matrix of token ids produced by pad_sequences and
        ratings is a uint8 binary class matrix with one column per rating 1..5.

    Raises:
    ValueError: if batch_size is smaller than 1, or if a rating cannot be
        parsed as an integer. Because this is a generator, the error surfaces
        when the generator is iterated, not when data_pipeline() is called.
    '''
    if batch_size < 1:
        # A batch size of zero would otherwise yield empty batches forever.
        raise ValueError("batch_size must be a positive integer")

    # The translation table never changes, so build it once rather than per row.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # newline='' lets the csv module deal with line endings itself, which is
    # needed for quoted fields that contain line breaks.
    with open(path, 'r', errors='ignore', newline='') as csv_file:
        reviews = []
        ratings = []

        # Iterating over the reader ends cleanly at end of file. Calling
        # next(reader) by hand would leak StopIteration out of the generator,
        # which Python turns into a RuntimeError (PEP 479).
        for row in csv.reader(csv_file):
            if not row:
                # Blank line: no sample to extract.
                continue

            # Extract ratings; int() copes with any integer string, unlike
            # ord(), which only accepts a single character.
            ratings.append(int(row[RATING_IDX]))

            # Extract and clear reviews
            reviews.append(row[REVIEW_IDX].lower().translate(strip_punctuation))

            if len(reviews) < batch_size:
                continue

            # Tokenize and pad sequences
            sequences = tokenizer.texts_to_sequences(reviews)
            sequences = pad_sequences(sequences, maxlen=maxlen)

            # Converting output to binary class matrix. Ratings start at 1,
            # so column 0 of the 6-class encoding is never used and is removed.
            one_hot = to_categorical(ratings, num_classes=6, dtype='uint8')
            one_hot = np.delete(one_hot, 0, axis=1)

            yield (sequences, one_hot)

            # Start collecting the next batch.
            reviews = []
            ratings = []

### Step 3: Test a Model on New Reviews

In [None]:
def test():
    '''Placeholder for testing the model on new reviews; not implemented yet.'''
    return None

### Step 4: Summary

<font color=red>__*Final conclusions*__</font>