# Use simple feature engineering techniques and multi-class classification to predict review ratings based on the Amazon Reviews dataset

<b>Objective for this exercise:</b>
    * Establish NLP prediction accuracy baseline using simple ML models
    * Explore different permutations of feature engineering techniques, data, and classification algorithms
    * Compare accuracy of prediction using the following information:
        * Product Title
        * Review Headline
        * Review Body
    * (If time allows) see if using only helpful reviews to train improves our accuracy for our predictions - this reduces our 110k dataset to 35k


<b>Feature Engineering Techniques:</b>
    * bag of words
    * TF-IDF
    * Topic Modeling
    
    
<b>Classification:</b>
    * Logistic Regression Classification
    * K-nearest Neighbors Classification
    * Radius Neighbors Classification - the documentation suggests that Radius Neighbors might be a better fit if our data is not uniform. From our exploratory data analysis, we see that most reviews skew towards 4 or 5 stars
    
    
    
<b>Data:</b>

Data used in this notebook came from the Amazon reviews dataset - Wireless category. First it was converted from tsv to csv. Then it was pre-processed in the previous notebook using various text processing techniques. For details, please see: [amazon_review_preprocessing.ipynb](amazon_review_preprocessing.ipynb)


Example of how to do this:
```
python preprocess_amazon.py -l INFO -r -o dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-smallout.csv dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-smallin.csv
```


<b>Memory Requirement:</b>

| File | Python Memory |
|------|---------------|
| amazon_reviews_us_Wireless_v1_00-tinyout.csv | 20 - 26 GB |

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from datetime import timedelta, datetime
import time


%matplotlib inline

In [2]:
# set global variables

# Running these models on a laptop takes so long that they do not finish, so start
# with a really small file just to validate the code
#
# datafile was generated from amazon_review_preprocessing.ipynb - this file has 1k reviews randomly chosen
# from original file
# NOTE(review): the recorded output of the load cell shows 22414 rows, not 1k - confirm which sample this is
KEEP_COLUMNS = ["product_title", "helpful_votes", "review_headline", "review_body", "star_rating"]
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
DATE_FORMAT = '%Y-%m-%d'
OUTCOME_COLUMN = "star_rating"


# Configuration
DATA_FILE = "dataset/amazon_reviews/amazon_reviews_us_Wireless_v1_00-tinyout.csv"
# n_neighbors values to try with KNeighborsClassifier (one model run per value)
NEIGHBORS = [5] # default
# NEIGHBORS = [1, 3, 5, 7, 9, 11]

# Radius values to try with RadiusNeighborsClassifier (one model run per value)
RADII = [5.0] # this is the lowest number I tried that was able to find a neighbor
# RADII = [5.0, 7.0, 9.0, 11.0, 13.0]

# logistic regression settings - the model cells loop over these values
C= [1.0] # default
# C = [0.2, 0.4, 0.6, 0.8, 1.0]


# text column used to build the feature vectors
FEATURE_COLUMN = "review_body"
# switches: a model cell only runs when both its model flag and its feature-type flag are True
ENABLE_KNN = True
ENABLE_RN = True
ENABLE_LR = True
ENABLE_BOW = True
ENABLE_TFIDF = True
# when True the collected results are written to a CSV whose name ends with OUTFILE
WRITE_TO_CSV = True
OUTFILE = "amazon_review_classifier_simple.csv"



In [3]:
# read in DF
# usecols keeps unused columns from ever being parsed; the trailing [KEEP_COLUMNS]
# puts the columns in the configured order (read_csv returns them in file order)
df = pd.read_csv(DATA_FILE, usecols=KEEP_COLUMNS)[KEEP_COLUMNS]
# info() prints its summary itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22414 entries, 0 to 22413
Data columns (total 5 columns):
product_title      22414 non-null object
helpful_votes      22414 non-null int64
review_headline    22414 non-null object
review_body        22414 non-null object
star_rating        22414 non-null int64
dtypes: int64(2), object(3)
memory usage: 875.6+ KB
None


Unnamed: 0,product_title,helpful_votes,review_headline,review_body,star_rating
0,tfy universal car headrest mount holder portab...,0,good enough,serves purpose loud whoever sitting seat attached,3
1,iccker art nylon hair paint brush tools set bl...,0,five stars,works really well samsung s6 otterbox defender...,5
2,jbl gx series coaxial car loudspeakers certifi...,1,speakers did not sound well thought,speakers did not sound well thought would jbls...,2
3,otium screen protectors,0,really easy install included guide,absoultely perfect included install guide make...,5
4,apple watch stand vtin aluminum alloy build ho...,0,love,heres lot like stand apple watch very modern s...,5


### <font color="red">Should I include these along with the word vectors as part of the feature set?</font>

In [4]:
# let's get some data on our text

def wc(x:str):
    """
    Count the whitespace-separated words in x

    x: value to count words in; it is converted with str() first
    ------
    return: number of words
    """
    words = str(x).split()
    return len(words)

# add one word-count column per text column, then summarize the numeric columns
for wc_column, text_column in (
    ("pt_wc", "product_title"),
    ("rh_wc", "review_headline"),
    ("rb_wc", "review_body"),
):
    df[wc_column] = df[text_column].apply(wc)
df.describe()

Unnamed: 0,helpful_votes,star_rating,pt_wc,rh_wc,rb_wc
count,22414.0,22414.0,22414.0,22414.0,22414.0
mean,0.904792,3.895333,15.907335,2.956322,25.9797
std,8.709008,1.465474,9.716846,1.915684,41.441713
min,0.0,1.0,1.0,1.0,1.0
25%,0.0,3.0,9.0,2.0,8.0
50%,0.0,5.0,14.0,2.0,15.0
75%,0.0,5.0,20.0,4.0,28.0
max,868.0,5.0,92.0,21.0,1133.0


In [5]:
# Set up the outcome and feature series for training

# outcome - read the configured column instead of repeating its name here
Y = df[OUTCOME_COLUMN]
# features - the text column selected in the configuration
X = df[FEATURE_COLUMN]

### Define a function to help us run models

In [52]:
# expand classification report into dictionary
# classification report is a 2 level dictionary. From the documentation, it looks something like this
# {'label 1': {'precision':0.5,
#              'recall':1.0,
#              'f1-score':0.67,
#              'support':1},
#  'label 2': { ... },
#   ...
# }
def add_dict_to_dict(target, source):
    """
    Flatten source into target, one level deep

    A value that is itself a dictionary is expanded into "<key>_<subkey>"
    entries; any other value is copied under its own key. target is
    modified in place.

    target: dictionary to add to
    source: dictionary to add from
    ------
    return: target, with the entries from source added
    """
    for name, entry in source.items():
        if not isinstance(entry, dict):
            target[name] = entry
            continue
        # nested dictionary: prefix every inner key with the outer key
        target.update({f'{name}_{inner}': inner_value for inner, inner_value in entry.items()})
    return target
    

In [None]:
from sklearn.metrics import f1_score, precision_recall_fscore_support, classification_report, accuracy_score

def interpret_predictions(Y_test, Y_predict, report=None):
    """
    Run metrics on predictions

    The classification report is computed as a dictionary and flattened into
    report with add_dict_to_dict, so nested entries end up under
    "<label>_<metric>" keys.

    Y_test: true results
    Y_predict: predictions from model
    report: optional dictionary to add the metrics to (modified in place);
            a new dictionary is created when it is omitted
    ------
    return dictionary with report
    """
    # compare against None so that an empty dictionary supplied by the caller
    # is filled in rather than silently replaced by a new one
    if report is None:
        report = {}

    c_report = classification_report(Y_test, Y_predict, output_dict=True)

    return add_dict_to_dict(report, c_report)


In [59]:

def model_fit_predict(model, X_train, Y_train, X_test, Y_test):
    """
    Fit the model then run predict on it

    model: model to train with; must provide fit(X, y) and predict(X)
    X_train: training input (2-dimensional, read through .shape)
    Y_train: training classes
    X_test: test input (2-dimensional, read through .shape)
    Y_test: true classes for the test input
    -----
    return tuple of (predictions, report). report is a dictionary holding the
    train/test shapes, the train / score / predict / total times in whole
    minutes, and the metrics added by interpret_predictions
    """

    train_time_start = datetime.now()
    print(f'Start training: {train_time_start.strftime(TIME_FORMAT)}')
    # Train on the data handed to this function. Fitting on the notebook-level
    # bag-of-words split here would make every caller train on that data,
    # whatever features it passed in.
    model.fit(X_train, Y_train)

    train_time_end = datetime.now()
    print(f'End training: {train_time_end.strftime(TIME_FORMAT)}')

    # No separate scoring step runs at the moment; the timestamp is still taken
    # so that "score_time_min" keeps appearing in the report.
    score_time_end = datetime.now()
    print(f'End Scoring: {score_time_end.strftime(TIME_FORMAT)}')

    # predictions
    Y_predict = model.predict(X_test)
    predict_time_end = datetime.now()
    print(f'End predict: {predict_time_end.strftime(TIME_FORMAT)}')

    # calculate times, rounded to whole minutes
    train_time = train_time_end - train_time_start
    train_time_min = round(train_time.total_seconds() / 60)
    print(f'Training time (min): {train_time_min}')

    score_time = score_time_end - train_time_end
    score_time_min = round(score_time.total_seconds() / 60)
    print(f'Scoring time (min): {score_time_min}')

    predict_time = predict_time_end - score_time_end
    predict_time_min = round(predict_time.total_seconds() / 60)
    print(f'Predict time (min): {predict_time_min}')

    train_examples, train_features = X_train.shape
    test_examples, test_features = X_test.shape

    report = {
        "train_examples": train_examples,
        "train_features": train_features,
        "test_examples": test_examples,
        "test_features": test_features,
        "train_time_min": train_time_min,
        "score_time_min": score_time_min,
        "predict_time_min": predict_time_min,
        "total_time_min": train_time_min + score_time_min + predict_time_min,
    }

    report = interpret_predictions(Y_test, Y_predict, report)

    return (Y_predict, report)


# Bag of Words - Generate Feature Vectors

In [8]:
# TODO: try different parameters for CountVectorizers?
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(X.array)
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever one the installed version provides
if hasattr(cv, "get_feature_names_out"):
    vocab = cv.get_feature_names_out()
else:
    vocab = cv.get_feature_names()
# print(f"vocab: {vocab}")
# one row per review, one column per vocabulary term
bag_pd = pd.DataFrame(cv_matrix.toarray(), columns=vocab)

In [9]:
# explore the data
# vocabulary size, i.e. the number of feature columns in bag_pd
print(len(vocab))
bag_pd.head()

20541


Unnamed: 0,00,000,0000,000hz,000hzsensitivity,000mah,001,002,003,004,...,zoomed,zooming,zooms,zperia,zr,zte,zumo,zune,zuzo,zx4
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# split results into training and test set
# no test_size is passed, so train_test_split's default proportion applies;
# the fixed random_state keeps the split the same from run to run
bag_X_train, bag_X_test, bag_Y_train, bag_Y_test = train_test_split(bag_pd, Y, random_state=1)

print(f"training set size {len(bag_X_train)}")
print(f"test set size {len(bag_X_test)}")

training set size 16810
test set size 5604


# TF-IDF - Generate Feature Vectors

In [11]:
# # TODO: play with min_df and max_df
# # TODO: play with variations of ngram
# tv = TfidfVectorizer(min_df=0., max_df=1., ngram_range=(1,3), use_idf=True)
# tv_matrix = tv.fit_transform(X.array)
# vocab = tv.get_feature_names()
# tv_pd = pd.DataFrame(np.round(tv_matrix.toarray(), 2), columns=vocab)

In [12]:
# # split results into training and test set
# tv_X_train, tv_X_test, tv_Y_train, tv_Y_test = train_test_split(tv_pd, Y, random_state=1)

# print(f"training set size {len(tv_X_train)}")
# print(f"test set size {len(tv_X_test)}")

# Now Let's Run Some Models

In [54]:
# empty frame that the model cells below add one row to per model run
results_pd = pd.DataFrame()

### KNN

In [57]:
# use K-nearest neighbors to train
if ENABLE_KNN and ENABLE_BOW:
    for neighbor in NEIGHBORS:
        print(f'\n{neighbor} neighbors\n-----------------------')
        neigh = KNeighborsClassifier(n_neighbors=neighbor, n_jobs=-1)
        Y_predict, report = model_fit_predict(neigh, bag_X_train, bag_Y_train, bag_X_test, bag_Y_test)

        report["model"] = "KNN"
        report["feature_type"] = "BoW"
        report["parameters"] = {"neighbors": neighbor}

        # DataFrame.append was removed in pandas 2.0; pd.concat adds the same
        # single row and is available in old and new pandas alike
        results_pd = pd.concat([results_pd, pd.DataFrame([report])], ignore_index=True)

results_pd.head()




5 neighbors
-----------------------
Start training: 2019-05-17 00:58:52
End training: 2019-05-17 00:59:21
End Scoring: 2019-05-17 00:59:21
End predict: 2019-05-17 01:09:53
Training time (min): 0
Scoring time (min): 0
Predict time (min): 11


Unnamed: 0,1_f1-score,1_precision,1_recall,1_support,2_f1-score,2_precision,2_recall,2_support,3_f1-score,3_precision,...,test_examples,test_features,total_time_min,train_examples,train_features,train_time_min,weighted avg_f1-score,weighted avg_precision,weighted avg_recall,weighted avg_support
0,0.456848,0.352643,0.648469,751.0,0.08998,0.186441,0.059299,371.0,0.086053,0.213235,...,5604.0,20541.0,11.0,16810.0,20541.0,0.0,0.499946,0.489384,0.546752,5604.0


In [60]:
# transpose so every metric becomes a row (easier to read than one very wide row)
# and show the first 48 of them
results_pd.T.head(48)


Unnamed: 0,0
1_f1-score,0.456848
1_precision,0.352643
1_recall,0.648469
1_support,751
2_f1-score,0.0899796
2_precision,0.186441
2_recall,0.0592992
2_support,371
3_f1-score,0.0860534
3_precision,0.213235


### Bag of Words using Radius Neighbors Classifier

Documentation says if the data is not evenly distributed Radius Neighbors might be a better algorithm so trying that here

In [None]:
if ENABLE_RN and ENABLE_BOW:
    for radius in RADII:
        rnc = RadiusNeighborsClassifier(radius=radius, n_jobs=-1)
        print(f'\nRadius: {radius}\n-----------------------')
        # model_fit_predict returns (predictions, report); unpack it so the
        # labels below are added to the report dictionary, not to the tuple
        Y_predict, report = model_fit_predict(rnc, bag_X_train, bag_Y_train, bag_X_test, bag_Y_test)
        report["model"] = "RN"
        report["feature_type"] = "BoW"
        report["parameters"] = {"radius": radius}
        # DataFrame.append was removed in pandas 2.0; pd.concat adds the same
        # single row and is available in old and new pandas alike
        results_pd = pd.concat([results_pd, pd.DataFrame([report])], ignore_index=True)

results_pd.head()


### Bag of Words using Logistic Regression - what parameters should I play with here?

In [None]:
if ENABLE_LR and ENABLE_BOW:
    for c in C:
        # pass the loop value as C; without it every iteration trains the same
        # default-regularization model and the sweep over C has no effect
        lr = LogisticRegression(C=c, random_state=0, solver='lbfgs',
                                multi_class='auto',
                                max_iter=1000, n_jobs=-1)
        # label the run with its own parameter rather than a leftover radius
        print(f'\nRegularization: {c}\n-----------------------')
        # model_fit_predict returns (predictions, report); unpack it so the
        # labels below are added to the report dictionary, not to the tuple
        Y_predict, report = model_fit_predict(lr, bag_X_train, bag_Y_train, bag_X_test, bag_Y_test)
        report["model"] = "LR"
        report["feature_type"] = "BoW"
        report["parameters"] = {"c": c}
        # DataFrame.append was removed in pandas 2.0; pd.concat adds the same
        # single row and is available in old and new pandas alike
        results_pd = pd.concat([results_pd, pd.DataFrame([report])], ignore_index=True)

results_pd.head()


# TF-IDF

In [None]:
# TODO: play with min_df and max_df
# TODO: play with variations of ngram
tv = TfidfVectorizer(min_df=0., max_df=1., ngram_range=(1,3), use_idf=True)
tv_matrix = tv.fit_transform(X.array)
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever one the installed version provides
# (note: this replaces the bag-of-words vocab held in the same variable)
if hasattr(tv, "get_feature_names_out"):
    vocab = tv.get_feature_names_out()
else:
    vocab = tv.get_feature_names()
# one row per review, one column per n-gram, weights rounded to 2 decimals
tv_pd = pd.DataFrame(np.round(tv_matrix.toarray(), 2), columns=vocab)

In [None]:
# split results into training and test set
# no test_size is passed, so train_test_split's default proportion applies;
# the fixed random_state keeps the split the same from run to run
tv_X_train, tv_X_test, tv_Y_train, tv_Y_test = train_test_split(tv_pd, Y, random_state=1)

print(f"training set size {len(tv_X_train)}")
print(f"test set size {len(tv_X_test)}")

### KNN

In [None]:
# use K-nearest neighbors to train

if ENABLE_KNN and ENABLE_TFIDF:

    for neighbor in NEIGHBORS:
        print(f'\n{neighbor} neighbors\n-----------------------')
        neigh = KNeighborsClassifier(n_neighbors=neighbor, n_jobs=-1)
        # model_fit_predict returns (predictions, report); unpack it so the
        # labels below are added to the report dictionary, not to the tuple
        Y_predict, report = model_fit_predict(neigh, tv_X_train, tv_Y_train, tv_X_test, tv_Y_test)
        report["model"] = "KNN"
        report["feature_type"] = "TFIDF"
        report["parameters"] = {"neighbors": neighbor}
        # DataFrame.append was removed in pandas 2.0; pd.concat adds the same
        # single row and is available in old and new pandas alike
        results_pd = pd.concat([results_pd, pd.DataFrame([report])], ignore_index=True)

results_pd.head()
        


### Radius Neighbor Classifier

In [None]:
if ENABLE_RN and ENABLE_TFIDF:
    for radius in RADII:
        rnc = RadiusNeighborsClassifier(radius=radius, n_jobs=-1)
        print(f'\nRadius: {radius}\n-----------------------')
        # model_fit_predict returns (predictions, report); unpack it so the
        # labels below are added to the report dictionary, not to the tuple
        Y_predict, report = model_fit_predict(rnc, tv_X_train, tv_Y_train, tv_X_test, tv_Y_test)
        report["model"] = "RN"
        report["feature_type"] = "TFIDF"
        report["parameters"] = {"radius": radius}
        # DataFrame.append was removed in pandas 2.0; pd.concat adds the same
        # single row and is available in old and new pandas alike
        results_pd = pd.concat([results_pd, pd.DataFrame([report])], ignore_index=True)

results_pd.head()


### Logistic Regression Classifier

In [None]:
if ENABLE_LR and ENABLE_TFIDF:
    for c in C:
        # pass the loop value as C; without it every iteration trains the same
        # default-regularization model and the sweep over C has no effect
        lr = LogisticRegression(C=c, random_state=0, solver='lbfgs',
                                multi_class='auto',
                                max_iter=1000, n_jobs=-1)
        print(f'\nRegularization: {c}\n-----------------------')
        # model_fit_predict returns (predictions, report); unpack it so the
        # labels below are added to the report dictionary, not to the tuple
        Y_predict, report = model_fit_predict(lr, tv_X_train, tv_Y_train, tv_X_test, tv_Y_test)
        report["model"] = "LR"
        report["feature_type"] = "TFIDF"
        report["parameters"] = {"c": c}
        # DataFrame.append was removed in pandas 2.0; pd.concat adds the same
        # single row and is available in old and new pandas alike
        results_pd = pd.concat([results_pd, pd.DataFrame([report])], ignore_index=True)

results_pd.head()


# Write data to an output file so we can load it back in later

In [None]:
if WRITE_TO_CSV:
    # file name is "<today's date>-<feature column>-<OUTFILE>"; the path is relative,
    # and index=False leaves the row index out of the file
    results_pd.to_csv(f'{datetime.now().strftime(DATE_FORMAT)}-{FEATURE_COLUMN}-{OUTFILE}', index=False)

# Data Visualization For Our Results

In [None]:
# # visualize some data
# sns.set(font_scale=2)
# sns.set_context(font_scale=3)
# f, ax = plt.subplots(6, 2, figsize=(20,50))
# plt.tight_layout(pad=2, h_pad=5)

# # KNN Graphs


# # total time by neighbor
# sns.lineplot(x="neighbors", y="total_time_min", data=knn_results_pd, marker='o', color='b', ax=ax[0, 0])
# ax[0, 0].set_title("KNN BoW Total Time (minutes)")

# # score by neighbor
# sns.lineplot(x="neighbors", y="score", data=knn_results_pd, marker='o', color='b', ax=ax[0, 1])
# ax[0, 1].set_title("KNN BoW Score")

# # total time by neighbor
# sns.lineplot(x="neighbors", y="total_time_min", data=knn_tv_results_pd, marker='o', color='b', ax=ax[1, 0])
# ax[1, 0].set_title("KNN TFIDF Total Time (minutes)")

# # score by neighbor
# sns.lineplot(x="neighbors", y="score", data=knn_tv_results_pd, marker='o', color='b', ax=ax[1, 1])
# ax[1, 1].set_title("KNN TFIDF Score")


# # Radius Neighbor Graphs

# # total time by radius
# sns.lineplot(x="radius", y="total_time_min", data=rn_results_pd, marker='o', color='g', ax=ax[2, 0])
# ax[2, 0].set_title("Radius BoW Total Time (minutes)")

# # score by radius
# sns.lineplot(x="radius", y="score", data=rn_results_pd, marker='o', color='g', ax=ax[2, 1])
# ax[2, 1].set_title("Radius BoW Score")

# # total time by radius
# sns.lineplot(x="radius", y="total_time_min", data=rn_tv_results_pd, marker='o', color='g', ax=ax[3, 0])
# ax[3, 0].set_title("Radius TFIDF Total Time (minutes)")

# # score by radius
# sns.lineplot(x="radius", y="score", data=rn_tv_results_pd, marker='o', color='g', ax=ax[3, 1])
# ax[3, 1].set_title("Radius TFIDF Score")


# # Logistic Regression Graphs

# # total time by c
# sns.lineplot(x="c", y="total_time_min", data=lr_results_pd, marker='o', color='c', ax=ax[4, 0])
# ax[4, 0].set_title("Logistic Regression BoW Total Time (minutes)")

# # score by c
# sns.lineplot(x="c", y="score", data=lr_results_pd, marker='o', color='c', ax=ax[4, 1])
# ax[4, 1].set_title("Logistic BoW Regression Score")


# # total time by c
# sns.lineplot(x="c", y="total_time_min", data=lr_tv_results_pd, marker='o', color='c', ax=ax[5, 0])
# ax[5, 0].set_title("Logistic Regression TFIDF Total Time (minutes)")

# # score by c
# sns.lineplot(x="c", y="score", data=lr_tv_results_pd, marker='o', color='c', ax=ax[5, 1])
# ax[5, 1].set_title("Logistic TFIDF Regression Score")

