# Init

In [1]:
import gzip

import numpy as np
import pandas as pd

# Lift pandas' display limits so frames show every column and untruncated cell text.
for option_name in ("display.max_columns", "display.max_colwidth"):
    pd.set_option(option_name, None)

Load the gzip-compressed review file into a DataFrame.

In [2]:
# NOTE(review): every line is parsed with eval(), which executes arbitrary
# code -- only run this against a trusted copy of the data file. If the records
# are plain literals, ast.literal_eval would be a safer parser (TODO confirm).
# The context manager closes the gzip handle once all lines have been parsed.
with gzip.open("./data.json.gz") as reviews_file:
    df = pd.DataFrame([eval(line) for line in reviews_file])

Sample of the data.

In [3]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]","This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy",5.0,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]","I had a factory Glock tool that I was using for my Glock 26, 27, and 17. I've since lost it and had needed another. Since I've used Ghost products prior, and know that they are reliable, I had decided to order this one. Sure enough, this is just as good as a factory tool.",5.0,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]","If you don't have a 3/32 punch or would like to have one in your Glock bag, this is okay. The butt end of it is handy for pushing pins back in place. If you already have a 3/32 punch and don't need another, don't both with this one.",4.0,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]","This works no better than any 3/32 punch you would find at the hardware store. Actually, I think you would be better with a regular punch as it has more to hold on to.",4.0,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]","I purchased this thinking maybe I need a special tool to easily pop off my base plates for my magazines, but it does the same as a regular punch tool. Glock mags are a pain to get the base plates off. The tool does not really make a difference.",4.0,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [4]:
# Summarize column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 296337 entries, 0 to 296336
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   reviewerID      296337 non-null  object 
 1   asin            296337 non-null  object 
 2   reviewerName    294935 non-null  object 
 3   helpful         296337 non-null  object 
 4   reviewText      296337 non-null  object 
 5   overall         296337 non-null  float64
 6   summary         296337 non-null  object 
 7   unixReviewTime  296337 non-null  int64  
 8   reviewTime      296337 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 20.3+ MB


Timestamp range of the dataset.

In [5]:
# Interpret the review times as epoch seconds, then show the earliest and latest.
timestamp = pd.to_datetime(df["unixReviewTime"], unit="s")
earliest_review, latest_review = timestamp.min(), timestamp.max()
(earliest_review, latest_review)

(Timestamp('2002-03-07 00:00:00'), Timestamp('2014-07-23 00:00:00'))

# Preprocessing

In [6]:
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

stop_words = stopwords.words("english")

Lowercase and tokenize the review text, remove stop words, and lemmatize the remaining tokens.

In [7]:
wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r"[a-z]+")
# Snapshot of the stop-word list as a set: testing membership against the list
# scans it linearly for every token, which adds up over the whole corpus.
_stop_word_set = frozenset(stop_words)
# WordNet parts of speech tried, in this order, when lemmatizing each token.
_LEMMA_POS_ORDER = (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV)


def preprocess(document):
    """Normalize one review text into a space-separated string of lemmas.

    The text is lowercased and split into runs of the letters a-z, so digits,
    punctuation and any other characters act as separators and are dropped.
    Stop words are removed, and each remaining token is lemmatized as a noun,
    verb, adjective and adverb in turn.

    Parameters
    ----------
    document : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; an empty string when no
        token survives.
    """
    lemmas = []
    for token in tokenizer.tokenize(document.lower()):
        if token in _stop_word_set:
            continue
        # Feed the result of each part-of-speech pass into the next one.
        for pos in _LEMMA_POS_ORDER:
            token = wordnet_lemmatizer.lemmatize(token, pos)
        lemmas.append(token)
    return " ".join(lemmas)


# Run the preprocessing over every review; the result is aligned with df's index.
processed_text = df["reviewText"].map(preprocess)

Sample review text before and after preprocessing.

In [8]:
# Show the review labelled 0 before and after preprocessing.
first_raw_review = df["reviewText"][0]
first_processed_review = processed_text[0]
print("Pre:", first_raw_review)
print("Post:", first_processed_review)

Pre: This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy
Post: come time veru happy have use already make take pin glock easy


Create a binary target label: 1 for positive reviews (rating above 3) and 0 for all other reviews (rating of 3 or below).

In [9]:
def cat_is_positive(x):
    """Map a rating to a binary sentiment label.

    Returns 1 when ``x`` is strictly greater than 3, otherwise 0.
    """
    return 1 if x > 3 else 0


# Label every review from its rating in the "overall" column.
is_positive = df["overall"].map(cat_is_positive)

# Models

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Split the preprocessed text and labels; the fixed seed keeps the split
# reproducible. The raw-text halves get their own names so that X_train and
# X_test only ever refer to the vectorized matrices.
text_train, text_test, y_train, y_test = train_test_split(
    processed_text, is_positive, random_state=42
)

# Learn the vocabulary from the training text only, keeping terms that occur
# in at least 5 training documents. fit_transform builds the training matrix
# in the same pass instead of tokenizing the training text a second time.
vectorizer = CountVectorizer(min_df=5)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

In [11]:
# Report the vocabulary size and print every 1000th term as a spot check.
feature_names = vectorizer.get_feature_names_out()
n_features = len(feature_names)
print("Number of features: {}".format(n_features))
sampled_feature_names = feature_names[::1000]
print("Show some feature names : \n", sampled_feature_names)

Number of features: 18897
Show some feature names : 
 ['aa' 'atlas' 'breathe' 'cmp' 'dayhike' 'eas' 'fitbit' 'groupset'
 'inactivity' 'lace' 'metel' 'oahu' 'pine' 'rank' 'run' 'slot' 'sunrise'
 'trailer' 'vinci']


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Score a fresh multinomial naive Bayes model with 5-fold cross-validation
# on the training split.
scores = cross_val_score(MultinomialNB(), X_train, y_train, cv=5)
mean_cv_accuracy = scores.mean()
print("Mean cross-validation accuracy: {:.3f}".format(mean_cv_accuracy))

Mean cross-validation accuracy: 0.862


In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def model_evaluation(predictions, y_test):
    """
    Print accuracy, the classification report and the confusion matrix
    for ``predictions`` measured against the true labels ``y_test``.
    """
    accuracy = accuracy_score(y_test, predictions)
    print("\nAccuracy on validation set: {:.4f}".format(accuracy))

    report = classification_report(y_test, predictions)
    print("\nClassification report : \n", report)

    matrix = confusion_matrix(y_test, predictions)
    print("\nConfusion Matrix : \n", matrix)


# Fit on the training split, then evaluate predictions for the held-out split.
mnb = MultinomialNB().fit(X_train, y_train)
predictions = mnb.predict(X_test)
model_evaluation(predictions=predictions, y_test=y_test)


Accuracy on validation set: 0.8614

Classification report : 
               precision    recall  f1-score   support

           0       0.52      0.48      0.50     10754
           1       0.91      0.93      0.92     63331

    accuracy                           0.86     74085
   macro avg       0.72      0.70      0.71     74085
weighted avg       0.86      0.86      0.86     74085


Confusion Matrix : 
 [[ 5177  5577]
 [ 4688 58643]]
