In [124]:
import json
import gzip
import pandas as pd

In [125]:

# Import the gzip-compressed JSON-lines review file and convert it to a dataframe

def getDF(path):
  """Load a gzip-compressed JSON-lines file into a DataFrame.

  Each non-blank line of the file is parsed as one JSON object and becomes
  one row; rows are indexed 0..n-1 in file order.

  Args:
    path: Location of the .json.gz file.

  Returns:
    A pandas DataFrame with one row per JSON object. Keys missing from an
    object show up as NaN in that row.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the file handle even when a line fails to parse
  with gzip.open(path, 'rb') as g:
    # Skip blank lines (e.g. a trailing newline) instead of failing on them
    records = [json.loads(l) for l in g if l.strip()]

  # Map row number -> record, then load in as a dataframe
  dict_df = dict(enumerate(records))
  return pd.DataFrame.from_dict(dict_df, orient='index')

# Load the gzipped review file into a dataframe; the path is relative to the
# working directory the notebook is run from
df = getDF('All_Beauty.json.gz')

In [126]:
# Find reviews whose text is empty or null, report how many there are, and remove them

# Boolean mask: True for every row with a missing or empty review text
df_empty_reviews_index = df["reviewText"].isnull() | (df["reviewText"] == "")

# Report the measured count rather than a fixed message, so the printed
# statement stays correct whatever the input file contains
print("Empty or null reviews removed:", int(df_empty_reviews_index.sum()))

# Keep only the rows that the mask does not flag
df = df[~df_empty_reviews_index]

There are no empty reviews


In [127]:
# Keep only the columns required for the analysis: review text, asin and reviewer ID

colums_reviewtext_asin = ["reviewText", "asin", "reviewerID"]
df = df.loc[:, colums_reviewtext_asin]

df.describe()


Unnamed: 0,reviewText,asin,reviewerID
count,370946,370946,370946
unique,319643,32571,323687
top,Good,B000FOI48G,A2GJX2KCUSR0EI
freq,802,8671,27


In [128]:
# Remove reviews whose text duplicates another review, because repeated
# samples affect what a classifier learns.
# keep="last" retains the final occurrence of each repeated text.
df = df.drop_duplicates("reviewText", keep="last")

# For reviewText, "count" should now equal "unique"
df.describe()

Unnamed: 0,reviewText,asin,reviewerID
count,319643,319643,319643
unique,319643,31438,299718
top,My husband wanted to reading about the Negro ...,B000FOI48G,A2GJX2KCUSR0EI
freq,1,8268,25


In [129]:
# Distribution of whitespace-separated word counts per review, after duplicate removal

df_reviews_without_empty = df["reviewText"].map(lambda review: len(review.split()))
df_reviews_without_empty.describe(
    percentiles=[0.1, 0.15, 0.20, 0.25, 0.30, 0.75, 0.85, 0.90, 0.92, 0.95, 0.97, 0.98, 0.99]
)

# Upper bound: the 95th percentile is 140 words; above it, length increases substantially for only a small portion of reviews.
# The 97th percentile (181 words) is chosen instead, to help the classifier learn from larger samples.
# Lower bound: 11 words (the 25th percentile); below that there are too few words for a classifier to learn from, especially after cleaning.

count    319643.000000
mean         42.042973
std          59.966297
min           1.000000
10%           5.000000
15%           7.000000
20%           9.000000
25%          11.000000
30%          13.000000
50%          24.000000
75%          49.000000
85%          73.000000
90%          95.000000
92%         109.000000
95%         140.000000
97%         181.000000
98%         216.000000
99%         287.000000
max        2443.000000
Name: reviewText, dtype: float64

In [130]:
# Keep only reviews whose pre-cleaning word count lies inside a fixed range

# Inclusive bounds on the number of whitespace-separated words per review.
# NOTE(review): the saved output of the following cell reports a maximum of
# 140 words, so the notebook may last have been run with a 140-word cap —
# confirm that 181 is the intended upper bound.
MIN_WORDS_PER_REVIEW = 11
MAX_WORDS_PER_REVIEW = 181

# Count words by splitting at any white space
df["num_words_reviews"] = df["reviewText"].apply(lambda x: len(x.split()))

# Series.between includes both end points by default
df = df[df["num_words_reviews"].between(MIN_WORDS_PER_REVIEW, MAX_WORDS_PER_REVIEW)]

df["reviewText"].describe()

# Reviews below the lower bound might not be useful for classification, as the classifier won't have enough information to learn from them.
# Especially after cleaning, only a few words will be left.

count                                                224502
unique                                               224502
top       My  husband wanted to reading about the Negro ...
freq                                                      1
Name: reviewText, dtype: object

In [131]:
# Re-check the word-count distribution after the word-count filter of the previous cell

df_reviews_without_empty = df["reviewText"].map(lambda review: len(review.split()))
df_reviews_without_empty.describe(percentiles=[0.03, 0.1, 0.75, 0.85, 0.90, 0.95])

count    224502.000000
mean         40.933132
std          28.759442
min          11.000000
3%           11.000000
10%          14.000000
50%          31.000000
75%          54.000000
85%          71.000000
90%          85.000000
95%         105.000000
max         140.000000
Name: reviewText, dtype: float64

In [132]:
# Regex for character removal
import re

# spaCy pipeline with the contextual spell checker attached
import spacy
import contextualSpellCheck
# Import the stop-word set explicitly rather than relying on spacy.lang.en
# having been imported as a side effect of another call
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load("en_core_web_sm")
contextualSpellCheck.add_to_pipe(nlp)

# NLTK for tokenisation and lemmatization
import nltk

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# WordNet backs the lemmatizer; word_tokenize additionally needs the Punkt
# sentence-tokenizer data. Newer NLTK releases look for "punkt_tab" and older
# ones for "punkt", so both are requested — an id the installed NLTK does not
# know should only be reported as an error message, not raised (TODO confirm
# against the installed NLTK version).
for nltk_resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# spaCy English stop words
stopping_words = STOP_WORDS



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [133]:
# Preprocessing of reviews

def preprocessing(raw_string):
    """Normalise one raw review into a space-joined string of lemmatised tokens.

    Steps, in order: strip HTML tags, lowercase, replace every character that
    is not a-z or whitespace with a space, tokenise with NLTK's word_tokenize,
    drop tokens found in ``stopping_words``, lemmatise the rest with
    ``lemmatizer`` and join them with single spaces.

    Args:
        raw_string: The review text as a str.

    Returns:
        The cleaned review as a str; an empty string when no token survives.
    """
    # Replace each HTML tag (and anything inside it) with a space rather than
    # with nothing, so the words on either side of a tag such as <br /> are
    # not fused into a single token
    no_html = re.sub(r'<[^>]*>', ' ', raw_string)

    # Make everything lowercase
    lowercase_column = no_html.lower()

    # TODO Decide whether and where to apply the spell check; it is not used in this function.
    # NOTE: apostrophes are replaced by spaces in the next step, so a
    # contraction is split into fragments ("didn't" -> "didn", "t")

    # Replace every non-alphabetic character that isn't whitespace with a space
    alphabetic_column = re.sub(r'[^a-z\s]', ' ', lowercase_column)

    # Tokenize string into individual words
    tokens = word_tokenize(alphabetic_column)

    # Remove stop words, then lemmatize the remaining tokens and join them into one string
    sentances_without_stop_words = ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stopping_words
    )

    return sentances_without_stop_words

# Clean the reviews. Only the first 240000 rows are passed to preprocessing;
# any row beyond that receives NaN in "clean_reviews" when the result is
# aligned back onto df by index.
df["clean_reviews"] = df["reviewText"].iloc[:240000].map(preprocessing)

df["clean_reviews"]

1         husband wanted reading negro baseball great ad...
3         baseball fan knew bit negro league learned lot...
4         good story black league bought book teach high...
7             didn t like product smudged eye throughly day
8           simply love product appreciate print feed order
                                ...                        
371338    love love love time saver people like hair gre...
371340    awful super frizzy tried comb fell completely ...
371341    skeptical buying worried look obviously fake s...
371343    way lighter photo mix blend color nice quality...
371344    return instruction phone packaging color order...
Name: clean_reviews, Length: 224502, dtype: object

In [134]:
# Drop every row whose cleaned review is null or an empty string

df = df.drop(
    index=df.loc[df["clean_reviews"].isnull() | df["clean_reviews"].eq("")].index
)

There are no empty reviews


In [135]:
# Distribution of word counts per review after cleaning

df_reviews_without_empty_clean = df["clean_reviews"].map(lambda cleaned: len(cleaned.split()))
df_reviews_without_empty_clean.describe(percentiles=[0.03, 0.1, 0.75, 0.85, 0.90, 0.95])

count    224498.000000
mean         18.081056
std          12.557273
min           1.000000
3%            5.000000
10%           6.000000
50%          14.000000
75%          23.000000
85%          31.000000
90%          37.000000
95%          45.000000
max         102.000000
Name: clean_reviews, dtype: float64