In [1]:
## HW1

import pandas as pd
import re
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# input text
review_good = open('../datasets/good_amazon_toy_reviews.txt', 'r', encoding = 'utf-8')
review_g = review_good.readlines()

review_poor = open('../datasets/poor_amazon_toy_reviews.txt', 'r', encoding = 'utf-8')
review_p = review_poor.readlines()

# combine good reviews and poor reviews, remove \n
reviews = review_g + review_p
for i in range(len(reviews)):
    reviews[i] = reviews[i].replace('\n', '')

In [2]:
# show total number of reviews
len(reviews)

114917

In [3]:
# show first four reviews
reviews[:4]

['Excellent!!!',
 '"Great quality wooden track (better than some others we have tried). Perfect match to the various vintages of Thomas track that we already have. There is enough track here to have fun and get creative incorporating your key pieces with track splits, loops and bends."',
 'my daughter loved it and i liked the price and it came to me rather than shopping with a ton of people around me. Amazon is the Best way to shop!',
 'Great item. Pictures pop thru and add detail as &#34;painted.&#34;  Pictures dry and it can be repainted.']

In [4]:
# PART A
# remove incorrect decodings
for i in range(len(reviews)):
    reviews[i] = re.sub(r'<\w+\s?\/?>', '',reviews[i]) # remove words like <br /> or <br>
    reviews[i] = re.sub(r'&#\w{2,};', '',reviews[i]) # remove words like &#34; patterns according to this entities list https://www.freeformatter.com/html-entities.html
    reviews[i] = re.sub(r'%\w{2}', '',reviews[i]) # remove words like %20; patterns according to encoding reference https://krypted.com/utilities/html-encoding-reference/

In [5]:
# show the fourth review that has incorrect decoding before
reviews[3]

'Great item. Pictures pop thru and add detail as painted.  Pictures dry and it can be repainted.'

In [6]:
# normalize all reference

# create list of possible patterns for each category, process words in upper case later

# recipients - children: possible patterns: 2-year/years/yr/yrs-old, seven year/years/yr/yrs old, 2 yr. old, (grand)son(s)/daughter(s)/boy(s)/girl(s)/kid(s)/child(ren)/baby/babies/sister(s)/brother(s)/cousin(s)/nephew(s)/niece(s)
# children = [r'\b(\w+\s?\-?(?:years?|yrs?)\.?\-?\s?old)\b', r'\b(child(?:ren)?|kids?|boys?|girls?|sons?|daughters?|sisters?|brothers?|cousins?|nephews?|nieces?|bab(?:y|ies)|wild)\b', r'\b(grand(?:child(?:ren)?|kids?|sons?|daughters?))\b']
children = [r'\b(\w+\s?\-?(?:years?|yrs?)\.?\-?\s?old)\b', r'\b((?:grand)?(?:child(?:ren)?|kids?|boys?|girls?|sons?|daughters?|sisters?|brothers?|cousins?|nephews?|nieces?|bab(?:y|e|ies)|wild))\b']

# recipients - parents: father, mother, grandfather, grandmother, parents, grandparents, aunt, uncle
# parents = [r'\b(father|mother|parents?|aunts?|uncles?|dad(?:dy)?|mom(?:my)?|papa|mama)\b', r'\b(grand(?:father|mother|parents?|pa|ma))\b']
parents = [r'\b((?:grand)?(?:father|mother|parents?|aunts?|uncles?|dad(?:dy)?|mom(?:my)?|pa(?:pa)?|ma(?:ma)?))\b']

# recipients - spouses and other relationship: wife, husband, darling, my love/honey
spouses = [r'\b(hu(?:sband|bby)|wi(?:fe|ves)|darlings?|(?:boy|girl)friend|partners?)\b', r'\b(my\s(?:love|honey))\b']

# combine all patterns of recipients
recipients = children + parents + spouses


In [7]:
# occasions - Chistmas, Anniversary, Birthday, Ceremony, Festival, Halloween, etc.
occasions = [r'\b((?:christ|x)mas|anniversar(?:y|ies)|b(?:irth)?day|ceremon(?:y|ies)|festivals?|halloween)\b']

In [8]:
# create new reviews list with all reviews in lower case
reviews_low = []

# apply all the patterns to each review
for i in range(len(reviews)):
    new_review = reviews[i].lower()
    # replace all relevant words with _RECIPIENT_
    for j in recipients:
        new_review = re.sub(j, '_RECIPIENT_', new_review)
    # replace all relevant words with _GIFT_OCCASION_
    for k in occasions:
        new_review = re.sub(k, '_GIFT_OCCASION_', new_review) 
    reviews_low.append(new_review)  

In [9]:
reviews_low[:6]

['excellent!!!',
 '"great quality wooden track (better than some others we have tried). perfect match to the various vintages of thomas track that we already have. there is enough track here to have fun and get creative incorporating your key pieces with track splits, loops and bends."',
 'my _RECIPIENT_ loved it and i liked the price and it came to me rather than shopping with a ton of people around me. amazon is the best way to shop!',
 'great item. pictures pop thru and add detail as painted.  pictures dry and it can be repainted.',
 'i was pleased with the product.',
 '_RECIPIENT_ like it']

In [None]:
# PART B
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('punkt') 
nltk.download('stopwords')

In [11]:
# due to the local memory processing problem for the large scale text, I will sample for 20000 documents/rows for vectorization 1-5
import random
random.seed(0)
sampled_reviews = random.sample(reviews_low, 20000)

In [12]:
# 1. TFIDF vectorizer, no stopword removal or other preprocessing

vectorizer = TfidfVectorizer()
vectorizer.fit(sampled_reviews)
vector_1 = vectorizer.transform(sampled_reviews)
review_df = pd.DataFrame(vector_1.toarray(), columns=vectorizer.get_feature_names())
print(f"Shape of dataframe is {review_df.shape}")

Shape of dataframe is (20000, 17481)


In [13]:
# 2. Count vectorizer, no stopword removal or other preprocessing

vectorizer = CountVectorizer() 
vectorizer.fit(sampled_reviews)
vector_2 = vectorizer.transform(sampled_reviews)
review_df = pd.DataFrame(vector_2.toarray(), columns=vectorizer.get_feature_names())
print(f"Shape of dataframe is {review_df.shape}")

Shape of dataframe is (20000, 17481)


In [14]:
# 3. either vectorizer (Count vectorizer) + stop words removed

vectorizer = CountVectorizer(stop_words='english') 
vectorizer.fit(sampled_reviews)
vector_3 = vectorizer.transform(sampled_reviews)
review_df = pd.DataFrame(vector_3.toarray(), columns=vectorizer.get_feature_names())
print(f"Shape of dataframe is {review_df.shape}")

Shape of dataframe is (20000, 17188)


In [15]:
# 4. either vectorizer (Count vectorizer) + stop words removed + stemming

# create list for stemmed reviews
stemmer = PorterStemmer()

# function to stem each sentence in list
def stem_list(list):
    stemmed_reviews = []
    for review in list:
        words = word_tokenize(review)
        new_words = []
        for word in words:
            new_words.append(stemmer.stem(word))
        stemmed_review = " ".join(new_words)
        stemmed_reviews.append(stemmed_review)
    return stemmed_reviews

stemmed_reviews = stem_list(sampled_reviews)

# vectorization with stemming
vectorizer = CountVectorizer(stop_words='english') 
vectorizer.fit(stem_list(stemmed_reviews))
vector_4 = vectorizer.transform(stem_list(stemmed_reviews))
review_df = pd.DataFrame(vector_4.toarray(), columns=vectorizer.get_feature_names())
print(f"Shape of dataframe is {review_df.shape}")

Shape of dataframe is (20000, 12515)


In [16]:
# 5. either vectorizer (Count vectorizer) + stop words removed + lemmatization

# create list for lemmatized reviews
lemmatizer = WordNetLemmatizer()

## refer to https://gist.github.com/gaurav5430/9fce93759eb2f6b1697883c3782f30de#file-nltk-lemmatize-sentences-py
# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

# function to lemmatize each sentence in list
def lemma_list(list):
    lemma_reviews = []
    for review in list:
        lemma_review = lemmatize_sentence(review)
        lemma_reviews.append(lemma_review)
    return lemma_reviews

lemma_reviews = lemma_list(sampled_reviews)

# vectorization with lemmatization
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(lemma_reviews)
vector_5 = vectorizer.transform(lemma_reviews)
review_df = pd.DataFrame(vector_5.toarray(), columns=vectorizer.get_feature_names())
print(f"Shape of dataframe is {review_df.shape}")

Shape of dataframe is (20000, 14297)


In [17]:
# 6. either vectorizer (Count vectorizer) + stop words removed + lemmatization + ngram size of 2
# due to the memory problem, the 20000 sampled reviews list doesn't work, so here I use unsampled data with feature limit 10000

# original reviews with lemmatization
lemma_reviews_full = lemma_list(reviews_low)

vectorizer = CountVectorizer(ngram_range = (2,2), max_features = 10000) 
vectorizer.fit(lemma_reviews_full)
vector_6 = vectorizer.transform(lemma_reviews_full)
review_df = pd.DataFrame(vector_6.toarray(), columns=vectorizer.get_feature_names())
print(f"Shape of dataframe with ngram size = 2 is {review_df.shape}")

Shape of dataframe with ngram size = 2 is (114917, 10000)


In [18]:
# 7. either vectorizer (Count vectorizer) + stop words removed + lemmatization + ngram size of 3
# due to the memory problem, the 20000 sampled reviews list doesn't work, so here I use unsampled data with feature limit

vectorizer = TfidfVectorizer(ngram_range = (3,3), max_features = 10000) 
vectorizer.fit(lemma_reviews_full)
vector_7 = vectorizer.transform(lemma_reviews_full)
review_df = pd.DataFrame(vector_7.toarray(), columns=vectorizer.get_feature_names())
print(f"Shape of dataframe with ngram size = 3 is {review_df.shape}")

Shape of dataframe with ngram size = 3 is (114917, 10000)


In [19]:
## appendix: all csr shape of above vectorizations with original data (without sampling or feature limit)

vectors = []
#1 TFIDF
vectorizer = TfidfVectorizer()
vectorizer.fit(reviews_low)
vectors.append(vectorizer.transform(reviews_low))

#2 count vectorizer
vectorizer = CountVectorizer() 
vectorizer.fit(reviews_low)
vectors.append(vectorizer.transform(reviews_low))

#3 count vectorizer + stopword
vectorizer = CountVectorizer(stop_words='english') 
vectorizer.fit(reviews_low)
vectors.append(vectorizer.transform(reviews_low))

#4 count vectorizer + stopword + stemming
vectorizer = CountVectorizer(stop_words='english') 
vectorizer.fit(stem_list(reviews_low))
vectors.append(vectorizer.transform(stem_list(reviews_low)))

#5 count vectorizer + stopword + lemmatization
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(lemma_reviews_full)
vectors.append(vectorizer.transform(lemma_reviews_full))

#6 count vectorizer+ stopword + lemmatization + ngram size of 2
vectorizer = CountVectorizer(stop_words='english', ngram_range = (2,2)) 
vectorizer.fit(lemma_reviews_full)
vectors.append(vectorizer.transform(lemma_reviews_full))

#7 stopword + lemmatization + ngram size of 3
vectorizer = CountVectorizer(stop_words='english', ngram_range = (3,3)) 
vectorizer.fit(lemma_reviews_full)
vectors.append(vectorizer.transform(lemma_reviews_full))

for i in range(len(vectors)):
    print(f"Shape of csr for vectorization {i+1} is {vectors[i].shape}")


Shape of csr for vectorization 1 is (114917, 40514)
Shape of csr for vectorization 2 is (114917, 40514)
Shape of csr for vectorization 3 is (114917, 40210)
Shape of csr for vectorization 4 is (114917, 30844)
Shape of csr for vectorization 5 is (114917, 35211)
Shape of csr for vectorization 6 is (114917, 561625)
Shape of csr for vectorization 7 is (114917, 994608)
