In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# read txt file

good_reviews_df = pd.read_csv('good_amazon_toy_reviews.txt', sep="\n", header=None, names=["line"])
poor_reviews_df = pd.read_csv('poor_amazon_toy_reviews.txt', sep="\n", header=None, names=["line"])

In [3]:
# remove punctuation
good_reviews_df['no_punctuation'] = good_reviews_df['line'].str.replace(r'(<br />|['+string.punctuation+r'])',' ')
poor_reviews_df['no_punctuation'] = poor_reviews_df['line'].str.replace(r'(<br />|['+string.punctuation+r'])',' ')

# lower case
good_reviews_df['lower_case'] = good_reviews_df['no_punctuation'].str.lower()
poor_reviews_df['lower_case'] = poor_reviews_df['no_punctuation'].str.lower()

# stopword
stopword_list = stopword_list = stopwords.words('english')
good_reviews_df['stopword'] = good_reviews_df['lower_case'].str.replace(
    r'\b('+'|'.join(stopword_list)+r')\b','')
poor_reviews_df['stopword'] = poor_reviews_df['lower_case'].str.replace(
    r'\b('+'|'.join(stopword_list)+r')\b','')

In [4]:
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

In [5]:
# reduce lengthening
good_reviews_df['short'] = good_reviews_df['stopword'].apply(reduce_lengthening)
poor_reviews_df['short'] = poor_reviews_df['stopword'].apply(reduce_lengthening)

In [6]:
lemmatizer = WordNetLemmatizer()
def lemma(s):
    l = [lemmatizer.lemmatize(word) for word in word_tokenize(s)]
    return ' '.join(l)

In [7]:
good_reviews_df['lemma'] = good_reviews_df['short'].apply(lemma)
poor_reviews_df['lemma'] = poor_reviews_df['short'].apply(lemma)

In [8]:
stemmer = PorterStemmer()
def stem(s):
    l = [stemmer.stem(word) for word in word_tokenize(s)]
    return ' '.join(l)

In [9]:
good_reviews_df['stem'] = good_reviews_df['short'].apply(stem)
poor_reviews_df['stem'] = poor_reviews_df['short'].apply(stem)

In [10]:
good_reviews_df

Unnamed: 0,line,no_punctuation,lower_case,stopword,short,lemma,stem
0,Excellent!!!,Excellent,excellent,excellent,excellent,excellent,excel
1,Great quality wooden track (better than some o...,Great quality wooden track better than some o...,great quality wooden track better than some o...,great quality wooden track better others ...,great quality wooden track better others tr...,great quality wooden track better others tried...,great qualiti wooden track better other tri pe...
2,my daughter loved it and i liked the price and...,my daughter loved it and i liked the price and...,my daughter loved it and i liked the price and...,daughter loved liked price came rathe...,daughter loved liked price came rather s...,daughter loved liked price came rather shoppin...,daughter love like price came rather shop ton ...
3,Great item. Pictures pop thru and add detail a...,Great item Pictures pop thru and add detail a...,great item pictures pop thru and add detail a...,great item pictures pop thru add detail 3...,great item pictures pop thru add detail 34 ...,great item picture pop thru add detail 34 pain...,great item pictur pop thru add detail 34 paint...
4,I was pleased with the product.,I was pleased with the product,i was pleased with the product,pleased product,pleased product,pleased product,pleas product
...,...,...,...,...,...,...,...
102183,fun game,fun game,fun game,fun game,fun game,fun game,fun game
102184,"Nice kit,well priced",Nice kit well priced,nice kit well priced,nice kit well priced,nice kit well priced,nice kit well priced,nice kit well price
102185,Does what it is supposed to do.,Does what it is supposed to do,does what it is supposed to do,supposed,supposed,supposed,suppos
102186,Grandson loves playing with these police figur...,Grandson loves playing with these police figur...,grandson loves playing with these police figur...,grandson loves playing police figurines……,grandson loves playing police figurines……,grandson love playing police figurines……,grandson love play polic figurines……


# Explanation

Using the **`poor_amazon_toy_reviews.txt`** and **`good_amazon_toy_reviews.txt`** datasets, clean and parse the text reviews. Explain the decisions you make:
- why remove/keep stopwords?

**stopwords which have high tf but low idf don't have too much meaning, so they are useless for our analysis and should be removed**

- stemming versus lemmatization?

**at first I wanted lemmatization since it trims less and the word will contain more meaning than stemming, while later I found the dimensionality is too high so I had to use stemming to lower the number of features**

- regex cleaning and substitution?

**use regex to remove punctuations and common stopwords**

- adding in custom stopwords?

**for good review we better remove good word like 'good','great','love'**

**for poor review we better remove bad word like**


- what `n` for your `n-grams`?

**(3,4) seems more meaningful**

- which words to collocate together?

**it's determined by ngram range and we can see the result in the table above**

In [11]:
def tfidf(corpus,ngram_range,stop_words):
    vectorizer = TfidfVectorizer(ngram_range=ngram_range,
                             token_pattern=r'\b[a-zA-Z]{3,}\b',
                             min_df=0.001,
                             stop_words=stop_words)
    X = vectorizer.fit_transform(corpus)
    terms = vectorizer.get_feature_names()
    tf_idf = pd.DataFrame(X.toarray().transpose(), index=terms).sum(axis=1)
    score = pd.DataFrame(tf_idf, columns=["score"])
    score.sort_values(by="score", ascending=False, inplace=True)
    return score

In [12]:
tfidf(good_reviews_df['stem'],ngram_range=(2,3),stop_words=['love','good','great'])

Unnamed: 0,score
year old,3113.943085
well made,1313.729694
highli recommend,799.183031
fast ship,728.308912
lot fun,717.357632
...,...
discount rate,43.520015
receiv product free,40.970739
price exchang,39.304921
product discount exchang,37.420996


In [13]:
tfidf(good_reviews_df['stem'],ngram_range=(3,4),stop_words=['love','good','great'])

Unnamed: 0,score
year old grandson,561.844408
year old son,535.944025
year old daughter,517.684359
year old granddaught,350.052407
easi put togeth,340.743145
two year old,330.289651
bought year old,297.872449
one year old,252.74675
would definit recommend,237.065236
three year old,231.40084


**From tfidf of good reviews we learn that customers like it beacuse it's an easy-put-together toy for their little kid or grandkid as a birthday present. The shipping is fast and they got discount.**

In [14]:
tfidf(poor_reviews_df['stem'],ngram_range=(3,4),stop_words=['poor','bad','zero'])

Unnamed: 0,score
broke first day,51.288028
total wast money,43.818158
complet wast money,42.610187
wast time money,41.652088
first time use,34.250543
look noth like,33.509598
year old son,31.294345
look like pictur,29.23207
broke first time,27.994626
noth like pictur,26.450489


**From tfidf of poor reviews we learn that these customers buy this toy for the same usage like the other group, their main complaints is the poor quaility of this product**

# Explanation

Explain to what degree the TF-IDF findings make sense - what are its limitations?

**It makes sense, but still has some limitations. For example, different n-grams ranges return very different results, and it's difficlut to decide which one is better. Also, because of high dimensionality I have to do more aggressive removel like min_df, which might lose valueable information**

In [15]:
catalog_df = pd.read_csv('truncated_catalog.csv')

In [16]:
def is_womens_clothing(line):
    for i in line:
        if re.search(r'\bm[ae]n|kid|girl|boy|\bmale|baby',str(i).lower()):
            return False
    return True

catalog_df['is_womens_clothing'] = catalog_df.apply(is_womens_clothing, axis=1)

In [17]:
def category(line):
    for i in line:
        if re.search(r'bottom',str(i).lower()):
            return 'Bottom'
        if re.search(r'one ?piece',str(i).lower()):
            return 'One Piece'
        if re.search(r'shoe',str(i).lower()):
            return 'Shoe'
        if re.search(r'handbag',str(i).lower()):
            return 'Handbag'
        if re.search(r'scarf',str(i).lower()):
            return 'Scarf'
    return ''

catalog_df['category'] = catalog_df.apply(category, axis=1)

In [18]:
color_list = ['Beige', 'Black', 'Blue', 'Brown', 'Burgundy', 'Gold', 'Gray', 'Green', 'Multi', 'Navy', 'Neutral', 'Orange', 'Pinks', 'Purple', 'Red', 'Silver', 'Teal', 'White', 'Yellow']

In [27]:
def color(line):
    s = set()
    for i in line:
        for color in color_list:
            if re.search(color.lower(),str(i).lower()):
                s.add(color)
    return list(s)

catalog_df['color'] = catalog_df.apply(color, axis=1)

In [28]:
catalog_df

Unnamed: 0,brand,name,description,brand_category,brand_canonical_url,details,tsv,is_womens_clothing,category,color
0,FILA,Original Fitness Sneakers,Vintage Fitness leather sneakers with logo pri...,TheMensStore/Shoes/Sneakers/LowTop,https://www.saksfifthavenue.com/fila-original-...,Leather/synthetic upper\nLace-up closure\nText...,"'design':12 'fila':1A 'fit':3A,6 'leather':7 '...",False,Shoe,[]
1,CHANEL,HAT,,Unknown,https://www.saksfifthavenue.com/chanel-hat/pro...,WOOL TWEED & FELT,'chanel':1A 'hat':2A,True,,[]
2,Frame,Petit Oval Buckle Belt,A Timeless Leather Belt Crafted From Smooth Co...,Accessories,https://frame-store.com/products/petit-oval-bu...,,"'belt':5A,9 'buckl':4A,21 'cowhid':13 'craft':...",True,,"[Multi, Gold]"
3,Lilly Pulitzer Kids,Little Gir's & Girl's Ariana One-Piece UPF 50+...,Pretty ruffle sleeves and trim elevate essenti...,"JustKids/Girls214/Girls/SwimwearCoverups,JustK...",https://www.saksfifthavenue.com/lilly-pulitzer...,Scoopneck\nAdjustable straps\nFlutter sleeves\...,'50':14A 'allov':28 'ariana':9A 'color':27 'el...,False,,[]
4,Kissy Kissy,Baby Girl's Endearing Elephants Pima Cotton Co...,Versatile convertible gown with elephant applique,JustKids/Baby024months/InfantGirls/FootiesRompers,https://www.saksfifthavenue.com/kissy-kissy-ba...,V-neckline\nLong sleeves\nFront snap closure\n...,"'appliqu':17 'babi':3A 'convert':10A,13 'cotto...",False,Bottom,[]
...,...,...,...,...,...,...,...,...,...,...
42368,Mara Hoffman,Atlas oversized belted mélange wool coat,Mélange beige and cream wool Button fastenings...,Clothing / Coats / Long,https://www.net-a-porter.com/us/en/product/117...,"Fits true to size, take your normal size \nDes...",'100':21 'atlas':3A 'beig':10 'belt':5A 'breas...,True,,[Beige]
42369,Philosophy di Lorenzo Serafini,Cropped crochet-trimmed georgette top,"Cream georgette Ties at neck, concealed hook f...",Clothing / Tops / Blouses,https://www.net-a-porter.com/us/en/product/111...,"Fits true to size, take your normal size \nInt...",'100':21 'back':20 'conceal':16 'cream':11 'cr...,True,,[]
42370,Vanessa Bruno,Juna cotton-corduroy mini skirt,Sand cotton-corduroy Concealed hook and zip fa...,Clothing / Skirts / Mini,https://www.net-a-porter.com/us/en/product/116...,"Fits true to size, take your normal size \nTho...",'100':20 '35':25 '65':23 'acet':24 'back':19 '...,True,,[]
42371,Eve Denim,Annabel Rigid Mid-Rise Skinny Jean,Although mom jeans and boyfriend jeans are all...,women:CLOTHING:JEANS,https://pink.modaoperandi.com/eve-denim-r20/an...,Button and zip fastening \nComposition: 98% co...,"'add':36 'although':10 'annabel':3A,40 'boyfri...",False,,[]
