In [1]:
import pandas as pd
import numpy as np
import re
from fuzzywuzzy import fuzz
import matplotlib.pyplot as plt
import scipy.stats as st

In [2]:
# read txt file

good_reviews = open('good_amazon_toy_reviews.txt', "r", encoding='utf8').read()
poor_reviews = open('poor_amazon_toy_reviews.txt', "r", encoding='utf8').read()

good_reviews_df = pd.read_csv('good_amazon_toy_reviews.txt', sep="\n", header=None, names=["line"])
poor_reviews_df = pd.read_csv('poor_amazon_toy_reviews.txt', sep="\n", header=None, names=["line"])

In [3]:
# tokenize
word_list = [word.lower() for word in re.findall(r'\b[A-z]+\b', good_reviews+poor_reviews)]
word_set = set(word_list)

# Question 1
* A manager in the marketing department wants to find out the most frequently used words in positive reviews (five stars) and negative reviews (one star) in order to determine what occasion the toys are purchased for (Christmas, birthdays, and anniversaries.). He would like your opinion on **which gift occasions (Christmas, birthdays, or anniversaries) tend to have the most positive reviews** to focus marketing budget on those days.

In [4]:
# fuzzy match for serveral gift occasions
occasion_re = {}

christmas_word = [word for word in word_set if fuzz.ratio(r'christmas',word) >= 80]
birthday_word = [word for word in word_set if fuzz.ratio(r'birthday',word) >= 80]
anniversary_word = [word for word in word_set if fuzz.ratio(r'anniversary',word) >= 80]

occasion_re['christmas'] = r'\b(xmas|' + '|'.join([word for word in christmas_word if word not in ['christ','christians','charisma']]) + r')\b'
occasion_re['birthday'] = r'\b(' + '|'.join([word for word in birthday_word if word not in ['birthed']]) + r')\b'
occasion_re['anniversary']= r'\b(' + '|'.join([word for word in anniversary_word if word not in ['adversary']]) + r')\b'
occasion_re['valentine'] = r'\b(' + '|'.join([word for word in word_set if fuzz.ratio(r'valentine',word) >= 80]) + r')\b'
occasion_re['thanksgiving'] = r'\b(' + '|'.join([word for word in word_set if fuzz.ratio(r'thanksgiving',word) >= 90]) + r')\b'
occasion_re['halloween'] = r'\b(' + '|'.join([word for word in word_set if fuzz.ratio(r'halloween',word) >= 85]) + r')\b'
occasion_re['easter'] = r'\b(' + '|'.join([word for word in word_set if fuzz.ratio(r'easter',word) >= 95]) + r')\b'

occasion_re

{'christmas': '\\b(xmas|giftchristmas|chistmas|chrisrmas|chritmas|cristmas|christmas|cchristmas|chrisms|christmases|chistmass|christmaa)\\b',
 'birthday': '\\b(bithday|birhtday|bithdays|borthday|birthday|brthday|biryhday|birthdy|birthdat|birthdaygift|birthay|unbirthday|birtbday|birthdays|birthhday)\\b',
 'anniversary': '\\b(anniversaries|anniversary|anniversay|anniversry)\\b',
 'valentine': '\\b(valentines|valentine)\\b',
 'thanksgiving': '\\b(thanksgiving)\\b',
 'halloween': '\\b(halloweeen|halloween|holloween|halloweens)\\b',
 'easter': '\\b(easter)\\b'}

In [5]:
# counter
occasion_df = pd.DataFrame(columns=['good_word','good_review','poor_word','poor_review']
                           , index=['christmas','birthday','anniversary','valentine','thanksgiving','halloween','easter','total'])
occasion_df = occasion_df.fillna(0)

# parse from good reviews
for line in good_reviews_df['line']:
    for occasion in occasion_df.index:
        if occasion == 'total':
            occasion_df.loc[occasion,'good_word'] += len(re.findall(r'\b[A-z]+\b',line))
            occasion_df.loc[occasion,'good_review'] += 1
        else:
            if re.findall(occasion_re[occasion],line,flags=re.IGNORECASE):
                occasion_df.loc[occasion,'good_word'] += len(re.findall(occasion_re[occasion],line,flags=re.IGNORECASE))
                occasion_df.loc[occasion,'good_review'] += 1
    
# parse from poor reviews
for line in poor_reviews_df['line']:
    for occasion in occasion_df.index:
        if occasion == 'total':
            occasion_df.loc[occasion,'poor_word'] += len(re.findall(r'\b[A-z]+\b',line))
            occasion_df.loc[occasion,'poor_review'] += 1
        else:
            if re.findall(occasion_re[occasion],line,flags=re.IGNORECASE):
                occasion_df.loc[occasion,'poor_word'] += len(re.findall(occasion_re[occasion],line,flags=re.IGNORECASE))
                occasion_df.loc[occasion,'poor_review'] += 1
occasion_df

Unnamed: 0,good_word,good_review,poor_word,poor_review
christmas,1285,1211,76,70
birthday,4004,3812,446,411
anniversary,55,53,5,4
valentine,23,23,0,0
thanksgiving,11,11,0,0
halloween,416,375,21,16
easter,90,82,4,4
total,2794429,102188,432911,12696


**Answer: Given the table above, in both word and review level, "birthday" is most frequently used word compared to other gift occasion words.**

In [6]:
# stats
review_df = occasion_df.loc[:,['good_review','poor_review']]
review_df['%good'] = review_df['good_review']/(review_df['good_review']+review_df['poor_review'])
review_df['95%CI'] = 1.96*(review_df['%good']*(1-review_df['%good'])/(review_df['good_review']+review_df['poor_review']))**0.5
review_df['LB'] = review_df['%good']-review_df['95%CI']
review_df['UB'] = review_df['%good']+review_df['95%CI']
review_df.sort_values('LB',ascending=False)

Unnamed: 0,good_review,poor_review,%good,95%CI,LB,UB
valentine,23,0,1.0,0.0,1.0,1.0
thanksgiving,11,0,1.0,0.0,1.0,1.0
halloween,375,16,0.959079,0.019637,0.939443,0.978716
christmas,1211,70,0.945355,0.012447,0.932909,0.957802
easter,82,4,0.953488,0.044509,0.90898,0.997997
birthday,3812,411,0.902676,0.00894,0.893736,0.911615
total,102188,12696,0.889489,0.001813,0.887676,0.891302
anniversary,53,4,0.929825,0.066315,0.86351,0.99614


**Answer: Since gift occasion words can appear multiple times in one review, I will do the stats in review level.**

**After compute the % good reviews for each occasion, I compute the 95% confidence interval for % good reviews. Since the data isn't large enough, I sort it by lower bound as a safer estimation. Also ignore valentine and thanksgiving because of small data point.**

**As a conculsion, even though birthday have the most buyers, Halloween and Christmas tend to have the most positive reviews. Ae can take a look into those related reviews and find out why this toy is better for Halloween and Christmas than birthday.**

# Question 2
* One of your product managers suspects that **toys purchased for male recipients (husbands, sons, etc.)** tend to be much more likely to be reviewed poorly. She would like to see some data points confirming or rejecting her hypothesis. 

In [7]:
# fuzzy match for male word
male_word = ['males?','sons?','boyfriends?']
male_word.extend([word for word in word_set if fuzz.ratio(r'husband',word) >= 85])
male_word.extend(['fathers?','daddy','granddaddy'])
male_word.extend([word for word in word_set if fuzz.ratio(r'dad',word) >= 85])
male_word.extend([word for word in word_set if fuzz.ratio(r'grandpa',word) >= 85])
male_word.extend([word for word in word_set if fuzz.ratio(r'grandda',word) >= 85])
male_word.extend([word for word in word_set if fuzz.ratio(r'grandson',word) >= 85])
male_word.remove('dead')
male_word.remove('grands')
male_word.remove('grandma')
male_word.remove('granddaug')
male_word_re = r'\b(' + '|'.join([word for word in set(male_word)]) + r')\b'
male_word_re

'\\b(grandsone|gandson|granddad|dads|grandaon|grabdson|gramdson|gransson|gandsons|gradson|husbands|grandsond|husban|grandson|vgrandson|huband|grandad|fathers?|mygrandson|grandsons|granddaddy|grandspn|husband|daddy|males?|grandso|granpa|grndson|dada|grandpa|dadi|granda|dad|sons?|granson|boyfriends?|grandma)\\b'

In [8]:
# counter
male_df = pd.DataFrame(columns=['good_review','poor_review'],index=['male','total'])
male_df = male_df.fillna(0)

# parse male word from good reviews
for line in good_reviews_df['line']:
    for i in male_df.index:
        if i == 'total':
            male_df.loc[i,'good_review'] += 1
        else:
            if re.findall(male_word_re,line,flags=re.IGNORECASE):
                male_df.loc[i,'good_review'] += 1
    
# parse male word from poor reviews
for line in poor_reviews_df['line']:
    for i in male_df.index:
        if i == 'total':
            male_df.loc[i,'poor_review'] += 1
        else:
            if re.findall(male_word_re,line,flags=re.IGNORECASE):
                male_df.loc[i,'poor_review'] += 1
                
male_df

Unnamed: 0,good_review,poor_review
male,12242,863
total,102188,12696


In [9]:
percentage = male_df.transpose()['male']/male_df.transpose()['total']
percentage

good_review    0.119799
poor_review    0.067974
dtype: float64

## Conduct Two Population Percentage Hypothesis Test

**H0: difference in male percentage of good reviews and poor reviews = 0**

In [10]:
diff = percentage[0]-percentage[1]
diff

0.05182463711632826

In [11]:
p_pool = (male_df.loc['male','good_review']+male_df.loc['male','poor_review']) \
        / (male_df.loc['total','good_review']+male_df.loc['total','poor_review'])
p_pool

0.11407158525120992

In [12]:
SE_diff = (p_pool*(1-p_pool)*(1/male_df.loc['total','good_review']+1/male_df.loc['total','poor_review']))**0.5
SE_diff

0.002991468414646907

In [13]:
Z = (0-diff) / SE_diff
Z

-17.324146517002518

In [14]:
p_value = st.norm.cdf(Z)
p_value

1.546246715865031e-67

## Conculsion

**If H0 is true, good reviews and poor reviews will have same male percentage; However, p_value = 1.469657362095804e-21, which reject H0. Therefore, toys purchased for male recipients (husbands, sons, etc.) tend to be much more likely to have good reviews.**

# Question 3
* Use **regular expressions to parse out all references to recipients and gift occassions**, and account for the possibility that people may spell words "son" / "children" / "Christmas" as both singular and plural, upper or lower-cased.

In [15]:
# regular expression to parse out all references to recipients and gift occassions
recipient_word = ['males?','females','sons?','boyfriends?','fathers?','mothers?''daddy','granddaddy']
recipient_word.extend([word for word in word_set if fuzz.ratio(r'girlfriend',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'husband',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'dad',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'mom',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'parent',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'child',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'children',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'kid',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'daughter',word) >= 89])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'grandpa',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'grandfather',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'grandda',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'grandma',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'grandparent',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'grandchild',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'grandson',word) >= 85])
recipient_word.extend([word for word in word_set if fuzz.ratio(r'granddaughter',word) >= 85])
recipient_word.remove('dead')
recipient_word.remove('arent')
recipient_word.remove('apparent')
recipient_word.remove('chil')
recipient_word.remove('kind')
recipient_word.remove('skid')

recipient_re = r'\b(' + '|'.join([word for word in set(recipient_word)]) + r')\b'
recipient_re

'\\b(gandson|childrem|children|daugher|girilfriend|grandchilds|grnddaughter|husbands|mothers?daddy|grandsond|husban|grandson|grandaugher|granddraughter|grandmama|granddaughter`s|grndson|dada|grandparenting|daughter`s|females|dadi|granddaugjter|gifriend|boyfriends?|grandma|grandsone|chldren|granddad|granddaug|hranddsughter|grandaon|gramdson|grandparents|chidlren|granddsughters|granddaugter|vgrandson|fathers?|grandad|mygrandson|granddaughter|daugter|granddaddy|grandparent|granddaugher|mygranddaughter|grandaugther|kidz|childrens|mom|daughther|granda|grands|moms|child|parent|dads|grabdson|gransson|gandsons|ourgranddaughters|momy|girlfriends|huband|parentsa|kidd|girlfriend|grandsons|grandman|granddaudhter|kido|grandddaughter|gradndaughter|grandspn|husband|kid|grandaught|daughtr|grandso|granpa|granddaugther|dad|granddaughhter|kids|parents|childs|parental|gradson|grandmas|grandfather|granddaoughters|daughters|grandchildren|daughter|grandaugter|granddaighter|granddaughteris|males?|grandchild|g

In [16]:
# counter
recipient_df = pd.DataFrame(columns=['good_word','poor_word']
                           , index=['recipient'])
recipient_df = recipient_df.fillna(0)

# parse from good reviews
for line in good_reviews_df['line']:
    recipient_df.loc['recipient','good_word'] += len(re.findall(recipient_re,line,flags=re.IGNORECASE))
    
# parse from poor reviews
for line in poor_reviews_df['line']:
    recipient_df.loc['recipient','poor_word'] += len(re.findall(recipient_re,line,flags=re.IGNORECASE))

recipient_df

Unnamed: 0,good_word,poor_word
recipient,40745,2960


In [17]:
occasion_df

Unnamed: 0,good_word,good_review,poor_word,poor_review
christmas,1285,1211,76,70
birthday,4004,3812,446,411
anniversary,55,53,5,4
valentine,23,23,0,0
thanksgiving,11,11,0,0
halloween,416,375,21,16
easter,90,82,4,4
total,2794429,102188,432911,12696


In [18]:
total = recipient_df.loc['recipient',:].values.sum() + occasion_df.loc[:,['good_word','poor_word']].values.sum()

In [19]:
total

3277481

In [20]:
# regular expression to parse "son" / "children" / "Christmas" as both singular and plural, upper or lower-cased
son_count = len(re.findall(r'\bsons?\b', good_reviews+poor_reviews, flags=re.IGNORECASE))
child_count = len(re.findall(r'\bchild(?:ren)?\b', good_reviews+poor_reviews, flags=re.IGNORECASE))
christmas_count = len(re.findall(r'\bchristmas(?:es)?\b', good_reviews+poor_reviews, flags=re.IGNORECASE))

**Answer: The possibility that people may spell words "son" / "children" / "Christmas" as both singular and plural, upper or lower-cased.**

In [21]:
# possibility for spelling son
son_count/total

0.002493073186389181

In [22]:
# possibility for spelling child
child_count/total

0.0014074833690874181

In [23]:
# possibility for spelling christmas
christmas_count/total

0.00038840804874231154

# Question 4
* Explain what some of pitfalls/limitations are of using only a word count analysis to make these inferences. What additional research/steps would you need to do to verify your conclusions?

**Answer: the biggest one is tokenization. When I tokenized the reviews, I made an assumption that every word has a word boundary like whitespace or punctuation. However I found in many cases due to typo the word boundary disappeared.**

**Also, using word boundary to tokenize isn't a very accurate way, since in some cases several words should be grouped as one token like 'mothers day'.**

**The possible next step will be looking for some powerful packages that can tokenize sentence better.**