In [82]:
import re
import numpy as np
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import words, stopwords
from nltk.corpus import brown
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from bs4 import BeautifulSoup
from collections import Counter
import requests
import os
import time
import pandas as pd

# Homework 6

## Evaluate text similarity of Amazon book search results by doing the following:

### Do a book search on Amazon. Manually copy the full book title (including subtitle) of each of the top 24 books listed in the first two pages of search results. 

### In Python, run one of the text-similarity measures covered in this course, e.g., cosine similarity. Compare each of the book titles, pairwise, to every other one. 

### Which two titles are the most similar to each other? Which are the most dissimilar? Where do they rank, among the first 24 results?

In [None]:
## Search == "Potato"

In [25]:
search_page = "https://www.amazon.com/s?k=potato&i=stripbooks&ref=nb_sb_noss_1"

s_html = requests.get(search_page).content

In [26]:
s_soup = BeautifulSoup(s_html, 'html.parser')

In [27]:
s_soup.find_all("p", attrs={'class' : "a-last"})[0].contents[0]

"Sorry, we just need to make sure you're not a robot. For best results, please make sure your browser is accepting cookies."

In [None]:
## Automated search did not work, Amazon did not allow it

In [29]:
titles = [
    "The Guernsey Literary and Potato Peel Pie Society",
    "The Enormous Potato",
    "Potato Pants!",
    "Sweet Potato Soul: 100 Easy Vegan Recipes for the Southern Flavors of Smoke, Sugar, Spice, and Soul"
    "Jamie O'Rourke and the Big Potato"
    "The Potato Reset: Weight Loss & Recipe Guide"
    "Mosh Potatoes: Recipes, Anecdotes, and Mayhem from the Heavyweights of Heavy Metal",
    "The Potato Hack: Weight Loss Simplified",
    "The Guernsey Literary and Potato Peel Pie Society (Random House Reader's Circle Deluxe Reading Group Edition): A Novel"
    "Potatoes Not Prozac: A Natural Seven-Step Plan to: Control Your Craving",
    "No Small Potatoes: Junius G. Groves and His Kingdom in Kansas",
    "The Sweet Potato Queens' Book of Love: A Fallen Southern Belle's Look at Love, Life, Men, Marriage, and Being Prepared"
    "One Potato, Two Potato, Dead (A Farm-to-Fork Mystery)",
    "Potatoes Make Me Happy: Blank Lined Journal - Funny Potato Gifts, Journals for Millenials",
    "Potato: A History of the Propitious Esculent",
    "The Potato Chip Puzzles: The Puzzling World of Winston Breen",
    "The History and Social Influence of the Potato (Cambridge Paperback Library)",
    "Smashed, Mashed, Boiled, and Baked--and Fried, Too!: A Celebration of Potatoes in 75 Irresistible Recipes",
    "Black Potatoes: The Story of the Great Irish Famine, 1845-1850",
    "small potatoes: mouthwatering musings of a misanthrope",
    "Williams-Sonoma Collection: Potato",
    "Zaydo Potato: A Muslim Superhero: Zaydo Potato: A Muslim Superhero (Zaydo Potato: (An Islamic Book Series))",
    "Mr. Crum's Potato Predicament",
    "The Potato Factory: The Australian Trilogy, Book 1",
    "The Sweet Potato Diet: The Super Carb-Cycling Program to Lose Up to 12 Pounds in 2 Weeks",
    "The Essential Mashed Potato Recipe Book: Mash Your Way to A Delicious Side Dish",
    "Super Potato's Galactic Breakout",
    "Four Funny Potatoes! (Hilarious Rhyming, Picture Book for Kids Ages 3-7)",
    "The Potato Chip Champ: Discovering Why Kindness Counts",
    "Math Potatoes: Mind-stretching Brain Food",
    "One Potato, Two Potato",
    ]

In [259]:
class CosSim:
    def __init__(self, all_text):
        corpus = set()
        self.titles = {}
        for text in all_text:
            self.titles[text] = Counter()
            for word in word_tokenize(text):
                if re.search('^[a-zA-z]', word):
                    corpus.add(word)
                    self.titles[text][word] += 1
        self.corpus = list(corpus)
        self.booknames = list(self.titles.keys())
        bookmtrx = []
        for book in self.booknames:
            bk = []
            for word in self.corpus:
                if word in self.titles[book]:
                    bk.append(self.titles[book][word])
                else:
                    bk.append(0)
            bookmtrx.append(bk)
        self.bookmatrix = np.matrix(bookmtrx)

    def compare(self):
        sim = []
        mx = self.bookmatrix
        for r1, book1 in enumerate(self.booknames):
            row = []
            for r2, book2 in enumerate(self.booknames):
                if book1 == book2:
                    row.append(1)
                    continue
                row.append((np.dot(mx[r1], mx[r1].T) / (mx[r1].sum() + mx[r2].sum())).sum())
            sim.append(row)
        return np.matrix(sim)

In [260]:
titles_comp = CosSim(titles)

In [268]:
cx = titles_comp.compare()

In [269]:
print(cx)

[[ 1.          0.72727273  0.8         0.17391304  0.57142857  0.22222222
   0.42105263  0.22857143  0.38095238  0.53333333  0.44444444  0.42105263
   0.34782609  0.47058824  0.53333333  0.72727273  0.33333333  0.66666667
   0.53333333  0.34782609  0.36363636  0.66666667  0.44444444  0.5
   0.61538462  0.66666667]
 [ 0.27272727  1.          0.6         0.07317073  0.33333333  0.09677419
   0.21428571  0.1         0.1875      0.3         0.23076923  0.21428571
   0.16666667  0.25        0.3         0.5         0.15789474  0.42857143
   0.3         0.16666667  0.17647059  0.42857143  0.23076923  0.27272727
   0.375       0.42857143]
 [ 0.2         0.4         1.          0.05        0.25        0.06666667
   0.15384615  0.06896552  0.13333333  0.22222222  0.16666667  0.15384615
   0.11764706  0.18181818  0.22222222  0.4         0.11111111  0.33333333
   0.22222222  0.11764706  0.125       0.33333333  0.16666667  0.2
   0.28571429  0.33333333]
 [ 1.2173913   1.36585366  1.4         1.    

In [270]:
row = int(cx.argmax() / cx.shape[0])
col = cx.argmax() % cx.shape[0]
print("(", row, ",", col, ")")

( 16 , 2 )


In [271]:
print(titles_comp.booknames[16])
print("AND")
print(titles_comp.booknames[2])

Zaydo Potato: A Muslim Superhero: Zaydo Potato: A Muslim Superhero (Zaydo Potato: (An Islamic Book Series))
AND
Potato Pants!


In [273]:
## The most commonly related books appear to be the shortest book, and the one that mentions one of the words in tnhe shortest book 3 times

## Now evaluate using a major search engine.

### Enter one of the book titles from question 1a into Google, Bing, or Yahoo!. Copy the capsule of the first organic result and the 20th organic result. Take web results only (i.e., not video results), and skip sponsored results. 

### Run the same text similarity calculation that you used for question 1b on each of these capsules in comparison to the original query (book title). 

### Which one has the highest similarity measure? 

### Submit all of your inputs and outputs and your code for this assignment, along with a brief written explanation of your findings. 

In [290]:
search_title = "The Guernsey Literary and Potato Peel Pie Society"

first_result = '''The Guernsey Literary and Potato Peel Pie Society - Wikipedia
https://en.wikipedia.org/wiki/The_Guernsey_Literary_and_Potato_Peel_Pie_Society
The Guernsey Literary and Potato Peel Pie Society is a historical novel by Mary Ann Shaffer and Annie Barrows that was published in 2008. The book is set in ..'''

twentieth_result = '''The Guernsey Literary and Potato Peel Pie Society on iTunes
https://itunes.apple.com/...guernsey-literary-and-potato-peel-pie-society/id136996714...
 Rating: 4.5 - ‎46 votes
Watch trailers, read customer and critic reviews and buy The Guernsey Literary and Potato Peel Pie Society directed by Mike Newell for £9.99.'''

all_results = [first_result, twentieth_result]

page_comp = CosSim(all_results)

page_comp

results = page_comp.compare()

results[results < 1].max()

0.89189189189189189

In [285]:
search_title = "The Enormous Potato"

first_result = '''The Enormous Potato: Aubrey Davis, Dusan Petricic: 0625816466934 ...
https://www.amazon.com/Enormous-Potato-Aubrey-Davis/dp/1550746693
The Enormous Potato [Aubrey Davis, Dusan Petricic] on Amazon.com. *FREE* shipping on qualifying offers. Folk tales give us hope. They show us that we can ...'''

twentieth_result = '''The Enormous Potato - Growing Minds
https://growing-minds.org/childrens-literature/the-enormous-potato/
One farmer can't pull this potato out of the ground. But can the farmer, his wive, daughter, dog, cat, and mouse get this spud uncovered?
'''

all_results = [first_result, twentieth_result]

page_comp = CosSim(all_results)

page_comp

results = page_comp.compare()

results[results < 1].max()

0.78787878787878785

In [285]:
search_title = "The Enormous Potato"

first_result = '''The Enormous Potato: Aubrey Davis, Dusan Petricic: 0625816466934 ...
https://www.amazon.com/Enormous-Potato-Aubrey-Davis/dp/1550746693
The Enormous Potato [Aubrey Davis, Dusan Petricic] on Amazon.com. *FREE* shipping on qualifying offers. Folk tales give us hope. They show us that we can ...'''

twentieth_result = '''The Enormous Potato - Growing Minds
https://growing-minds.org/childrens-literature/the-enormous-potato/
One farmer can't pull this potato out of the ground. But can the farmer, his wive, daughter, dog, cat, and mouse get this spud uncovered?
'''

all_results = [first_result, twentieth_result]

page_comp = CosSim(all_results)

page_comp

results = page_comp.compare()

results[results < 1].max()

0.78787878787878785

In [288]:
search_title = "Jamie O'Rourke and the Big Potato"

first_result = '''Jamie O'Rourke and the Big Potato: An Irish Folktale: Tomie dePaola ...
https://www.amazon.com/Jamie-ORourke-Big-Potato-Folktale/dp/0448450909
Jamie O'Rourke and the Big Potato: An Irish Folktale [Tomie dePaola] on Amazon.com. *FREE* shipping on qualifying offers. Jamie O?Rourke is the laziest ma'''

twentieth_result = '''Jamie O'Rourke and the Big Potato Book Review and Ratings by Kids https://www.dogobooks.com/jamie-orourke-and-the-big-potato/book.../039922257X
Feb 20, 1992 - After his wife is injured, Jamie O'Rourke, the laziest man in Ireland, must find a way to feed his family and luckily bumps into a leprechaun with'''

all_results = [first_result, twentieth_result]

page_comp = CosSim(all_results)

page_comp

results = page_comp.compare()

results[results < 1].max()

0.88607594936708856

In [287]:
search_title = "Sweet Potato Soul: 100 Easy Vegan Recipes for the Southern Flavors of Smoke, Sugar, Spice, and Soul"

first_result = '''Sweet Potato Soul: 100 Easy Vegan Recipes for the Southern Flavors ...
https://www.amazon.com/Sweet-Potato-Soul-Recipes-Southern/dp/0451498895
Sweet Potato Soul: 100 Easy Vegan Recipes for the Southern Flavors of Smoke, Sugar, Spice, and Soul [Jenne Claiborne] on Amazon.com. *FREE* shipping on'''

twentieth_result = '''Sweet Potato Soul - Claiborne, Jenne - 9780451498892 | HPB
https://www.hpb.com/products/sweet-potato-soul-9780451498892
Sweet Potato Soul: 100 Easy Vegan Recipes For The Southern Flavors Of Smoke, Sugar, Spice, And Soul. by Claiborne, Jenne'''

all_results = [first_result, twentieth_result]

page_comp = CosSim(all_results)

page_comp

results = page_comp.compare()

results[results < 1].max()

0.65573770491803274

In [289]:
search_title = "The Potato Reset: Weight Loss & Recipe Guide"

first_result = '''The Potato Reset: Weight Loss & Recipe Guide: Jeannine L Elder ...
https://www.amazon.com/Potato-Reset-Weight-Recipe-Guide/dp/198684076X
The Potato Reset: Weight Loss & Recipe Guide [Jeannine L Elder] on Amazon.com. *FREE* shipping on qualifying offers. Reset your tastebuds, lose weight ..'''

twentieth_result = '''The Potato Reset: Weight Loss & Recipe Guide: Jeannine L Elder ...
https://www.amazon.ca/Potato-Reset-Weight-Recipe-Guide/dp/198684076X
If you've been enticed by Penn Jillette's story of how he lost over 100 lbs starting with a potato-only diet but the idea of eating nothing but baked/boiled potatoes ...'''

all_results = [first_result, twentieth_result]

page_comp = CosSim(all_results)

page_comp

results = page_comp.compare()

results[results < 1].max()

0.82191780821917804

In [291]:
##The Guernsey Literary and Potato Peel Pie Society presented the highest coorealation between its 1st and 20th docu