# Which Grocery Store Should I Go to for East Asian Food? A Logistic Regression Model

In [1]:
import re # regular expression
from random import randint
from time import sleep # control the crawl rate to avoid hammering the servers with too many requests

import chromedriver_binary # adds chromedriver binary to path
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By # locator strategies for find_element / find_elements
from selenium.webdriver.common.keys import Keys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer #tokenizing the words
from sklearn.linear_model import LogisticRegression

In [2]:
def parsing_pages(list_of_pages, timeout=30):
    """Download each URL and parse the response body with BeautifulSoup.

    Parameters
    ----------
    list_of_pages : iterable
        URLs to fetch. Entries that are null according to ``pd.notnull``
        (``None``/``NaN``) are not requested.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``, so a server
        that never answers cannot stall the whole crawl.

    Returns
    -------
    list
        One entry per input, in input order: a ``BeautifulSoup`` object for
        every fetched URL and ``None`` for every null entry.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    pages = []
    for page in list_of_pages:
        if pd.notnull(page):
            response = requests.get(page, timeout=timeout)
            pages.append(BeautifulSoup(response.content, "html.parser"))
            # random pause between requests to keep the crawl rate low
            sleep(randint(2,6))
        else:
            # keep a placeholder so the output stays aligned with the input
            pages.append(None)
    return pages

## Scraping H Mart Items for East Asian Food

In [4]:
# Fetch and parse grocery listing pages 1-15 from H Mart.
hmart = ['https://www.hmart.com/groceries?p=' + str(i) for i in range(1,16)]
hmart_parsed = parsing_pages(hmart)

# finding item names from a list of beautifulsoup objects
key = 'product name product-item-name'
items = []
for page in hmart_parsed:
    if page is None:
        # parsing_pages yields None for null URLs; there is nothing to search
        continue
    for item in page.find_all('strong', class_=key):
        # cleaning up the messy text: drop line breaks, trim, collapse 5-space runs, lowercase
        items.append(
            item.get_text().replace('\n','').replace('\r','').strip().replace('     ', ' ').lower()
        )

# extracting item names from the text: split on a run of eight spaces and take
# the first piece as the brand and the last piece as the item name
# (when the separator is absent both pieces are the whole string)
brands = [i.split('        ')[0] for i in items]
item_names = [i.split('        ')[-1] for i in items]

# extracting volumes at the end: everything from the first "<digit>." onwards
regrex = re.compile(r'\d\..*')
item_volumns = [regrex.findall(i) for i in item_names]

## asian items with brand names
asian_items = [brand + ' ' + name for brand, name in zip(brands, item_names)]

# getting rid of texts about volumes; raw string so the regex escapes are not
# treated as (invalid) Python string escapes
volume_suffix = re.compile(r' \d\..*')
asian_items_no_volumns = [volume_suffix.sub('', i) for i in asian_items]

# every H Mart row is labelled 'asian' for the classifier
columns = {
    'item_name': asian_items_no_volumns,
    'item_volumn': item_volumns,
    'type': 'asian'
}
asian_items = pd.DataFrame(data = columns)

In [40]:
# Row count of the H Mart frame, i.e. how many 'asian'-labelled examples were scraped.
len(asian_items)

705

## Scraping Walmart Items for Non-East Asian Food

In [6]:
## Walmart uses dynamic web pages, thus can't use BeautifulSoup
driver = webdriver.Chrome()
driver.get("https://www.walmart.com/browse/food/")

# Collect the product-title text from food listing pages 1-25.
list_of_items_from_pages = []
for n in range(1,26):
    driver.get("https://www.walmart.com/browse/food/?page=" + str(n))
    # find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector,
    # which newer Selenium releases no longer provide
    items_from_pages = driver.find_elements(By.CSS_SELECTOR, ".product-title-link.line-clamp span")
    list_of_items_from_pages.extend(i.text for i in items_from_pages)
    sleep(3)


# getting rid of "(N pack|count|cans)" markers wherever they appear in the lowercased item text
pack = re.compile(r'\(\d{1,} (?:pack|count|cans)\)')
list_of_items_from_pages = [pack.sub('', i.lower()) for i in list_of_items_from_pages]

# isolating volumes at the end
regrex = re.compile(r'(?:, \d{1,}|\d{1,} (?:mg|oz|fl|ct)|, \d{1,} (?:mg|oz|fl|ct)).*')
walmart_item_volumns = [regrex.findall(i) for i in list_of_items_from_pages]
# the name is the text before the first comma, with any volume text stripped
walmart_item_names = [regrex.sub('', i.split(',')[0].strip()) for i in list_of_items_from_pages]

# every Walmart row is labelled 'non-asian' for the classifier
columns = {
    'item_name': walmart_item_names,
    'item_volumn': walmart_item_volumns,
    'type': 'non-asian'
}
walmart_items = pd.DataFrame(data = columns)

## Pre-processing the Data

In [8]:
## First, stack the Walmart and H Mart frames under one fresh 0..n-1 index
frames = [walmart_items, asian_items]
df = pd.concat(frames, ignore_index=True)

# separate training and test sets: a seeded 60% sample trains, the remaining rows test
from random import sample  # kept from the original cell; not used in this cell
df_train = df.sample(frac=0.6, random_state=0)
held_out = ~df.index.isin(df_train.index)
df_test = df[held_out]

# bag-of-words counts fitted on the training names only
# (English stop words removed, terms in more than 85% of names dropped)
count_vect = CountVectorizer(stop_words='english', max_df=0.85)
train_vect = count_vect.fit_transform(df_train['item_name'])

### Tokenizing/vectorizing the Item Names

In [9]:
# training set: fit the term-frequency scaler (IDF weighting switched off) on the
# training counts and keep the fitted transformer for the other item lists
train_tf_transformer = TfidfTransformer(use_idf=False)
train_tf = train_tf_transformer.fit_transform(train_vect)
train_tf.shape

(1019, 1757)

In [10]:
# test set: reuse the vocabulary and the transformer fitted on the training rows
# (transform only, no refit), so train and test share the same columns
test_vect = count_vect.transform(df_test['item_name'])
test_tf = train_tf_transformer.transform(test_vect)
test_tf.shape

(679, 1757)

## Building the Logistic Regression Model

In [11]:
# Features are the term-frequency matrices; labels are the 'type' column
# ('asian' / 'non-asian') of the matching rows.
X_train, y_train = train_tf, df_train['type']
X_test, y_test = test_tf, df_test['type']

log_reg = LogisticRegression().fit(X_train, y_train)

# score on the held-out rows
log_reg.score(X_test, y_test)

0.9337260677466863

## Comparing Wegmans and Harris Teeter

Wegmans

In [14]:
## Wegmans
# Reuses the Chrome `driver` opened in the Walmart cell.
# driver = webdriver.Chrome()

wegmans_international_items = []
driver.get("https://shop.wegmans.com/shop/categories/345?page=1")
sleep(3)
# click the in-store shopping option before browsing the category pages
driver.find_element(By.CSS_SELECTOR, "#shopping-selector-shop-context-intent-instore").click()
sleep(3)
for n in range(1,26):
    sleep(2)
    driver.get("https://shop.wegmans.com/shop/categories/345?page=" + str(n))
    sleep(3)
    # find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector,
    # which newer Selenium releases no longer provide
    title_cells = driver.find_elements(By.CSS_SELECTOR, ".cell-title")
    wegmans_international_items.extend(cell.text for cell in title_cells)
    sleep(randint(3,10))

# each tile's text is "<name>\n<volume>"; a tile without a second line gets a
# missing volume instead of raising IndexError
wegmans_intl_item_names = [i.split('\n')[0] for i in wegmans_international_items]
wegmans_intl_item_volumns = [
    i.split('\n')[1] if '\n' in i else None
    for i in wegmans_international_items
]

columns = {
    'item_name': wegmans_intl_item_names,
    'item_volumn': wegmans_intl_item_volumns
}
wegmans_intl = pd.DataFrame(data = columns)

In [16]:
# Vectorize the Wegmans names with the vocabulary and transformer fitted on the
# training set, then label every item with the trained classifier.
wegmans_intl_vect = count_vect.transform(wegmans_intl['item_name'])
wegmans_intl_tf = train_tf_transformer.transform(wegmans_intl_vect)

X_intl = wegmans_intl_tf
y_intl = log_reg.predict(X_intl)

In [20]:
# Count the predictions labelled 'asian' once and reuse the figure for both lines.
n_asian_intl = len(y_intl[y_intl == 'asian'])
rate = round(n_asian_intl / len(y_intl), 2)
print('Proportion of East Asian food:', rate)
print('Numbers of East Asian food items:', n_asian_intl)

Proportion of East Asian food: 0.37
Numbers of East Asian food items: 133


In [26]:
## Demo the identified food items
# attach the predicted label to each Wegmans row, then show the first 12 'asian' rows
wegmans_intl['type'] = y_intl
wegmans_intl.loc[wegmans_intl['type'] == 'asian'].head(12)

Unnamed: 0,item_name,item_volumn,type
10,"Wegmans Organic Salsa, Medium",15.5 ounce,asian
12,Wegmans Organic Mild Salsa,15.5 ounce,asian
19,"Kikkoman Soy Sauce, Less Sodium",10 fl. oz.,asian
26,"Huy Fong Chili Sauce, Hot, Sriracha",17 ounce,asian
27,Wegmans Reduced Sodium Soy Sauce,11.4 fl. oz.,asian
28,Wegmans Organic Sesame Garlic Sauce,14 ounce,asian
35,Wegmans Organic Teriyaki Sauce,14 ounce,asian
36,"Old El Paso Enchilada Sauce, Red, Mild",10 ounce,asian
40,Wegmans Organic Black Bean Dip,16.5 ounce,asian
42,Wegmans Diced Green Chile Peppers,7 ounce,asian


Harris Teeter

In [27]:
def scraping_ht_pages(url, last_page_number):
    """Collect the text of every ``.product-name`` element from a Harris Teeter listing.

    Uses the module-level Selenium ``driver``.

    Parameters
    ----------
    url : str
        When ``last_page_number`` is 1 this is the complete page URL.
        Otherwise it is a prefix to which the page number and
        ``"&appliedSort=Brand"`` are appended for each page.
    last_page_number : int
        Number of the last page to visit; pages ``1..last_page_number`` are
        scraped. Values below 1 visit nothing.

    Returns
    -------
    list of str
        Product-name texts in page order.
    """
    if last_page_number == 1:
        page_urls = [url]
    else:
        page_urls = [
            url + str(n) + "&appliedSort=Brand"
            for n in range(1, last_page_number+1)
        ]

    list_of_items_ht = []
    for page_url in page_urls:
        driver.get(page_url)
        # pause before reading the page
        sleep(randint(2,6))
        # find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_css_selector,
        # which newer Selenium releases no longer provide
        elements = driver.find_elements(By.CSS_SELECTOR, ".product-name")
        list_of_items_ht.extend(element.text for element in elements)
        # pause again before the next request
        sleep(randint(2,6))
    return list_of_items_ht

In [32]:
## different pages of Asian food
# Each sub-category is scraped with scraping_ht_pages(url, last_page_number).
# A page count of 1 fetches the URL exactly as written; a larger count treats the
# URL (ending in "pageNo=") as a prefix and appends the page number plus
# "&appliedSort=Brand" for every page.

# canned goods: single page
canned_url = "https://www.harristeeter.com/shop/store/332/category/583/subCategory/713,1557/products?isSpecialSubCategory=false"
ht_canned = scraping_ht_pages(canned_url, 1)

# noodles: pages 1-2
noodles_url = "https://www.harristeeter.com/shop/store/332/category/583/subCategory/713,976/products?pageNo="
ht_noodles = scraping_ht_pages(noodles_url, 2)

# other items: single page
others_url = "https://www.harristeeter.com/shop/store/332/category/583/subCategory/713,1558/products?isSpecialSubCategory=false"
ht_other = scraping_ht_pages(others_url, 1)

# sauces: pages 1-3
sauces_url = "https://www.harristeeter.com/shop/store/332/category/583/subCategory/713,912/products?pageNo="
ht_sauces = scraping_ht_pages(sauces_url, 3)

# seasoning: single page
seasoning_url = "https://www.harristeeter.com/shop/store/332/category/583/subCategory/713,1231/products?isSpecialSubCategory=false"
ht_seasoning = scraping_ht_pages(seasoning_url, 1)

# ramen: pages 1-2
ramen_url = "https://www.harristeeter.com/shop/store/332/category/583/subCategory/713,1164/products?pageNo="
ht_ramen = scraping_ht_pages(ramen_url, 2)

In [38]:
# Flatten the six per-category name lists into one list, in the same category order.
harris_teeter_asian = [
    *ht_canned,
    *ht_noodles,
    *ht_other,
    *ht_sauces,
    *ht_seasoning,
    *ht_ramen,
]
ht_asian = pd.Series(harris_teeter_asian)

In [34]:
# Vectorize the Harris Teeter names with the vocabulary and transformer that were
# fitted on the training set (transform only, no refit).
ht_asian_vect = count_vect.transform(harris_teeter_asian)
ht_asian_tf = train_tf_transformer.transform(ht_asian_vect)

In [35]:
# Label every Harris Teeter item with the trained classifier.
X_ht = ht_asian_tf
y_ht = log_reg.predict(X_ht)

In [37]:
# Count the predictions labelled 'asian' once and reuse the figure for both lines.
n_asian_ht = len(y_ht[y_ht == 'asian'])
rate_ht = round(n_asian_ht / len(y_ht), 2)
print('Proportion of East Asian food:', rate_ht)
print('Numbers of East Asian food items:', n_asian_ht)

Proportion of East Asian food: 0.65
Numbers of East Asian food items: 88


In [39]:
## demo the East Asian food items
# Build the table from the raw name list rather than from `ht_asian`: this cell
# rebinds `ht_asian` to the DataFrame, so reading it as the name column would
# break when the cell is run a second time.
frames = {
    'item_name': harris_teeter_asian,
    'type': y_ht
}
ht_asian = pd.DataFrame(data=frames)
# first 10 items the classifier labelled 'asian'
ht_asian[ht_asian['type'] == 'asian'][:10]

Unnamed: 0,item_name,type
7,Sharwood's Curry Cooking,asian
9,Annie Chuns Maifun Brown Rice Noodles,asian
10,Annie Chuns Noodle Soup - Miso with Tofu and S...,asian
16,Hakubaku Organic Ramen Noodles,asian
17,KA ME Rice Sticks,asian
19,Ka-Me Pad Thai Express Rice Noodles,asian
22,Ka-Me Thai Rice Noodles - Stir Fry,asian
24,La Choy Rice Noodles,asian
25,"Maruchan Ramen Noodle Soup, Gold, Spicy Miso F...",asian
28,"Tasty Bite Rice, Organic, Smoky Chipotle",asian


Comparing the two supermarkets' selections, Wegmans has more items classified as East Asian (133) than Harris Teeter (88). 
However, East Asian items make up a larger share of the Harris Teeter items scraped (0.65 vs. 0.37 at Wegmans), so I'll visit Harris Teeter first because it'll be more efficient.