# Which Grocery Store Should I Go to for East Asian Food? A Logistic Regression Model

In [None]:
import re  # regular expression
from random import randint
from time import sleep  # control the crawl rate to avoid hammering the servers with too many requests

import chromedriver_binary  # adds chromedriver binary to path
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer  # tokenizing the words
from sklearn.linear_model import LogisticRegression

In [None]:
def parsing_pages(list_of_pages, timeout=30):
    """Download each URL and parse it into a BeautifulSoup document.

    Args:
        list_of_pages: Iterable of page URLs. Entries for which
            ``pd.isnull`` is true (``None``/``NaN``) are not fetched.
        timeout: Seconds to wait for each HTTP response. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        A list aligned with ``list_of_pages``: a ``BeautifulSoup`` object
        for every fetched URL and ``None`` for every missing entry.
    """
    pages = []
    for url in list_of_pages:
        if pd.isnull(url):
            # Keep a placeholder so positions still match the input.
            pages.append(None)
            continue
        response = requests.get(url, timeout=timeout)
        pages.append(BeautifulSoup(response.content, "html.parser"))
        # Random pause between requests to keep the crawl rate polite.
        sleep(randint(2, 6))
    return pages

## Scraping H Mart Items for East Asian Food

In [None]:
# URLs of the paginated H Mart grocery listing (pages 1 through 15).
hmart = ['https://www.hmart.com/groceries?p=' + str(i) for i in range(1, 16)]
hmart_parsed = parsing_pages(hmart)

# finding item names from a list of beautifulsoup objects
key = 'product name product-item-name'
items = []
for page in hmart_parsed:
    if page is None:
        # parsing_pages leaves None for URLs it did not fetch.
        continue
    items_in_pages = page.find_all('strong', class_=key)
    for item in items_in_pages:  # cleaning up the messy text
        items_per_page = (
            item.get_text()
            .replace('\n', '')
            .replace('\r', '')
            .strip()
            .replace('     ', ' ')
            .lower()
        )
        items.append(items_per_page)

# extracting item names from the text: keep what follows the last
# eight-space run in each cleaned string
item_names = [i.split('        ')[-1] for i in items]

# extracting volumes at the end
regrex = re.compile(r'\d\..*')
item_volumns = [regrex.findall(i) for i in item_names]

# getting rid of the volume text; iterate over item_names here, because
# asian_items is only created further down, as the resulting DataFrame
volume_suffix = re.compile(r' \d\..*')
asian_items_no_volumns = [volume_suffix.sub('', name) for name in item_names]

columns = {
    'item_name': asian_items_no_volumns,
    'item_volumn': item_volumns,
    'type': 'asian'
}
asian_items = pd.DataFrame(data=columns)

## Scraping Walmart Items for Non-East Asian Food

In [None]:
## Walmart uses dynamic web pages, thus can't use BeautifulSoup
driver = webdriver.Chrome()
try:
    driver.get("https://www.walmart.com/browse/food/")

    list_of_items_from_pages = []
    for n in range(1, 26):
        driver.get("https://www.walmart.com/browse/food/?page=" + str(n))
        # find_elements(By..., ...) replaces the find_elements_by_* helpers,
        # which were removed in Selenium 4.
        items_from_pages = driver.find_elements(
            By.CSS_SELECTOR, ".product-title-link.line-clamp span")
        list_of_items_from_pages.extend(i.text for i in items_from_pages)
        # `sleep` is what the file imports (`from time import sleep`);
        # the bare name `time` is not defined.
        sleep(3)
finally:
    # Close this browser session; the Wegmans cell opens its own driver.
    driver.quit()


# getting rid of pack/count markers such as "(12 pack)" in each item
pack = re.compile(r'\(\d{1,} (?:pack|count|cans)\)')
list_of_items_from_pages = [pack.sub('', i.lower()) for i in list_of_items_from_pages]

# isolating volumes at the end
regrex = re.compile(r'(?:, \d{1,}|\d{1,} (?:mg|oz|fl|ct)|, \d{1,} (?:mg|oz|fl|ct)).*')
walmart_item_volumns = [regrex.findall(i) for i in list_of_items_from_pages]
# the name is the text before the first comma, with any volume suffix removed
walmart_item_names = [regrex.sub('', i.split(',')[0].strip()) for i in list_of_items_from_pages]

columns = {
    'item_name': walmart_item_names,
    'item_volumn': walmart_item_volumns,
    'type': 'non-asian'
}
walmart_items = pd.DataFrame(data=columns)

## Pre-processing the Data

In [None]:
## First, concat the two item tables together
# `hmart` is the list of H Mart URLs; the scraped H Mart items live in the
# `asian_items` DataFrame, which is what has to be stacked with Walmart's.
frames = [walmart_items, asian_items]
df = pd.concat(frames).reset_index(drop=True)

# separate training and test sets
from random import sample  # NOTE(review): not used in this cell; df.sample below is the pandas method
df_train = df.sample(frac=0.6, random_state=0)  # 60% of rows, fixed seed
df_test = df.drop(df_train.index)  # the remaining rows

# Learn the vocabulary from the training item names only.
count_vect = CountVectorizer(stop_words='english', max_df=0.85)
train_vect = count_vect.fit_transform(df_train.item_name)

### Tokenizing/vectorizing the Item Names

In [None]:
# Training set: fit the term-frequency transformer (IDF weighting disabled)
# on the training counts, then apply it to those same counts.
train_tf_transformer = TfidfTransformer(use_idf=False)
train_tf_transformer.fit(train_vect)
train_tf = train_tf_transformer.transform(train_vect)
train_tf.shape

In [None]:
# Test set: encode with the vocabulary already learned from the training
# items (transform only, no refit), then reuse the transformer fitted on
# the training counts so both splits share the same feature space.
test_vect = count_vect.transform(df_test['item_name'])
test_tf = train_tf_transformer.transform(test_vect)
test_tf.shape

## Building the Logistic Regression Model

In [None]:
# Features and labels for the two splits.
X_train, y_train = train_tf, df_train['type']
X_test, y_test = test_tf, df_test['type']

# Fit the classifier on the training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Score the fitted model on the held-out split.
log_reg.score(X_test, y_test)

## Comparing Wegmans and Harris Teeter

In [None]:
driver = webdriver.Chrome()

wegmans_international_items = []
driver.get("https://shop.wegmans.com/shop/categories/345?page=1")
# `sleep` is what the file imports (`from time import sleep`); the bare
# name `time` is not defined.
sleep(3)
# NOTE(review): judging by the element id this picks the in-store shopping
# option before browsing — confirm against the live page.
driver.find_element(
    By.CSS_SELECTOR, "#shopping-selector-shop-context-intent-instore").click()
sleep(3)
for n in range(1, 26):
    sleep(2)
    driver.get("https://shop.wegmans.com/shop/categories/345?page=" + str(n))
    sleep(3)
    # find_elements(By..., ...) replaces the find_elements_by_* helpers,
    # which were removed in Selenium 4.
    items = driver.find_elements(By.CSS_SELECTOR, ".cell-title")
    for i in items:
        items_extracted = i.text
        wegmans_international_items.append(items_extracted)
    sleep(randint(3, 10))

# Each scraped title is split on newlines: first line -> name, second line
# -> volume. A title with no second line gets None instead of raising
# IndexError.
wegmans_intl_item_names = []
wegmans_intl_item_volumns = []
for title_text in wegmans_international_items:
    title_lines = title_text.split('\n')
    wegmans_intl_item_names.append(title_lines[0])
    wegmans_intl_item_volumns.append(
        title_lines[1] if len(title_lines) > 1 else None)

columns = {
    'item_name': wegmans_intl_item_names,
    'item_volumn': wegmans_intl_item_volumns
}
wegmans_intl = pd.DataFrame(data=columns)

In [None]:
# Classify the Wegmans international-aisle items with the fitted model.
# The vectorizer and TF transformer fitted on the training data are reused
# (transform only) so the features match what log_reg was trained on.
intl_vect = count_vect.transform(wegmans_intl.item_name)
intl_tf = train_tf_transformer.transform(intl_vect)
y_intl = log_reg.predict(intl_tf)

# Share of items predicted as 'asian', rounded to two decimals.
rate = round(len(y_intl[y_intl == 'asian']) / len(y_intl), 2)
print('Proportion of East Asian Food: ', rate)

In [None]:
## Demo the identified food items
# Store the labels as a new 'type' column on the Wegmans table. The
# DataFrame is named `wegmans_intl`; `wegmand_intl` was never defined.
# NOTE(review): y_intl must hold one label per row of wegmans_intl —
# confirm it is defined before this cell runs.
wegmans_intl['type'] = y_intl