# Argos Search Results Notebook

## Imports

In [105]:
import requests
import json
from fake_useragent import UserAgent

In [106]:
import pandas as pd
import numpy as np

In [107]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selectorlib import Extractor
import time

In [108]:
from selenium.webdriver.support.ui import WebDriverWait

## Loading Pre-Documented Gender Stereotyped Items

In [109]:
# Load the pre-documented stereotyped-items table; the saved output shows BOY, GIRL and NEUTRAL columns.
# NOTE(review): this path points into one user's home directory, unlike the relative 'Data/...' path
# used for all_items further down.
stereo_toys = pd.read_csv(
    '~/Downloads/SI485/Data/predoc_stereotyped_items.csv',
    sep=',',
)
stereo_toys

Unnamed: 0,BOY,GIRL,NEUTRAL
0,vehicle toys,doll,toy animals
1,sport,domestic toys,books
2,military toys,educational art,educational teaching
3,race cars,clothes,musical games
4,outer space toys,dollhouses,games
...,...,...,...
67,toy rocket,barbie furniture set,
68,soccer ball,,
69,blue ipad,pink ipad,
70,toy robots,,


In [110]:
# Read the search terms from disk: the whole file is read and split into one entry per line.
with open('Data/all_items.txt') as items_file:
    all_items = items_file.read().splitlines()

In [111]:
# Number of search terms read from Data/all_items.txt.
len(all_items)

166

In [142]:
# Keep the first ten search terms as a small sample and display them.
trial = all_items[:10]
trial

['vehicle toys',
 'sport',
 'military toys',
 'race cars',
 'outer space toys',
 'depots',
 'machines',
 'doll-humanoid',
 'action figures',
 'gi joe action figure']

In [113]:
# Broad category terms. NOTE(review): nothing else in the visible notebook reads this list.
generic = ['toys', 'books', 'learning material', 'games', 'sports']

In [114]:
# Audience labels looped over when running queries; search() swaps 'neutral' for 'kids' when it builds the query.
gender = ['boys', 'girls', 'neutral']

## Search

### Collecting ASIN of Retrieved Products

In [145]:
def asin(driver):
    """Collect the ``aria-labelledby`` value of every matching link on the current page.

    The original body wrapped this work in ``for index in range(1, 10)`` but
    returned inside the first iteration, so the loop never ran more than once.
    The loop is removed here; the result is unchanged.

    Only the page currently loaded in ``driver`` is inspected; nothing here
    navigates to further result pages.

    Parameters
    ----------
    driver : Selenium WebDriver
        Driver that already has the search-results page loaded.

    Returns
    -------
    list
        One ``aria-labelledby`` attribute value per ``<a>`` element that
        carries that attribute, in the order the driver returned them.
        Empty when no element matches.
    """
    # Kept from the original: sets the driver-wide implicit wait before the lookup.
    driver.implicitly_wait(120)
    anchors = driver.find_elements('xpath', '//a[@aria-labelledby]')
    # Loop variable deliberately not named `asin`, to avoid shadowing this function's name.
    return [anchor.get_attribute('aria-labelledby') for anchor in anchors]

### Get Title Information of Retrieved Products

In [127]:
from selenium.webdriver.common.by import By
def item_info(driver):
    """Collect the visible text of every matching product link on the current page.

    The original body wrapped this work in ``for index in range(1, 10)`` but
    returned inside the first iteration, so the loop never ran more than once.
    The loop is removed here; the result is unchanged.

    Parameters
    ----------
    driver : Selenium WebDriver
        Driver that already has the search-results page loaded.

    Returns
    -------
    list
        The ``.text`` of each ``<a>`` element that has a ``<meta>`` child
        carrying an ``itemprop`` attribute, in the order the driver returned
        them. Empty when no element matches.
    """
    anchors = driver.find_elements('xpath', '//a[meta/@itemprop]')
    return [anchor.text for anchor in anchors]

### Collect Product Link of Retrieved Products

In [128]:
def item_link(driver):
    """Return the ``href`` of each product-card link on the page loaded in ``driver``.

    Links are the ``<a>`` elements whose ``data-test`` attribute equals
    ``component-product-card-link``; the result keeps the driver's ordering.
    """
    cards = driver.find_elements('xpath', "//a[@data-test = 'component-product-card-link']")
    return [card.get_attribute('href') for card in cards]

## Running Queries for Boys, Girls, and Neutral

In [146]:
def search(item, who, retries=3):
    """Run one Argos search and scrape the results page.

    Uses the module-level ``driver`` plus the ``asin``, ``item_info`` and
    ``item_link`` helpers defined in this notebook.

    The saved run of this notebook aborted with
    ``StaleElementReferenceException``: an element located by one of the
    helpers was no longer attached to the document when it was read
    (presumably because the page re-rendered in between; confirm against the
    site). The scrape is therefore retried a bounded number of times before
    the exception is allowed to propagate.

    Parameters
    ----------
    item : str
        Search term.
    who : str
        Audience label. ``'neutral'`` is queried as ``'kids'``; any other
        value is used verbatim.
    retries : int, optional
        How many additional scrape attempts to make after a stale-element
        failure. Negative values are treated as 0.

    Returns
    -------
    tuple
        ``((list_asin, item_list), item_page)``: the ``asin`` and
        ``item_info`` results paired together, followed by the ``item_link``
        result.

    Raises
    ------
    StaleElementReferenceException
        If every attempt hits a stale element.
    """
    # Local import keeps this cell self-contained; selenium is already a notebook dependency.
    from selenium.common.exceptions import StaleElementReferenceException

    audience = 'kids' if who == 'neutral' else who
    query = item + '-for-' + audience
    driver.get(f'https://www.argos.co.uk/search/{query}/?clickOrigin=searchbar:home:term:{query}')
    # NOTE(review): with a 120 s implicit wait, a lookup that matches nothing may block for the
    # full timeout before returning an empty list; confirm this is intended.
    driver.implicitly_wait(120)

    for attempts_left in range(max(retries, 0), -1, -1):
        try:
            # Same helper call order as before: asin, item_info, then item_link.
            return (asin(driver), item_info(driver)), item_link(driver)
        except StaleElementReferenceException:
            if attempts_left == 0:
                raise
            # Short pause before looking the elements up again.
            time.sleep(0.5)

In [130]:
# Column layouts for the two result tables: scraped result data vs. product links.
columns1 = ['gender', 'query', 'result']
columns2 = ['gender', 'query', 'href']

# Empty frames with those layouts; later cells add the collected rows to them.
qr = pd.DataFrame(columns=columns1)
qr_link = pd.DataFrame(columns=columns2)

In [131]:
# NOTE(review): the saved output below lists different items than the `all_items[:10]` slice assigned to
# `trial` earlier in the notebook, so it appears to come from a different kernel state; re-run to confirm.
trial

['legos', 'scooter', 'drum set', 'puzzles', 'board games', 'rock painting']

## Running Queries in a Loop

In [147]:
import warnings
from selenium.webdriver.chrome.service import Service

# NOTE(review): this silences every warning for the rest of the kernel session, not only Selenium's.
warnings.filterwarnings('ignore')

# Selenium 4 takes the chromedriver path through a Service object; passing the path
# positionally is deprecated and no longer accepted by newer Selenium releases.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# One record per (item, gender) query: data1 holds the scraped results, data2 the product links.
data1 = []
data2 = []
try:
    for item in all_items:
        for g in gender:
            result, link = search(item, g)
            data1.append(dict(zip(columns1, [g, item, result])))
            data2.append(dict(zip(columns2, [g, item, link])))
            # Pause between queries, as in the original loop.
            time.sleep(1.5)
finally:
    # Always shut the browser session down, even when a query raises part-way through;
    # the rows gathered so far remain available in data1 / data2.
    driver.quit()

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=109.0.5414.119)
Stacktrace:
0   chromedriver                        0x0000000105a5cfa8 chromedriver + 4886440
1   chromedriver                        0x00000001059da643 chromedriver + 4351555
2   chromedriver                        0x0000000105628b27 chromedriver + 477991
3   chromedriver                        0x000000010562cf78 chromedriver + 495480
4   chromedriver                        0x000000010562cc46 chromedriver + 494662
5   chromedriver                        0x000000010562db9e chromedriver + 498590
6   chromedriver                        0x00000001056af35f chromedriver + 1028959
7   chromedriver                        0x00000001056935d2 chromedriver + 914898
8   chromedriver                        0x00000001056ae5fe chromedriver + 1025534
9   chromedriver                        0x00000001056933a3 chromedriver + 914339
10  chromedriver                        0x000000010565d57f chromedriver + 693631
11  chromedriver                        0x000000010565eb1e chromedriver + 699166
12  chromedriver                        0x0000000105a29b9e chromedriver + 4676510
13  chromedriver                        0x0000000105a2e91e chromedriver + 4696350
14  chromedriver                        0x0000000105a3619f chromedriver + 4727199
15  chromedriver                        0x0000000105a2f81a chromedriver + 4700186
16  chromedriver                        0x0000000105a02a62 chromedriver + 4516450
17  chromedriver                        0x0000000105a4e8c8 chromedriver + 4827336
18  chromedriver                        0x0000000105a4ea45 chromedriver + 4827717
19  chromedriver                        0x0000000105a647ef chromedriver + 4917231
20  libsystem_pthread.dylib             0x00007ff812938259 _pthread_start + 125
21  libsystem_pthread.dylib             0x00007ff812933c7b thread_start + 15


In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True is the equivalent.
# NOTE(review): re-running this cell adds the same rows again, because qr is extended in place.
qr = pd.concat([qr, pd.DataFrame(data1)], ignore_index=True)
qr

In [None]:
# Inspect the scraped result stored for the first query (row label 0).
qr.loc[0, 'result']

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True is the equivalent.
# NOTE(review): re-running this cell adds the same rows again, because qr_link is extended in place.
qr_link = pd.concat([qr_link, pd.DataFrame(data2)], ignore_index=True)
qr_link

In [None]:
# Inspect the product links stored for the second query (row label 1).
qr_link.loc[1, 'href']

In [None]:
# Start from an empty frame; the following cell adds the rows of `qr` to it.
first20 = pd.DataFrame()

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True is the equivalent.
first20 = pd.concat([first20, qr], ignore_index=True)

In [None]:
# Display the combined results frame.
first20

In [None]:
# Length of the second element of the first row's `result` pair
# (search() puts the item_info list in that position).
len(first20.loc[0, 'result'][1])

In [None]:
# Write the collected results to a CSV file in the working directory, without the index column.
# NOTE(review): `result` holds nested Python objects, so it is presumably written as their string form;
# confirm the file can be parsed back as needed.
first20.to_csv('argos_search_results.csv', index = False)