# Argos Search Results Notebook

## Installations

Importing all necessary modules to run this notebook. Ensure selenium, selectorlib, and fake-useragent have been installed prior to running this notebook.

In [1]:
import requests
import json
from fake_useragent import UserAgent

In [2]:
import pandas as pd
import numpy as np

In [3]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selectorlib import Extractor
import time

## Loading Pre-Documented Gender Stereotyped Toys

Taking in predoc_stereotyped_items.csv to use for later analysis. This CSV file contains 72 rows.

In [5]:
stereo_toys = pd.read_csv('~/Desktop/SI 485/GitHub Repo/predoc_info/predoc_stereotyped_items.csv', delimiter=',')
stereo_toys

Unnamed: 0,BOY,GIRL,NEUTRAL
0,vehicle toys,doll,toy animals
1,sport,domestic toys,books
2,military toys,educational art,educational teaching
3,race cars,clothes,musical games
4,outer space toys,dollhouses,games
...,...,...,...
67,toy rocket,barbie furniture set,
68,soccer ball,,
69,blue ipad,pink ipad,
70,toy robots,,


## Loading List of Toys Collected from Previous Research

all_items.txt contains a list of strings, where each string represents a toy that will be searched on Argos. This text file contains 166 rows.

In [6]:
with open('all_items.txt') as f:
    all_items = f.read().splitlines()

In [7]:
len(all_items)

166

trial is a list of 6 toys from all_items. trial was used to test if the code works on a small sample of all_items rather than running the full iteration every time.

In [8]:
trial = all_items[160:]
trial

['legos', 'scooter', 'drum set', 'puzzles', 'board games', 'rock painting']

In [9]:
generic = ['toys', 'books', 'learning material', 'games', 'sports']

In [10]:
gender = ['boys', 'girls', 'neutral']

## Search

### Collecting EAN of Retrieved Products

Made use of inspect element on Argos' webpage to determine the xpath

In [11]:
def asin(driver):
    asin_list = []
    for index in range(1, 10):
        asins = driver.find_elements('xpath','//a[@aria-labelledby]')
        for asin in asins:
            #time.sleep(0.4)
            asin_list.append(asin.get_attribute('aria-labelledby'))
    return asin_list

### Get Title Information of Retrieved Products

In [12]:
from selenium.webdriver.common.by import By
def item_info(driver):
    item = []
    elem = driver.find_elements('xpath', "//a[meta/@itemprop]")
    for i in elem:
        #time.sleep(0.4)
        item.append(i.text)
    return item

### Collect Product Link of Retrieved Products

In [13]:
def item_link(driver):
    href = []
    links = driver.find_elements('xpath', "//a[@data-test = 'component-product-card-link']")
    for link in links:
        #time.sleep(0.4)
        href.append(link.get_attribute('href'))
    return href

## Preparing Search Function

This function is used to scrape all necessary information for each toy in all_items.

In [20]:
def search(item, who):
    if who == 'neutral':
        query = item + '-for-' + 'kids'
    else:
        query = item + '-for-' + who
    driver.get(f'https://www.argos.co.uk/search/{query}/?clickOrigin=searchbar:home:term:{query}')
    time.sleep(15)
    list_asin = asin(driver)
    item_list = item_info(driver)
    item_page = item_link(driver)
    return (list_asin, item_list), item_page

Initializing databases for scraped data.

In [21]:
columns1 = ['gender', 'query', 'result']
qr = pd.DataFrame(columns=columns1)
columns2 = ['gender', 'query', 'href']
qr_link = pd.DataFrame(columns=columns2)

In [22]:
trial

['legos', 'scooter', 'drum set', 'puzzles', 'board games', 'rock painting']

## Running Queries for Boys, Girls, and Neutral

This code is used to scrape all data from toys included in all_items. As of right now, trial is used on line 7 in order to run the code on a small sample, but changing this with all_items will run it on all toys.

In [23]:
import warnings
warnings.filterwarnings('ignore')
driver = webdriver.Chrome(ChromeDriverManager().install())
data1 = []
data2 = []
item = ''
for item in all_items:
    for g in gender:
        result, link = search(item, g)
        values1 = [g, item, result]
        values2 = [g, item, link]
        zipped1 = zip(columns1, values1)
        zipped2 = zip(columns2, values2)
        a_dictionary1 = dict(zipped1)
        a_dictionary2 = dict(zipped2)
        time.sleep(15)
        data1.append(a_dictionary1)
        data2.append(a_dictionary2)
driver.close()

Appending data to previously initialized dataframe.

In [24]:
qr = qr.append(data1, True)
qr

Unnamed: 0,gender,query,result
0,boys,vehicle toys,"([product-title-1403565, product-title-1403668..."
1,girls,vehicle toys,"([product-title-1403565, product-title-1403668..."
2,neutral,vehicle toys,"([product-title-1403565, product-title-1403668..."
3,boys,sport,"([product-title-7655915, product-title-9506512..."
4,girls,sport,"([product-title-8869607, product-title-1353213..."
...,...,...,...
493,girls,board games,"([product-title-3904567, product-title-3907306..."
494,neutral,board games,"([product-title-2687421, product-title-9452123..."
495,boys,rock painting,"([product-title-9493113, product-title-tuc1414..."
496,girls,rock painting,"([product-title-9493113, product-title-9493113..."


In [25]:
qr['result'][4][0]

['product-title-8869607',
 'product-title-1353213',
 'product-title-8251813',
 'product-title-tuc141678763',
 'product-title-tuc141503761',
 'product-title-tuc141492571',
 'product-title-tuc141981990',
 'product-title-tuc141974466',
 'product-title-tuc141710268',
 'product-title-tuc141670825',
 'product-title-tuc141317829',
 'product-title-tuc141072999',
 'product-title-tuc139876963',
 'product-title-tuc139876841',
 'product-title-tuc140490697',
 'product-title-tuc139876576',
 'product-title-tuc139875872',
 'product-title-tuc141494841',
 'product-title-tuc142535924',
 'product-title-tuc142533339',
 'product-title-tuc142532328',
 'product-title-tuc141982159',
 'product-title-tuc141946035',
 'product-title-tuc141948102',
 'product-title-tuc141946051',
 'product-title-tuc141948305',
 'product-title-tuc141946887',
 'product-title-tuc141946799',
 'product-title-tuc141946684',
 'product-title-tuc141948170',
 'product-title-8869607',
 'product-title-1353213',
 'product-title-8251813',
 'produ

In [26]:
qr['result'][4][1]

['',
 'Tikkers Girls Multicolor Silicone Strap Fitness Tracker Set',
 '',
 'Raleigh Girls Leisure Bike Helmet - Pink, 48-54cm',
 '',
 'Piranha Harlem 20 inch Wheel Size Girls Kids Mountain Bike',
 '',
 'Disney Moana Hooded Towel Poncho - One Size',
 '',
 'Floral & Spot Swimsuits 2 Pack',
 '',
 'Green Floral All-In-One Swimsuit',
 '',
 "Kids' Family Tropical Leaf Swimsuit",
 '',
 'Pink Star Towelling Throw-On',
 '',
 "Kids' Family Navy Abstract Swimsuit",
 '',
 'Disney Moana Pink Swimsuit',
 '',
 'Black First Period Swimsuit',
 '',
 'Navy Floral Print Swimsuit',
 '',
 "Kids' Black Wetsuit",
 '',
 'Navy & Red Long Leg Wetsuit',
 '',
 'L.O.L Surprise! Pink Costume',
 '',
 'Navy & Pink Long Wetsuit',
 '',
 'Black & Pink Short Wetsuit',
 '',
 'Navy Colour Block Swimsuit',
 '',
 'FATFACE Blue Sea Scape Tankini Set',
 '',
 'FATFACE Floral Tankini Set',
 '',
 'FATFACE Floral Swimsuit',
 '',
 'Disney Princess Ariel Blue Frill Swimsuit',
 '',
 'Tropical Print Swim Shorts',
 '',
 'Pink Abstract F

In [27]:
qr_link = qr_link.append(data2, True)
qr_link

Unnamed: 0,gender,query,href
0,boys,vehicle toys,[https://www.argos.co.uk/product/1403565?click...
1,girls,vehicle toys,[https://www.argos.co.uk/product/1403565?click...
2,neutral,vehicle toys,[https://www.argos.co.uk/product/1403565?click...
3,boys,sport,[https://www.argos.co.uk/product/7655915?click...
4,girls,sport,[https://www.argos.co.uk/product/8869607?click...
...,...,...,...
493,girls,board games,[https://www.argos.co.uk/product/3904567?click...
494,neutral,board games,[https://www.argos.co.uk/product/2687421?click...
495,boys,rock painting,[https://www.argos.co.uk/product/9493113?click...
496,girls,rock painting,[https://www.argos.co.uk/product/9493113?click...


In [28]:
qr_link.href[1]

['https://www.argos.co.uk/product/1403565?clickSR=slp:term:vehicle%20toys%20for%20girls:1:303:1',
 'https://www.argos.co.uk/product/1403668?clickSR=slp:term:vehicle%20toys%20for%20girls:2:303:1',
 'https://www.argos.co.uk/product/1403709?clickSR=slp:term:vehicle%20toys%20for%20girls:3:303:1',
 'https://www.argos.co.uk/product/9480250?clickSR=slp:term:vehicle%20toys%20for%20girls:4:303:1',
 'https://www.argos.co.uk/product/7355703?clickSR=slp:term:vehicle%20toys%20for%20girls:5:303:1',
 'https://www.argos.co.uk/product/9425916?clickSR=slp:term:vehicle%20toys%20for%20girls:6:303:1',
 'https://www.argos.co.uk/product/1403383?clickSR=slp:term:vehicle%20toys%20for%20girls:7:303:1',
 'https://www.argos.co.uk/product/9419128?clickSR=slp:term:vehicle%20toys%20for%20girls:8:303:1',
 'https://www.argos.co.uk/product/9697469?clickSR=slp:term:vehicle%20toys%20for%20girls:9:303:1',
 'https://www.argos.co.uk/product/9576418?clickSR=slp:term:vehicle%20toys%20for%20girls:10:303:1',
 'https://www.argos

Initialize a database to view only the first 20 items and their respective data points.

In [29]:
first20 = pd.DataFrame()
first20

In [30]:
first20 = first20.append(qr, ignore_index = True)

In [31]:
first20

Unnamed: 0,gender,query,result
0,boys,vehicle toys,"([product-title-1403565, product-title-1403668..."
1,girls,vehicle toys,"([product-title-1403565, product-title-1403668..."
2,neutral,vehicle toys,"([product-title-1403565, product-title-1403668..."
3,boys,sport,"([product-title-7655915, product-title-9506512..."
4,girls,sport,"([product-title-8869607, product-title-1353213..."
...,...,...,...
493,girls,board games,"([product-title-3904567, product-title-3907306..."
494,neutral,board games,"([product-title-2687421, product-title-9452123..."
495,boys,rock painting,"([product-title-9493113, product-title-tuc1414..."
496,girls,rock painting,"([product-title-9493113, product-title-9493113..."


In [32]:
len(first20.loc[0]['result'][1])

60

In [33]:
first20.to_csv('argos_search_results.csv', index = False)

## Data Cleaning

In [None]:
import pandas as pd
data = pd.read_csv('argos_search_results.csv', delimiter=',')
data

In [22]:
data['result'][4]

"(['product-title-9536836', 'product-title-9451423', 'product-title-8673439', 'product-title-1252053', 'product-title-1177994', 'product-title-1175587', 'product-title-8845401', 'product-title-5034095', 'product-title-8676742', 'product-title-4982416', 'product-title-3187573', 'product-title-9487734', 'product-title-8661702', 'product-title-9452006', 'product-title-9546888', 'product-title-7841925', 'product-title-9492671', 'product-title-8438038', 'product-title-1156142', 'product-title-9430523', 'product-title-2006923', 'product-title-9517886', 'product-title-1174997', 'product-title-8625250', 'product-title-8212773', 'product-title-7622175', 'product-title-9517446', 'product-title-5577198', 'product-title-9424137', 'product-title-9457032', 'product-title-9536836', 'product-title-9451423', 'product-title-8673439', 'product-title-1252053', 'product-title-1177994', 'product-title-1175587', 'product-title-8845401', 'product-title-5034095', 'product-title-8676742', 'product-title-4982416