In [None]:
import selenium
from selenium.webdriver import Chrome
from webdriver_manager.chrome import ChromeDriverManager #installs Chrome webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time

In [None]:
class Scraper:
    """Drive a Chrome browser through the Square Enix store pages.

    The instance owns a Selenium Chrome driver (``self.driver``). Each method
    performs one browsing step and reports its progress with ``print``.
    Call :meth:`quit` when finished so the browser process is shut down.
    """

    #load webpage in initialiser
    def __init__(self, url: str = "https://store.eu.square-enix-games.com/en_GB/"): #default url
        """Start Chrome and load ``url``.

        A failure to load the page is reported with a message instead of being
        raised, so the instance is still created.
        """
        #webdriver_manager installs a chromedriver and returns its path
        #NOTE(review): recent Selenium 4 releases expect the driver path to be wrapped in a
        #Service object instead of being passed positionally - confirm against the installed version
        self.driver = Chrome(ChromeDriverManager().install())
        try:
            self.driver.get(url)
            print("Webpage loaded successfully")
        except Exception: #was a bare except, which also swallowed KeyboardInterrupt/SystemExit
            print("Webpage not loaded - please check")

    #click accept cookies button on webpage
    def accept_cookies(self, xpath: str = '//*[@id="onetrust-accept-btn-handler"]'):
        """Click the cookie-consent button located by ``xpath``.

        Waits up to 10 seconds for the button to be present. If it never
        appears, a message is printed and nothing is clicked.
        """
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
            time.sleep(5) #fixed pause between the button being present and clicking it
            self.driver.find_element(By.XPATH, xpath).click()
            print("'Accept Cookies' button clicked")
        except TimeoutException: #if accept button is not found after 10 seconds by driver
            print("No cookies found")

    #access and type in search bar
    def search_bar(self, text, xpath: str = '//*[@id="search-button"]',
                    xpath1: str = '//*[@id="search-form-wrapper"]/form/div/input',
                    xpath2: str = '//*[@id="search-form-wrapper"]/form/div/span/button'):
        """Open the search bar, type ``text`` and submit the search.

        Args:
            text: keywords to type into the search input.
            xpath: locator of the search icon that opens the search bar.
            xpath1: locator of the search text input.
            xpath2: locator of the submit button.

        Each of the four steps is attempted independently; a failed step prints
        a message and the remaining steps are still tried. After the call
        ``self.search`` holds the search input element, or ``None`` if the
        input could not be located.
        """
        self.search = None

        #click on search bar icon
        try:
            time.sleep(1)
            WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.XPATH, xpath)))
            self.driver.find_element(By.XPATH, xpath).click()
            print("Search bar opened")
        except TimeoutException:
            print("Search bar not found")

        #open search bar
        try:
            time.sleep(1)
            WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.XPATH, xpath1)))
            time.sleep(2)
            self.driver.find_element(By.XPATH, xpath1).click()
        except TimeoutException:
            print("Search bar not found - input")

        #input keywords to search
        try:
            self.search = self.driver.find_element(By.XPATH, xpath1)
            self.search.send_keys(text)
            print("Search keywords entered")
            time.sleep(2)
        except Exception: #best-effort step: report the failure and carry on
            print("Cannot input keywords")

        #submit input
        try:
            #click() returns None, so its result must not be stored in self.search
            self.driver.find_element(By.XPATH, xpath2).click()
            print("Submit search button clicked - redirected to results")
            time.sleep(2)
        except Exception: #best-effort step: report the failure and carry on
            print("Cannot submit search")

    def navigate(self, xpath: str = '//*[@id="merchandise"]'): #navigate tabs - change for games, merchandise or preorders
        """Click the tab located by ``xpath`` and keep the element in ``self.tab_select``.

        The lookup is not guarded, so any exception raised by
        ``find_element`` propagates to the caller.
        """
        self.tab_select = self.driver.find_element(By.XPATH,xpath)
        time.sleep(2)
        self.tab_select.click()
        time.sleep(2)

    def find_container(self, xpath: str = '//div[@class="catalogue row"]'):
        """Return the first element on the current page that matches ``xpath``."""
        return self.driver.find_element(By.XPATH, xpath)

    def quit(self):
        """Shut down the browser session owned by this scraper."""
        self.driver.quit()


In [None]:
if __name__ == "__main__": #will only run methods below if script is run directly
    #the steps run in this fixed order against the single browser session created here
    scraper = Scraper() #starts Chrome and loads the default store url
    scraper.accept_cookies() #click the cookie button if it shows up within the wait
    scraper.navigate() #click the default tab (the merchandise xpath)
    scraper.search_bar("final fantasy") #add search keyword here
    

In [None]:
import selenium
from selenium.webdriver import Chrome
from webdriver_manager.chrome import ChromeDriverManager #installs Chrome webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

container = scraper.find_container()
#find every <a> element inside the container that corresponds with the XPath
#note: './/a' matches anchors at any depth below the container, not only its direct children
list_products = container.find_elements(By.XPATH, './/a')


In [None]:
#collect the href attribute of every product anchor, in page order
link_list = [anchor.get_attribute("href") for anchor in list_products]

In [None]:
from selenium.common.exceptions import NoSuchElementException
import uuid #universally unique id

#add link and product info to dictionary
#column-oriented stores filled by retrieve_data: every key maps to its own list, one entry per scraped product
product_dict = {
    column: []
    for column in ("Link", "Product Name", "Price", "Description", "ID", "UUID")
}
image_dict = {
    column: []
    for column in ("Link", "Product UUID", "Image UUID")
}

def _scrape_text(xpath, label):
    """Return the text of the first element matching ``xpath`` on the current page, or "N/A" if absent."""
    try:
        text = scraper.driver.find_element(By.XPATH, xpath).text
    except NoSuchElementException: #not all links accessed will have the same attributes
        return "N/A"
    print(f"{label} obtained")
    return text


def _product_id_from_link(link):
    """Return the sixth '/'-separated piece of ``link`` (split from the right), or "N/A" if the link is too short."""
    parts = link.rsplit("/", 6) #split link at most 6 times on '/', starting from the right
    if len(parts) <= 5: #the original indexed parts[5] unguarded and crashed on short links
        return "N/A"
    print("Product ID obtained")
    return parts[5]


def _save_product_image(product_id):
    """Screenshot the product image into '<product_id>.png'; return the file name, or "N/A" on failure."""
    if product_id == "N/A": #no usable ID to name the file after
        print("Product image not downloaded")
        return "N/A"
    image_link = f"{product_id}.png"
    try:
        img = scraper.driver.find_element(By.XPATH, '//*[@id="main-content"]/article/div[1]/div/div[1]/div[3]/a/figure/img')
        png_bytes = img.screenshot_as_png
        #the file is opened only once the screenshot exists, so a failed lookup leaves no empty .png behind
        with open(image_link, "wb") as file: #wb = write and binary mode
            file.write(png_bytes)
    except Exception: #best-effort: any failure is recorded as "N/A" (was a bare except)
        print("Product image not downloaded")
        return "N/A"
    print("Product image downloaded")
    return image_link


def retrieve_data(links=None, pause: float = 3, page_load_wait: float = 2):
    """Visit product pages and append one row per page to ``product_dict`` and ``image_dict``.

    Args:
        links: iterable of product URLs to visit. Defaults to ``link_list[3:8]``,
            i.e. links 4 to 8 of the collected list.
        pause: seconds to sleep between scraping steps.
        page_load_wait: seconds to sleep after requesting each page.

    For every link, each list in ``product_dict`` and ``image_dict`` receives
    exactly one entry, so the columns always stay the same length. A value that
    cannot be obtained is stored as "N/A". The product image is saved in the
    current working directory as '<product ID>.png'.
    """
    if links is None:
        #original author's note: FFVIII downloads not included (DOB required)
        links = link_list[3:8]

    for link in links:
        scraper.driver.get(link)
        time.sleep(page_load_wait)
        product_dict["Link"].append(link)

        #get product name
        #.//h1[@class="product-title"] - works some of the time
        product_dict["Product Name"].append(
            _scrape_text('//*[@id="responsive-wrapper-title"]/header/h1', "Product name"))

        time.sleep(pause)
        #get price - shortened form of //*[@id="main-content"]/article/div[1]/div/div[2]/div[2]/div[1]/div[1]/div/span
        product_dict["Price"].append(_scrape_text('.//span[@class = "prices"]', "Price"))

        time.sleep(pause)
        #get product description - shortened form of //*[@id="desc-collapse"]/div/div
        product_dict["Description"].append(
            _scrape_text('.//div[@class="tab-pane-content"]', "Product description"))

        #get product SKU/ID from URL
        #to get SKU - product details tab needs to be clicked and then an if statement to get the SKU otherwise other info will be obtained
        product_id = _product_id_from_link(link)
        product_dict["ID"].append(product_id)

        #generate V4 UUID for product and image
        product_uuid = uuid.uuid4()
        image_uuid = uuid.uuid4()
        product_dict["UUID"].append(product_uuid)
        image_dict["Product UUID"].append(product_uuid)
        image_dict["Image UUID"].append(image_uuid)
        print("UUID generated")

        #the original paused separately before the ID, UUID and image steps; only the image
        #step touches the page, so the three pauses are taken together here (same total wait)
        time.sleep(3 * pause)
        #download and save product image using product ID as file name
        image_link = _save_product_image(product_id)
        image_dict["Link"].append(image_link)
        if image_link != "N/A":
            print("Image link added")
        time.sleep(pause)
retrieve_data() #run the scrape with its defaults - appends rows to product_dict and image_dict

In [None]:
product_dict #display all dictionary entries

#alternative view: print only the product names and prices
#print(product_dict["Product Name"], product_dict["Price"])

In [None]:
image_dict #display all image dictionary entries (image link, product UUID, image UUID)

In [None]:
import pandas as pd

pd.DataFrame(product_dict) #displays product dictionary as a pandas dataframe (one column per key)
#pd.DataFrame(image_dict) #uncomment to display the image dictionary as a pandas dataframe instead

In [None]:
import os, os.path
import json
from json import JSONEncoder
from uuid import UUID

#destination of the JSON export: <new_folder>/<file>.json
new_folder = "../webscraping/raw_data"
file = "data"
create_file = os.path.join(new_folder, f"{file}.json") #add file type here

#json has no built-in UUID support, so JSONEncoder.default is replaced with a UUID-aware wrapper
JSONEncoder_olddefault = JSONEncoder.default #keep the stock fallback so other types behave as before


def JSONEncoder_newdefault(self, o):
    """Serialise UUID objects as their string form; defer every other type to the previous default."""
    if not isinstance(o, UUID):
        return JSONEncoder_olddefault(self, o)
    return str(o)


JSONEncoder.default = JSONEncoder_newdefault #applies to every JSONEncoder from here on


#create the raw_data folder if it doesn't exist yet (missing parent folders included), then
#write the product dictionary into it - one code path whether or not the folder was already there
os.makedirs(new_folder, exist_ok=True)
with open(create_file, "w") as fp: #specify path here
    json.dump(product_dict, fp, indent=4)

In [None]:
#move files into a new directory/folder
import os, shutil, os.path

new_folder1 = "images" #name of the images folder
sourcepath = "../webscraping"
source = os.listdir(sourcepath)
destinationpath = "../webscraping/images"

#create the folder the files are actually moved into; the original checked and created
#"images" relative to the current directory, which is only the same place when the
#notebook runs from inside the webscraping folder
os.makedirs(destinationpath, exist_ok=True)

for files in source:
    if not files.endswith('.png'):
        continue
    try:
        shutil.move(os.path.join(sourcepath, files), os.path.join(destinationpath, files))
    except FileExistsError: #handled per file so one clash no longer stops the remaining moves
        print("Already exists")