Function to fetch a number of images (1 to N) for a specific product, using Google Images.

In [15]:
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep

# Browser-like request headers, meant to make scraping requests look less suspicious.
# NOTE(review): nothing in the visible notebook code passes `headers` to a request;
# confirm whether that is intentional before wiring it in.
user_agent = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) '
    'AppleWebKit/602.2.14 (KHTML, like Gecko) '
    'Version/10.0.1 Safari/602.2.14'
)
headers = {
    'User-Agent': user_agent,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}

# Fetch a number {limit} of images from Google Images querying for {product}
def fetch_product(product, limit):
  """Return up to `limit` image URLs scraped from a Google Images search.

  Args:
    product: Text to search for.
    limit: Maximum number of image URLs to return.

  Returns:
    A list of `src` values taken from the matching <img> tags. The list is
    never empty: when nothing usable is found (or `limit` is not positive)
    it is `[""]`, so callers can always read element 0.

  Raises:
    requests.RequestException: if the request fails or times out.
  """
  # Sleep between 1 to 5 seconds to change the request pattern
  sleep(randint(1,5))
  # Let requests build the query string so that characters such as '&', '#'
  # or '+' inside the product text are encoded instead of corrupting the URL.
  # The timeout keeps a stalled connection from hanging the notebook forever.
  # NOTE(review): the default requests headers are kept on purpose; the class
  # selector below was written against the markup returned for them - confirm
  # before sending a browser User-Agent.
  response = requests.get(
      "https://www.google.com/search",
      params={'q': product, 'tbm': 'isch'},
      timeout=30,
  )
  # Parse the HTML content
  soup = BeautifulSoup(response.content, 'html.parser')
  image_tags = soup.find_all('img', attrs={'class': 'DS1iW'}) # Class tag changes frequently
  # Ignore <img> tags without a usable src instead of raising KeyError
  sources = [tag.get('src') for tag in image_tags]
  result = [src for src in sources if src][:max(limit, 0)]
  # Callers index element 0, so never hand back an empty list
  return result if result else [""]

Functions that fill the dictionary used to build the DataFrame

In [16]:
# From a path, return the HTML tag of an image sourcing it
def path_to_image_html(path):
  """Build an `<img>` tag, 100 wide, whose `src` is `path`.

  The path is printed as-is, then HTML-escaped before being placed in the
  attribute, so a scraped URL containing a double quote or '&' cannot break
  out of (or corrupt) the `src` attribute.

  Args:
    path: Image URL or path as a string.

  Returns:
    The `<img ...>` tag as a string.
  """
  import html  # local import keeps this cell self-contained

  print(path)
  return '<img src="' + html.escape(path, quote=True) + '" width="100" >'

# Append one product row, together with its picture, to the column lists
# that will be used for the Dataframe
def fill_data(data, row, picture):
  """Append `picture` and the selected fields of `row` to the lists in `data`.

  Args:
    data: Dict mapping each output column name to a list; mutated in place.
    row: Mapping holding the product fields under the same column names.
    picture: Value appended to the 'Image' column.
  """
  data['Image'].append(picture)
  # Columns copied verbatim from the row, in output order
  copied_columns = (
      'Season', 'Brand', 'SKU', 'Description', 'Style',
      'Colour Code', 'Colour Desc', 'Size', 'Gender',
      'Category', 'Sub-Category', 'DDP-EUR',
      'Alpi Servizio Moda', 'Alpi UK',
  )
  for column in copied_columns:
    data[column].append(row[column])

A CSV log is used to save the images that were scraped in the past.

In [17]:
from google.colab import drive
import csv

# Mount your Google Drive
drive.mount('/content/drive')

# Path to the CSV log file on Google Drive
log_file_path = '/content/drive/My Drive/Data/ATS_log.csv'

# Maps each SKU to the 'Image' value stored for it in the log
log_map = {}

# Fill the map containing SKU - Image path reading each line of the log.
# A missing log file (e.g. the very first run) simply leaves the map empty
# instead of aborting the notebook.
try:
    # newline='' is what the csv module expects, so quoted fields that
    # contain line breaks are parsed correctly
    with open(log_file_path, 'r', newline='', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            # Keep the first image seen for a SKU; later duplicates are ignored
            if row['SKU'] not in log_map:
                log_map[row['SKU']] = row['Image']
except FileNotFoundError:
    print('No log file found at ' + log_file_path + ', starting with an empty log')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Filtering products

In [36]:
from typing_extensions import TypeVarTuple
# Allowed values per product column; an empty list means "do not filter on
# this column".
filters = {
    'Season': [],
    'Brand': ['13 09 SR'],
    'Colour Code': [],
    'Gender': ['WOMEN','UNISEX'],
    'Category': [],
    'Sub-Category': [],
}

# Warehouse columns to consider (True = consider it). Keys must match the CSV
# column names exactly: the rows printed by this notebook show the columns as
# 'CN HK' and 'DHL KR' (with a space), so the previous 'CN_HK' / 'DHL_KR' keys
# would raise KeyError as soon as they were enabled.
whs_filters = {
    'Alpi Servizio Moda': True,
    'Alpi UK': True,
    'CN HK': False,
    'DHL KR': False,
    '3RD Party Logistic': False
}

def match_filters(row, field_filters=None, warehouse_filters=None):
  """Tell whether a product row passes the field and warehouse filters.

  Args:
    row: Mapping of column name to value for one product.
    field_filters: Dict of column name -> list of accepted values; an empty
      list disables filtering on that column. Defaults to the module-level
      `filters`.
    warehouse_filters: Dict of warehouse column name -> bool selecting the
      warehouses to consider. Defaults to the module-level `whs_filters`.

  Returns:
    True when every non-empty field filter contains the row's value and at
    least one selected warehouse column holds a value; False otherwise.

  Raises:
    KeyError: if a filtered column is missing from `row`.
  """
  if field_filters is None:
    field_filters = filters
  if warehouse_filters is None:
    warehouse_filters = whs_filters

  for key, allowed_values in field_filters.items():
    if len(allowed_values) > 0 and row[key] not in allowed_values:
      print('Not matching filters')
      return False

  # A selected warehouse counts only when its column holds a value; None
  # (a short CSV line padded by DictReader) is treated like an empty cell.
  whs_match = any(
      enabled and row[key] not in ('', None)
      for key, enabled in warehouse_filters.items()
  )
  if not whs_match:
    print('Not available in selected warehouse')
    return False
  return True

Process the ATS file and fetch images for each product

In [39]:
from google.colab import drive
import csv
import pandas as pd
from IPython.core.display import display,HTML

# Mount your Google Drive
drive.mount('/content/drive')

# Path to the CSV file on Google Drive
csv_file_path = '/content/drive/My Drive/Data/ATS_cut.csv'

# Data to be printed: one list per output column, filled row by row
data = {
    'Image': [],
    'Season': [],
    'Brand': [],
    'SKU': [],
    'Description': [],
    'Style': [],
    'Colour Code': [],
    'Colour Desc': [],
    'Size': [],
    'Gender': [],
    'Category': [],
    'Sub-Category': [],
    'DDP-EUR': [],
    'Alpi Servizio Moda': [],
    'Alpi UK': []
}

last_product_fetched = ""
# Result of the most recent fetch, reused for consecutive rows of the same
# product; initialised so it never depends on leftover kernel state
pictures = [""]

# Open the CSV file and read each line
# (newline='' is what the csv module expects for files it parses)
with open(csv_file_path, 'r', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
      print('Processing row: ' + str(row))
      # Only consider products passing the field and warehouse filters
      if not match_filters(row):
        continue
      product = row['Brand'] + ' ' + row['Style'] # Text that will be queried
      # Only fetch new products, skip duplicate queries and data saved on the log
      if row['SKU'] in log_map:
        # The log already stores the value of the 'Image' column
        picture = log_map[row['SKU']]
        print('Reading from log ' + picture)
        fill_data(data, row, picture)
      else:
        if product != last_product_fetched:
          pictures = fetch_product(product, 1)
          print('Fetched ' + pictures[0])
          last_product_fetched = product
        fill_data(data, row, path_to_image_html(pictures[0]))

# Create a DataFrame from the data dictionary
df = pd.DataFrame(data)
pd.set_option('display.max_colwidth', None)

# Save the DataFrame in HTML
df.to_html('ATS.html', escape=False)

# Save the DataFrame in CSV for logging purposes.
# Merge with the existing log instead of overwriting it: a filtered run only
# contains part of the catalogue, and overwriting would throw away the images
# cached for every product outside the current filters.
try:
    previous_log = pd.read_csv(log_file_path, dtype=str, keep_default_na=False)
except (FileNotFoundError, pd.errors.EmptyDataError):
    previous_log = None

log_frames = [frame for frame in (previous_log, df) if frame is not None and not frame.empty]
if log_frames:
    # Existing log entries win on duplicate SKUs, mirroring how the log is read
    log_df = pd.concat(log_frames, ignore_index=True).drop_duplicates(subset='SKU', keep='first')
else:
    log_df = df
log_df.to_csv(log_file_path, index=False)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Processing row: {'Internal ID': '1', 'Season': 'CARRYOVER', 'Brand': '13 09 SR', 'SKU': 'BINBLKSS22-BLACK-37', 'Description': 'Bingo pool slide', 'Style': 'BINBLKSS22', 'Colour Code': 'BLACK', 'Colour Desc': 'BLACK', 'Size': '37', 'Gender': 'WOMEN', 'Category': 'Shoes', 'Sub-Category': 'Flip Flops & Slides', 'DDP-EUR': '0', 'Alpi Servizio Moda': '', 'CN HK': '', 'DHL KR': '', '3RD Party Logistic': '', 'Alpi UK': '2'}
Fetched https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTETScbsmF0DIX1IAJ6PhJrHHzCL15B0avesYqlktLDjjaXNXUGTnMvyY-DWtk&s
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTETScbsmF0DIX1IAJ6PhJrHHzCL15B0avesYqlktLDjjaXNXUGTnMvyY-DWtk&s
Processing row: {'Internal ID': '2', 'Season': 'CARRYOVER', 'Brand': '13 09 SR', 'SKU': 'BINBLKSS22-BLACK-38', 'Description': 'Bingo pool slide', 'Style': 'BINBLKSS22', 'Colour Code': 'BLACK', 'Colour Desc'

KeyboardInterrupt: 