# 01. Setup and Load Data

### 1.1 Dependencies Install and Setup

In [25]:
#!pip install requests
#!pip install beautifulsoup4

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import urllib.parse
import os
from urllib.parse import urlparse, urljoin
import re


### 1.2 Data Load

##### Build the xlsx file path and create the data frame

In [3]:
# Location of the Excel workbook holding the product references,
# relative to the notebook's working directory.
product_ref_path = os.path.join("product_ref_call", "product_ref.xlsx")

# Fail fast with a real exception instead of print() + exit(): inside a
# notebook, exit() targets the kernel rather than just stopping this cell,
# and a printed message hides the traceback needed to diagnose the problem.
if not os.path.isfile(product_ref_path):
    raise FileNotFoundError(
        f"File Error: The file does not exist or is not a regular file: {product_ref_path}"
    )

try:
    product_ref_df = pd.read_excel(product_ref_path)
except Exception as e:
    # Re-raise with context so the original cause stays in the traceback.
    raise RuntimeError(f"Error reading the Excel file: {e}") from e

In [5]:
# Preview the first five rows to confirm the workbook loaded as expected.
product_ref_df.head(n=5)

Unnamed: 0,season,week,prod_ref,Status,Última Etapa,Category,Family,Theme,Store Date Planned,Store Date Final,...,Data Aprovação modelo s/ reconhecimento,Data Repetição fotografia,TXTS Etapa 9,Data Repetição modelo,TXTS Etapa 10,Data Aprovação repetição,TXTS Etapa 11,Date Published,# Fotos,Number of Units
0,23 Fall/Winter,2023_45,214658_GY,Online: After Store Date,Published,Winter Textiles,Blanket Scarves,STARLIGHT,2023-11-01,2023-11-01,...,NaT,NaT,0.0,NaT,0.0,NaT,0.0,2023-11-15,6.0,Singular
1,23 Fall/Winter,2023_45,214658_LM,Online: After Store Date,Published,Winter Textiles,Blanket Scarves,STARLIGHT,2023-11-01,2023-11-01,...,NaT,NaT,0.0,NaT,0.0,NaT,0.0,2023-11-15,2.0,Singular
2,23 Fall/Winter,2023_45,214658_NV,Online: After Store Date,Published,Winter Textiles,Blanket Scarves,STARLIGHT,2023-11-01,2023-11-01,...,NaT,NaT,0.0,NaT,0.0,NaT,0.0,2023-11-15,2.0,Singular
3,23 Fall/Winter,2023_45,214687_DM,Online: OK,Published,Winter Textiles,Blanket Scarves,STARLIGHT,2023-11-01,2023-11-01,...,NaT,NaT,0.0,NaT,0.0,NaT,0.0,2023-10-28,2.0,Singular
4,23 Fall/Winter,2023_45,214688_GN,Online: After Store Date,Published,Winter Textiles,Blanket Scarves,KUSAMA,2023-11-01,2023-11-01,...,NaT,NaT,0.0,NaT,0.0,NaT,0.0,2023-11-15,5.0,Singular


# 02. Setup Web Scraping Tool

### 2.1 Target data and directory creation

In [8]:
# Search-page URL pattern; {query} receives the percent-encoded product reference.
SEARCH_URL_TEMPLATE = "https://www.parfois.com/pt/pt/search/?q={query}&lang=pt_PT"

# One search URL per product reference found in the "prod_ref" column.
# - Rows without a reference are skipped (they would otherwise yield "q=nan").
# - The reference is percent-encoded so characters such as spaces, '&' or '#'
#   cannot break the query string; plain alphanumeric/underscore references
#   are left unchanged by the encoding.
page_url_01_list = [
    SEARCH_URL_TEMPLATE.format(query=urllib.parse.quote(str(product_ref), safe=""))
    for product_ref in product_ref_df["prod_ref"].dropna()
]


In [9]:
# Folder that receives the downloaded product images; created on first run
# and left untouched when it already exists.
save_dir = "downloaded_images"
os.makedirs(name=save_dir, exist_ok=True)

### 2.2 Web Scraping Run

In [None]:
# Seconds to wait for the server on each request. Without an explicit timeout
# a request to an unresponsive host can block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# (tag name, exact class attribute) pairs describing the nesting that leads
# from the page root down to the thumbnail <li> elements. Each level is
# searched inside the matches of the previous level.
THUMBNAIL_NESTING = [
    ("div", "full-width clearfix"),
    ("div", "pdp-main"),
    ("div", "product-col-1"),
    ("div", "product-thumbnails"),
    ("div", "vertical-carousel"),
    ("li", "thumb"),
]


def _find_thumbnail_items(soup):
    """Return the <li class="thumb"> tags reached by walking THUMBNAIL_NESTING.

    Starting from ``soup``, every level's ``find_all`` is applied to each match
    of the level above, so the result is the same set of tags the equivalent
    chain of nested loops would visit (in the same order).
    """
    nodes = [soup]
    for tag_name, css_class in THUMBNAIL_NESTING:
        nodes = [
            child
            for node in nodes
            for child in node.find_all(tag_name, class_=css_class)
        ]
    return nodes


def _download_image(session, img_url, target_dir):
    """Download ``img_url`` into ``target_dir``.

    Returns the path of the written file, or ``None`` when the URL has no file
    name component or the request fails (the failure is reported with print).
    """
    img_name = os.path.basename(urlparse(img_url).path)
    if not img_name:
        return None
    try:
        response = session.get(img_url, timeout=REQUEST_TIMEOUT)
        # Do not store an HTML error page under an image file name.
        response.raise_for_status()
    except requests.RequestException as error:
        print("Image download failed:", img_url, error)
        return None
    img_path = os.path.join(target_dir, img_name)
    with open(img_path, "wb") as img_file:
        img_file.write(response.content)
    return img_path


# A single session reuses the underlying connection across the many requests.
with requests.Session() as session:
    for page_url_01 in page_url_01_list:
        try:
            response_page_url_01 = session.get(page_url_01, timeout=REQUEST_TIMEOUT)
            response_page_url_01.raise_for_status()
        except requests.RequestException as error:
            # One unreachable page must not abort the whole run.
            print("Page request failed:", page_url_01, error)
            continue

        # Name the parser explicitly so the parse result does not depend on
        # which optional parser libraries happen to be installed.
        soup_page_numbers = BeautifulSoup(response_page_url_01.content, "html.parser")

        for li_tag in _find_thumbnail_items(soup_page_numbers):
            img_tag = li_tag.find("img", class_="productthumbnail seleccionada")
            if img_tag is None:
                continue
            # .get() avoids a KeyError when the attribute is missing.
            img_url = img_tag.get("data-hi-res")
            if not img_url:
                continue
            # Resolve relative or protocol-relative URLs against the final page URL.
            absolute_img_url = urljoin(response_page_url_01.url, img_url)
            _download_image(session, absolute_img_url, save_dir)



### 2.3 Creation of a DataFrame to check for missing references after the web scraping run

##### Some references may no longer be on the website, or may need an extra routine in the web scraping tool

In [11]:
# Folder populated by the web scraping run.
downloaded_images_path = "downloaded_images"

# File names currently present in the download folder.
existing_images = set(os.listdir(downloaded_images_path))

# One row per downloaded file. The column is named at construction time so
# that "image_file_name" exists even when the folder is empty (building the
# frame from an empty collection and renaming column 0 afterwards leaves no
# such column, and later lookups fail with KeyError). Sorting makes the row
# order reproducible, since set iteration order is not.
image_downloaded_ref = pd.DataFrame({"image_file_name": sorted(existing_images)})


In [13]:
def standardise_file_names(filename):
    """Reduce a file name to its leading ``<part>_<part>`` prefix.

    The prefix is the run of non-underscore characters at the start of
    ``filename``, one underscore, and the following run of non-underscore
    characters (so everything from the second underscore on is dropped).

    Parameters:
        filename: the file name string to shorten.

    Returns:
        The matched prefix, or ``filename`` unchanged when it does not start
        with two non-empty underscore-separated parts. Note that a name with
        only one underscore is returned whole, extension included.
    """
    prefix = re.match(r'^([^_]+_[^_]+)', filename)
    return filename if prefix is None else prefix.group(1)

In [14]:
# Shorten every file name to its reference prefix, then keep a single row per
# distinct prefix with a fresh 0..n-1 index.
image_downloaded_ref = (
    image_downloaded_ref
    .assign(
        image_file_name=lambda frame: frame["image_file_name"].apply(standardise_file_names)
    )
    .drop_duplicates(subset="image_file_name")
    .reset_index(drop=True)
)

In [20]:
# Flag each product reference: 1 when a downloaded image carries the same
# reference prefix, 0 otherwise.
product_ref_df["image_download_check"] = (
    product_ref_df["prod_ref"]
    .isin(image_downloaded_ref["image_file_name"])
    .astype(int)
)


##### After manually checking the rows where the download check is 0, we found that those products are no longer on the website

##### Next step is to run the model_run_for_image_collector.ipynb to get the model predictions