In [1]:
import requests
from bs4 import BeautifulSoup
import json

# Collection pages to scrape
urls = [
    "https://outfitters.com.pk/collections/men-t-shirts",
    "https://outfitters.com.pk/collections/men-polo-shirts",
    "https://outfitters.com.pk/collections/men-shirts",
    "https://outfitters.com.pk/collections/men-denim-collection",
    "https://outfitters.com.pk/collections/men-shorts-1",
    "https://outfitters.com.pk/collections/men-trousers",
    "https://outfitters.com.pk/collections/men-activewear",
    "https://outfitters.com.pk/collections/men-footwear"
]

# Seconds to wait on any single HTTP request before giving up
REQUEST_TIMEOUT = 30

# List to store product data (one dict per product card found)
products_data = []

for url in urls:
    # Without a timeout a stalled connection would hang the cell forever;
    # raise_for_status keeps an HTTP error page from being parsed as a listing.
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    products = soup.find_all('div', class_='card__content')

    for product in products:
        # Look up each element once; any missing piece is recorded as "N/A"
        name_elem = product.find('h3', class_='card__heading')
        name = name_elem.text.strip() if name_elem else "N/A"

        price_elem = product.find('span', class_='money')
        price = price_elem.text.strip() if price_elem else "N/A"

        # .get() instead of ['href'] so an anchor without href does not raise KeyError
        product_url_elem = product.find('a', class_='product-link-main')
        href = product_url_elem.get('href') if product_url_elem else None
        product_url = "https://outfitters.com.pk" + href if href else "N/A"

        # The colour is read from a 'color' attribute on the image wrapper div
        color_elem = product.find('div', class_='item-image-wrapper')
        color = color_elem.get('color', 'N/A') if color_elem else "N/A"

        # Create a dictionary to store product data
        product_data = {
            "name": name,
            "price": price,
            "color": color,
            "url": product_url
        }

        products_data.append(product_data)

# Write product data to JSON file
with open("outfitters.json", "w") as json_file:
    json.dump(products_data, json_file, indent=4)

print("Data saved to outfitters.json")


Data saved to outfitters.json


In [2]:
import requests
from bs4 import BeautifulSoup
import json
import csv
import os
from urllib.parse import urlparse, parse_qs

# URLs to scrape
urls = [
    "https://outfitters.com.pk/collections/men-t-shirts",
    "https://outfitters.com.pk/collections/men-polo-shirts",
    "https://outfitters.com.pk/collections/men-shirts",
    "https://outfitters.com.pk/collections/men-denim-collection",
    "https://outfitters.com.pk/collections/men-shorts-1",
    "https://outfitters.com.pk/collections/men-trousers",
    "https://outfitters.com.pk/collections/men-activewear",
    "https://outfitters.com.pk/collections/men-footwear"
]

# Directory to save images
image_directory = 'downloaded_images'
os.makedirs(image_directory, exist_ok=True)

# Seconds to wait on any single HTTP request before giving up
REQUEST_TIMEOUT = 30

# Set to store unique image IDs
image_ids = set()

# CSV file setup. The with-block guarantees the file is flushed and closed even
# when the loop aborts (duplicate ID, HTTP error), so rows already written are kept.
with open('outfitters_products.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Product Name', 'Price', 'Color', 'Product Page URL', 'Downloaded Image ID', 'Downloaded Image URL'])

    for url in urls:
        # Time out instead of hanging, and fail loudly on an HTTP error page
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        products = soup.find_all('div', class_='card__content')

        for product in products:
            # Extract product details; each lookup runs once and falls back to "N/A"
            name_elem = product.find('h3', class_='card__heading')
            name = name_elem.text.strip() if name_elem else "N/A"
            price_elem = product.find('span', class_='money')
            price = price_elem.text.strip() if price_elem else "N/A"
            color_elem = product.find('div', class_='item-image-wrapper')
            color = color_elem.get('color', 'N/A') if color_elem else "N/A"
            product_url_elem = product.find('a', class_='product-link-main')
            href = product_url_elem.get('href') if product_url_elem else None
            product_url = "https://outfitters.com.pk" + href if href else "N/A"

            # Image URL extraction; a product without a usable <img src> is skipped
            img_tag = product.find('img', class_='motion-reduce image-second')
            img_src = img_tag.get('src') if img_tag else None
            if not img_src:
                continue

            # src is prefixed with "https:" (assumes a protocol-relative value — TODO confirm)
            img_url = f"https:{img_src}"
            parsed_url = urlparse(img_url)
            query_params = parse_qs(parsed_url.query)
            # The image ID is the 'v' query parameter ('' when it is absent)
            image_id = query_params.get('v', [''])[0]

            # Check for uniqueness of the image ID. This is a deliberate hard stop:
            # the recorded run of this cell aborted here on a repeated 'v' value.
            if image_id in image_ids:
                raise ValueError(f"Duplicate image ID {image_id} found. Process aborted.")
            image_ids.add(image_id)

            # Request the image without its query string
            clean_img_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
            # Download the image
            image_response = requests.get(clean_img_url, timeout=REQUEST_TIMEOUT)
            image_response.raise_for_status()
            image_path = os.path.join(image_directory, f"{image_id}.jpg")
            with open(image_path, 'wb') as img_file:
                img_file.write(image_response.content)

            # Write product details to CSV
            csv_writer.writerow([name, price, color, product_url, image_id, clean_img_url])
            print(f"Downloaded {clean_img_url} as {image_path}")

print("Scraping, downloading and CSV compilation completed.")


Downloaded https://outfitters.com.pk/cdn/shop/files/F0978106904_1.jpg as downloaded_images/1724320263.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0941106901_2.jpg as downloaded_images/1719997503.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0941106999_1.jpg as downloaded_images/1724311945.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0957106003_2_d3830873-6c36-4100-969b-c632c00c9121.jpg as downloaded_images/1724312127.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0957106901_1.jpg as downloaded_images/1724312128.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F1035106619_1.jpg as downloaded_images/1727159544.jpg


ValueError: Duplicate image ID 1727159544 found. Process aborted.

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import os
from urllib.parse import urlparse

# URLs to scrape
urls = [
    "https://outfitters.com.pk/collections/men-t-shirts",
    "https://outfitters.com.pk/collections/men-polo-shirts",
    "https://outfitters.com.pk/collections/men-shirts",
    "https://outfitters.com.pk/collections/men-denim-collection",
    "https://outfitters.com.pk/collections/men-shorts-1",
    "https://outfitters.com.pk/collections/men-trousers",
    "https://outfitters.com.pk/collections/men-activewear",
    "https://outfitters.com.pk/collections/men-footwear"
]

# Directory to save images
image_directory = 'downloaded_images'
os.makedirs(image_directory, exist_ok=True)

# Seconds to wait on any single HTTP request before giving up
REQUEST_TIMEOUT = 30

# Set to store unique image IDs
image_ids = set()

# CSV file setup. The with-block guarantees the file is flushed and closed even
# when the loop aborts (duplicate ID, HTTP error), so rows already written are kept.
with open('outfitters_products.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Product Name', 'Price', 'Color', 'Product Page URL', 'Downloaded Image ID', 'Downloaded Image URL'])

    for url in urls:
        # Time out instead of hanging, and fail loudly on an HTTP error page
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        products = soup.find_all('div', class_='card__content')

        for product in products:
            # Extract product details; each lookup runs once and falls back to "N/A"
            name_elem = product.find('h3', class_='card__heading')
            name = name_elem.text.strip() if name_elem else "N/A"
            price_elem = product.find('span', class_='money')
            price = price_elem.text.strip() if price_elem else "N/A"
            color_elem = product.find('div', class_='item-image-wrapper')
            color = color_elem.get('color', 'N/A') if color_elem else "N/A"
            product_url_elem = product.find('a', class_='product-link-main')
            href = product_url_elem.get('href') if product_url_elem else None
            product_url = "https://outfitters.com.pk" + href if href else "N/A"

            # Image URL extraction; a product without a usable <img src> is skipped
            img_tag = product.find('img', class_='motion-reduce image-second')
            img_src = img_tag.get('src') if img_tag else None
            if not img_src:
                continue

            # src is prefixed with "https:" (assumes a protocol-relative value — TODO confirm)
            img_url = f"https:{img_src}"
            parsed_url = urlparse(img_url)
            # Extract image ID from the file name in the path: the text before the
            # first underscore, e.g. "F0978106904_1.jpg" -> "F0978106904"
            image_id = parsed_url.path.split('/')[-1].split('_')[0]

            # Check for uniqueness of the image ID. This is a deliberate hard stop:
            # the recorded run of this cell aborted here on a repeated prefix.
            if image_id in image_ids:
                raise ValueError(f"Duplicate image ID {image_id} found. Process aborted.")
            image_ids.add(image_id)

            # Request the image without its query string
            clean_img_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
            # Download the image
            image_response = requests.get(clean_img_url, timeout=REQUEST_TIMEOUT)
            image_response.raise_for_status()
            image_path = os.path.join(image_directory, f"{image_id}.jpg")
            with open(image_path, 'wb') as img_file:
                img_file.write(image_response.content)

            # Write product details to CSV
            csv_writer.writerow([name, price, color, product_url, image_id, clean_img_url])
            print(f"Downloaded {clean_img_url} as {image_path}")

print("Scraping, downloading and CSV compilation completed.")


Downloaded https://outfitters.com.pk/cdn/shop/files/F0978106904_1.jpg as downloaded_images/F0978106904.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0941106901_2.jpg as downloaded_images/F0941106901.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0941106999_1.jpg as downloaded_images/F0941106999.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0957106003_2_d3830873-6c36-4100-969b-c632c00c9121.jpg as downloaded_images/F0957106003.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0957106901_1.jpg as downloaded_images/F0957106901.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F1035106619_1.jpg as downloaded_images/F1035106619.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F1035106113_11.jpg as downloaded_images/F1035106113.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F1121106128_3.jpg as downloaded_images/F1121106128.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F1121106904_1_d2201f27-4f53-40a9-a34b-591cc6de0383.jpg

ValueError: Duplicate image ID F0113125628 found. Process aborted.

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import os
from urllib.parse import urlparse

# URLs to scrape
urls = [
    "https://outfitters.com.pk/collections/men-t-shirts",
    "https://outfitters.com.pk/collections/men-polo-shirts",
    "https://outfitters.com.pk/collections/men-shirts",
    "https://outfitters.com.pk/collections/men-denim-collection",
    "https://outfitters.com.pk/collections/men-shorts-1",
    "https://outfitters.com.pk/collections/men-trousers",
    "https://outfitters.com.pk/collections/men-activewear",
    "https://outfitters.com.pk/collections/men-footwear"
]

# Directory to save images
image_directory = 'downloaded_images'
os.makedirs(image_directory, exist_ok=True)

# Seconds to wait on any single HTTP request before giving up
REQUEST_TIMEOUT = 30

# Maps an image file stem (file name without extension) to the list of IDs
# already saved for it. The key is the whole stem, not a product code, so the
# list only grows when the very same file name is encountered again.
product_images = {}

# CSV file setup. The with-block guarantees the file is flushed and closed even
# when a request fails part-way through, so rows already written are kept.
with open('outfitters_products.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Product Name', 'Price', 'Color', 'Product Page URL', 'Downloaded Image ID', 'Downloaded Image URL'])

    for url in urls:
        # Time out instead of hanging, and fail loudly on an HTTP error page
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        products = soup.find_all('div', class_='card__content')

        for product in products:
            # Extract product details; each lookup runs once and falls back to "N/A"
            name_elem = product.find('h3', class_='card__heading')
            name = name_elem.text.strip() if name_elem else "N/A"
            price_elem = product.find('span', class_='money')
            price = price_elem.text.strip() if price_elem else "N/A"
            color_elem = product.find('div', class_='item-image-wrapper')
            color = color_elem.get('color', 'N/A') if color_elem else "N/A"
            product_url_elem = product.find('a', class_='product-link-main')
            href = product_url_elem.get('href') if product_url_elem else None
            product_url = "https://outfitters.com.pk" + href if href else "N/A"

            # Image URL extraction: every matching <img> in the card is downloaded
            for img_tag in product.find_all('img', class_='motion-reduce image-second'):
                img_src = img_tag.get('src')
                if not img_src:
                    # No src attribute to download from; skip this tag
                    continue

                # src is prefixed with "https:" (assumes a protocol-relative value — TODO confirm)
                img_url = f"https:{img_src}"
                parsed_url = urlparse(img_url)
                # Extract image ID from the filename (stem without extension)
                image_id = os.path.splitext(os.path.basename(parsed_url.path))[0]

                # Suffix with a 1-based counter of how many times this stem has
                # been saved, so a repeated file name never overwrites an earlier one
                saved_ids = product_images.setdefault(image_id, [])
                img_seq = len(saved_ids) + 1
                full_image_id = f"{image_id}_{img_seq}"

                # Request the image without its query string
                clean_img_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
                # Download the image
                image_response = requests.get(clean_img_url, timeout=REQUEST_TIMEOUT)
                image_response.raise_for_status()
                image_path = os.path.join(image_directory, f"{full_image_id}.jpg")
                with open(image_path, 'wb') as img_file:
                    img_file.write(image_response.content)
                # Recorded only after the file is on disk
                saved_ids.append(full_image_id)

                # Write product details to CSV (one row per downloaded image)
                csv_writer.writerow([name, price, color, product_url, full_image_id, clean_img_url])
                print(f"Downloaded {clean_img_url} as {image_path}")

print("Scraping, downloading and CSV compilation completed.")


Downloaded https://outfitters.com.pk/cdn/shop/files/F0978106904_1.jpg as downloaded_images/F0978106904_1_1.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0978106904_2_dd5b6386-0af5-4cb9-8756-8cbdbfa1bcfa.jpg as downloaded_images/F0978106904_2_dd5b6386-0af5-4cb9-8756-8cbdbfa1bcfa_1.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0978106904_3.jpg as downloaded_images/F0978106904_3_1.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0978106904_2.jpg as downloaded_images/F0978106904_2_1.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0978106904_5.jpg as downloaded_images/F0978106904_5_1.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0978106904_4.jpg as downloaded_images/F0978106904_4_1.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0941106901_2.jpg as downloaded_images/F0941106901_2_1.jpg
Downloaded https://outfitters.com.pk/cdn/shop/files/F0941106901_5.jpg as downloaded_images/F0941106901_5_1.jpg
Downloaded https://outfitters.com.pk/c

In [3]:
import pandas as pd
from urllib.parse import urlparse, parse_qs

# Source table: the CSV written by the scraping step
csv_path = 'outfitters_products.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

# Define a function to extract the product ID from the URL
def extract_product_id(url):
    """Return the last non-empty path segment of a product page URL.

    The product ID is assumed to be the final part of the URL path; the query
    string and fragment are never part of it because ``urlparse`` splits them
    off before the path is inspected.

    Args:
        url: Product page URL. Non-string values (for example the NaN that
            pandas produces for an empty cell, or None) are accepted.

    Returns:
        The final path segment as a string, or "N/A" when ``url`` is not a
        string, is blank, is the "N/A" placeholder, cannot be parsed, or has
        no path segment at all.
    """
    # Missing cells arrive as float NaN / None; handle them explicitly rather
    # than relying on an exception raised inside urlparse
    if not isinstance(url, str):
        return "N/A"

    url = url.strip()
    # The "N/A" placeholder must not be parsed as a path: its last
    # "/"-separated piece would be the bogus ID "A"
    if not url or url == "N/A":
        return "N/A"

    try:
        path = urlparse(url).path
    except ValueError as e:
        # urlparse rejects some malformed URLs (e.g. an unbalanced IPv6 bracket)
        print(f"Error extracting product ID: {e}")
        return "N/A"

    # Ignore trailing slashes so ".../products/abc/" still yields "abc"
    product_id = path.rstrip('/').rsplit('/', 1)[-1]
    return product_id or "N/A"

# Derive a 'Product ID' column by running every product page URL through the extractor
df['Product ID'] = df['Product Page URL'].map(extract_product_id)

# Persist the augmented table under a new file name, leaving out the index column
updated_csv_path = 'updated_outfitters_products.csv'
df.to_csv(updated_csv_path, index=False)

print(f"Updated CSV file saved as {updated_csv_path}")


Updated CSV file saved as updated_outfitters_products.csv


In [5]:
import pandas as pd

# Load the existing CSV file.
# This step only moves columns around, so the read must be lossless:
#   * keep_default_na=False stops pandas from turning literal "N/A" cells
#     (e.g. a Product ID of "N/A") into NaN, which to_csv would then write
#     out as empty fields;
#   * dtype=str stops numeric-looking cells from being re-typed and reformatted.
csv_path = 'updated_outfitters_products.csv'
df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)

# Define the desired column order to position 'Product ID' before 'Downloaded Image ID'
column_order = ['Product Name', 'Price', 'Color', 'Product Page URL', 'Product ID', 'Downloaded Image ID', 'Downloaded Image URL']

# Reorder the columns in the DataFrame (raises KeyError if any listed column is absent)
df = df[column_order]

# Save the reordered DataFrame back to CSV, without the index column
updated_csv_path = 'reordered_outfitters_products.csv'
df.to_csv(updated_csv_path, index=False)

print(f"Reordered CSV file saved as {updated_csv_path}")


Reordered CSV file saved as reordered_outfitters_products.csv
