# Outfitters Scraping Pipeline

In [6]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd
from collections import Counter
import asyncio
import time
import requests
from PIL import Image
from io import BytesIO
import os
from pymongo import MongoClient
from datetime import datetime
from bson import Binary
import io
import matplotlib.pyplot as plt
import nest_asyncio
import re
import boto3
from slugify import slugify

In [53]:
# MongoDB Configuration
# The connection string carries a username and password, so it is read from
# the MONGODB_URI environment variable instead of being stored in the notebook.
# NOTE(review): the URI that used to be hard-coded here should be treated as
# leaked and its password rotated.
MONGODB_URI = os.environ.get("MONGODB_URI", "")
if not MONGODB_URI:
    print("Warning: the MONGODB_URI environment variable is not set.")
DATABASE_NAME = "test"
COLLECTION_NAME = "products"

# Create a folder for saving images if it doesn't exist
os.makedirs("product_images", exist_ok=True)

# MongoDB Utility Functions
def get_mongo_client():
    """Build and return a new ``MongoClient`` for ``MONGODB_URI``.

    The caller owns the returned client and is responsible for closing it.
    """
    mongo_client = MongoClient(MONGODB_URI)
    return mongo_client

def fetch_all_links():
    """Fetch all product links from MongoDB.

    Returns:
        list: The ``link`` value of every document in the products collection
        that has that field. An empty list is returned (and the error printed)
        if the connection or the query fails.
    """
    # Bound before the try so the ``finally`` clause never touches an
    # undefined name when creating the client itself fails.
    client = None
    try:
        client = get_mongo_client()
        collection = client[DATABASE_NAME][COLLECTION_NAME]

        # Project only the 'link' field so whole product documents are not
        # transferred just to read one attribute.
        cursor = collection.find({}, {"link": 1, "_id": 0})
        links = [product['link'] for product in cursor if 'link' in product]
        print(f"Total links fetched: {len(links)}")
        return links
    except Exception as e:
        print(f"Error fetching links: {e}")
        return []
    finally:
        if client is not None:
            client.close()

def validate_link(link):
    """Probe ``link`` with an HTTP HEAD request.

    Returns:
        tuple: ``(link, is_valid)``. ``is_valid`` is True only when the
        response status code is exactly 200; any other status, or any
        ``requests.RequestException``, yields False.
    """
    try:
        # HEAD skips the response body, which keeps validation fast.
        status = requests.head(link, timeout=10).status_code
    except requests.RequestException:
        return (link, False)  # unreachable or otherwise failed request
    return (link, status == 200)

def validate_all_links(links):
    """Validate every link on a 10-worker thread pool, showing a progress bar.

    Returns:
        tuple: ``(valid_links, invalid_links)``. A link whose check raised an
        unexpected exception is reported and left out of both lists.
    """
    valid_links, invalid_links = [], []

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Remember which link each future belongs to, for error reporting.
        pending = {executor.submit(validate_link, url): url for url in links}

        # tqdm advances once per completed check.
        progress = tqdm(as_completed(pending), total=len(pending), desc="Validating links")
        for finished in progress:
            link = pending[finished]
            try:
                checked_link, is_valid = finished.result()
            except Exception as e:
                print(f"Error processing {link}: {e}")
                continue
            bucket = valid_links if is_valid else invalid_links
            bucket.append(checked_link)

    print(f"Valid links: {len(valid_links)}")
    print(f"Invalid links: {len(invalid_links)}")
    return valid_links, invalid_links

def remove_invalid_links_from_mongodb(invalid_links):
    """Remove invalid links from MongoDB.

    Deletes every document whose ``link`` is in ``invalid_links``. Errors are
    printed rather than raised.

    Args:
        invalid_links: Iterable of link strings to delete.
    """
    invalid_links = list(invalid_links)
    if not invalid_links:
        # Nothing to delete: skip the connection and the round-trip entirely.
        print("Removed 0 invalid links from MongoDB.")
        return

    # Bound before the try so ``finally`` is safe even if connecting fails.
    client = None
    try:
        client = get_mongo_client()
        collection = client[DATABASE_NAME][COLLECTION_NAME]

        # Remove the invalid links from the database
        result = collection.delete_many({"link": {"$in": invalid_links}})
        # Report what the server actually deleted, not what was requested.
        print(f"Removed {result.deleted_count} invalid links from MongoDB.")
    except Exception as e:
        print(f"Error removing invalid links from MongoDB: {e}")
    finally:
        if client is not None:
            client.close()

# Main workflow
def main():
    """Run the link clean-up pass and return the links that are still valid.

    Steps: read the stored links, validate them, then delete the documents
    whose links failed validation.
    """
    stored_links = fetch_all_links()
    still_valid, broken = validate_all_links(stored_links)
    remove_invalid_links_from_mongodb(broken)

    # The valid links are handed back for further processing.
    return still_valid

# Run the main function
# ``valid_links`` is deliberately left at module level: scrape_data() reads it
# to skip product links that are already stored.
if __name__ == "__main__":
    valid_links = main()
    print(f"Total valid links: {len(valid_links)}")


Total links fetched: 2092


Validating links: 100%|█████████████████████| 2092/2092 [02:24<00:00, 14.52it/s]


Valid links: 2090
Invalid links: 2
Removed 2 invalid links from MongoDB.
Total valid links: 2090


In [60]:
async def scrape_data(url, product_type, gender):
    """Scrape one collection page and append its new products to the shared list.

    Loads ``url`` in headless Chromium, scrolls until the page height stops
    growing, extracts the product links, skips links already present in the
    module-level ``valid_links``, scrapes the remaining product pages and
    extends the module-level ``all_cleaned_product_data`` with the cleaned
    records.

    Args:
        url: Collection page URL.
        product_type: Value stored under ``"Type"`` on every record.
        gender: Value stored under ``"Gender"`` on every record.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            # Scroll to load all products
            print("Scrolling to load all products...")
            previous_height = await page.evaluate("document.body.scrollHeight")

            while True:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                # Give lazily loaded products time to render before measuring.
                await page.wait_for_timeout(5000)

                new_height = await page.evaluate("document.body.scrollHeight")
                if new_height == previous_height:
                    break  # height stopped growing: nothing more was loaded
                previous_height = new_height

            html = await page.content()
            soup = BeautifulSoup(html, "html.parser")

            # Extract all product links
            product_links = extract_product_links(soup)
            print(f"Found {len(product_links)} product links.")

            # Filter out links that are already valid (i.e., already scraped).
            # A set makes each membership test O(1) instead of a list scan.
            known_links = set(valid_links)
            new_links = [link for link in product_links if link not in known_links]
            print(f"Scraping {len(new_links)} new product links.")

            # Scrape all product details
            product_details = await scrape_all_product_details(new_links, page)

            # Clean and format data
            cleaned_product_details = clean_and_additional_info(product_details, product_type, gender)

            # Append the cleaned data to the global list
            all_cleaned_product_data.extend(cleaned_product_details)

            # Print cleaned product details
            for product in cleaned_product_details:
                print(product)
                print()
        finally:
            # Release the browser even when navigation or scraping raised.
            await browser.close()


def extract_product_links(soup):
    """Collect absolute product URLs from a parsed collection page.

    Looks inside ``div.product-grid-container`` for the grid ``<li>`` items,
    takes the first ``<a href>`` of each item's card, and prefixes it with the
    store's base URL. Returns an empty list when the grid is absent.
    """
    base_url = "https://outfitters.com.pk"

    product_grid = soup.find('div', class_='product-grid-container')
    if not product_grid:
        return []

    product_links = []
    for item in product_grid.find_all('li', class_='grid__item grid-item-list'):
        card = item.find('div', class_='card card--standard card--media')
        anchor = card.find('a', href=True) if card else None
        if not anchor:
            continue
        # Join base and href with exactly one slash between them.
        relative_path = anchor['href'].lstrip('/')
        product_links.append(base_url.rstrip('/') + '/' + relative_path)

    return product_links

async def scrape_all_product_details(product_links, page):
    """Scrape each product page in turn, reusing the same browser ``page``.

    Returns the detail records of the products that could be scraped; links
    for which ``fetch_product_details`` returned nothing are left out.
    """
    collected = []

    for url in product_links:
        print(f"Scraping {url}...")
        record = await fetch_product_details(url, page)
        if not record:
            continue
        collected.append(record)

    return collected

def _parse_product_name(soup):
    """Return the ``<h1>`` text inside ``div.product__title``, or "Not Available"."""
    title_div = soup.find('div', class_='product__title')
    heading = title_div.find('h1') if title_div else None
    return heading.get_text(strip=True) if heading else "Not Available"


def _parse_price(soup):
    """Return the integer price from ``div.price__regular span.money``, or None."""
    price_div = soup.find('div', class_='price__regular')
    money_span = price_div.find('span', class_='money') if price_div else None
    if not money_span:
        return None
    # Drop thousands separators, then keep the first run of digits.
    price_match = re.search(r'\d+', money_span.get_text(strip=True).replace(",", ""))
    return int(price_match.group()) if price_match else None


def _parse_primary_image_links(soup, primary_color):
    """Return the gallery image URLs whose ``alt`` text equals ``primary_color``.

    Raises:
        ValueError: If the ``swiper-wrapper`` gallery is not in the page, so
            the caller's retry loop reloads the page with a readable message.
    """
    swiper = soup.find('div', class_='swiper-wrapper')
    if swiper is None:
        raise ValueError("image gallery ('swiper-wrapper') not found on the page")

    image_links = []
    for img in swiper.find_all('img', {'class': 'zoomImg'}):
        if img.get('alt') != primary_color:
            continue
        src = img.get('src')
        if not src:
            # One <img> without a src must not discard the whole product.
            continue
        # Remove the leading "//"
        if src.startswith("//"):
            src = src[2:]
        image_links.append(src)
    return image_links


async def fetch_product_details(link, page):
    """Load one product page and extract its details, retrying on failure.

    Args:
        link: Product page URL.
        page: Playwright page used for navigation.

    Returns:
        list | None: ``[product_name, price, colors, primary_color, sizes,
        image_links, link]``, or None once all 3 attempts have failed. A
        5 second pause separates attempts.
    """
    attempts = 3
    for attempt in range(attempts):
        try:
            await page.goto(link, timeout=15000)
            soup = BeautifulSoup(await page.content(), "html.parser")

            product_name = _parse_product_name(soup)
            price = _parse_price(soup)
            colors = extract_colors(soup)
            primary_color = select_primary_color(colors, soup)
            sizes = extract_sizes(soup)
            image_links = _parse_primary_image_links(soup, primary_color)

            return [product_name, price, colors, primary_color, sizes, image_links, link]

        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {link} due to error: {e}")
            if attempt == attempts - 1:
                print(f"Skipping {link} after {attempts} failed attempts.")
                return None
            await asyncio.sleep(5)
    return None

def extract_colors(soup):
    """Return the color names offered on a product page.

    The names are the non-empty ``title`` attributes of the ``<label>``
    elements inside ``div.color-wrapper``, in page order. An empty list is
    returned when the wrapper is missing.
    """
    color_wrapper = soup.find('div', class_='color-wrapper')
    if not color_wrapper:
        return []

    # Each swatch label carries its color name in the title attribute.
    titles = (label.get('title') for label in color_wrapper.find_all('label'))
    return [title for title in titles if title]

def select_primary_color(colors, soup):
    """Pick the color of the variant the product page is currently showing.

    Args:
        colors: Color names found on the page.
        soup: Parsed product page; only consulted when there are 2+ colors.

    Returns:
        "Not Available" when ``colors`` is empty; the only color when there is
        exactly one; otherwise the color part (text before the first '/') of
        the ``"variant"`` value in the ``<script class="analytics">`` tag, or
        None when that cannot be determined.
    """
    if not colors:
        return "Not Available"

    if len(colors) == 1:
        return colors[0]  # Return the only color if there's only one

    # Find the script tag with class 'analytics'
    script_tag = soup.find('script', class_='analytics')

    # ``.string`` can be None (e.g. an empty tag); passing None to re.search
    # would raise TypeError, so treat it like a missing script tag.
    script_content = script_tag.string if script_tag else None
    if not script_content:
        return None

    # The variant looks like "<color> / <size>"; capture the whole value first.
    match = re.search(r'"variant":"([^"]+)"', script_content)
    if not match:
        return None  # No variant found

    # Keep the part before the first '/'.
    color = match.group(1).partition('/')[0]
    if not color:
        return None  # Variant starts with '/': no color part

    # The separator is escaped in the script ("\/"), so drop the backslash
    # that is left behind, plus surrounding spaces.
    return color.replace('\\', '').strip()

def extract_sizes(soup):
    """Return the unique sizes offered on a product page, in page order.

    Sizes are the non-empty ``value`` attributes of the radio ``<input>``
    elements inside ``div.size-wrapper``. An empty list is returned when the
    wrapper is missing.
    """
    size_wrapper = soup.find('div', class_='size-wrapper')
    if not size_wrapper:
        return []

    values = (tag.get('value') for tag in size_wrapper.find_all('input', type='radio'))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is the same on every run (a set would scramble the order).
    return list(dict.fromkeys(value for value in values if value))


async def download_images(soup, product_name, primary_color):
    """Download a product's thumbnail images into ``product_images/``.

    Args:
        soup: Parsed product page.
        product_name: Used in the file name.
        primary_color: Used in the file name.

    Returns:
        list: Paths of the saved files, named
        ``product_images/<name>_<color>_<n>.jpg`` with ``n`` starting at 1.

    Raises:
        requests.RequestException: If a download fails, times out, or returns
            an HTTP error status.

    NOTE(review): the downloads use blocking ``requests`` calls inside a
    coroutine, so the event loop is blocked while they run.
    """
    # A '/' or '\' in the name would be read as a directory separator and
    # make the save fail, so replace them in the file stem.
    safe_stem = re.sub(r'[\\/]+', '_', f"{product_name}_{primary_color}")

    image_paths = []
    for idx, thumb in enumerate(soup.find_all('div', class_='product__thumb-item')):
        img_tag = thumb.find('img')
        if not (img_tag and img_tag.get('src')):
            continue

        # assumes src is protocol-relative ("//host/path") — TODO confirm
        img_url = 'https:' + img_tag['src']
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()  # do not try to decode an error page as an image
        image = Image.open(BytesIO(response.content))

        # JPEG cannot store alpha or palette modes; convert those before saving.
        if image.mode not in ("RGB", "L", "CMYK"):
            image = image.convert("RGB")

        # Save the image to disk with primary color in the filename
        image_path = f"product_images/{safe_stem}_{idx + 1}.jpg"
        image.save(image_path)
        image_paths.append(image_path)
    return image_paths

def clean_and_additional_info(product_data, product_type, gender):
    """Turn raw scrape rows into labelled records tagged with type and gender.

    Each non-empty row is a 7-item sequence
    ``(name, price, colors, primary_color, sizes, image_links, link)``;
    empty or None rows are skipped.
    """
    def as_record(raw):
        name, price, colors, primary, sizes, images, url = raw
        return {
            "Product": name,
            "Price": price,
            "Colors": colors,
            "Sizes": sizes,
            "Primary Color": primary,
            "Link": url,
            "Images": images,
            "Type": product_type,
            "Gender": gender,
        }

    return [as_record(raw) for raw in product_data if raw]

# Let asyncio be used from the notebook's already-running event loop.
nest_asyncio.apply()

# Shared accumulator: each scrape_data() call extends this list with the
# cleaned records of its category.
all_cleaned_product_data = []

# Make sure the image output folder exists.
os.makedirs("product_images2", exist_ok=True)


async def scrape_all_categories():
    """Scrape every enabled category concurrently.

    Each entry is ``(collection URL, product type, gender)``. Rows that are
    commented out are categories currently excluded from the run.
    """
    urls = [
#         ("https://outfitters.com.pk/collections/men-t-shirts", "T-Shirt", "Men"),
#         ("https://outfitters.com.pk/collections/men-sweatshirts", "Hoodies/Sweatshirts", "Men"),
#         ("https://outfitters.com.pk/collections/men-sweater-1", "Sweaters/Cardigans", "Men"),
#         ("https://outfitters.com.pk/collections/men-jacket-1", "Jackets/Coats", "Men"),
#         ("https://outfitters.com.pk/collections/men-activewear", "Activewear", "Men"),
#         ("https://outfitters.com.pk/collections/men-polo-shirts", "Polo", "Men"),
#         ("https://outfitters.com.pk/collections/men-shirts", "Shirt", "Men"),
        ("https://outfitters.com.pk/collections/men-denim-collection", "Jeans", "Men"),
        ("https://outfitters.com.pk/collections/men-trousers", "Trousers", "Men"),
        ("https://outfitters.com.pk/collections/men-shorts", "Shorts", "Men"),
        ("https://outfitters.com.pk/collections/women-t-shirts", "T-Shirt", "Women"),
        ("https://outfitters.com.pk/collections/women-sweatshirts", "Hoodies/Sweatshirts", "Women"),
        ("https://outfitters.com.pk/collections/women-co-ord-sets", "CO-ORD", "Women"),
#         ("https://outfitters.com.pk/collections/women-shirts", "Shirt", "Women"),
#         ("https://outfitters.com.pk/collections/women-denim-collection", "Jeans", "Women"),
#         ("https://outfitters.com.pk/collections/women-jacket-1", "Jackets/Coats", "Women"),
#         ("https://outfitters.com.pk/collections/women-dresses-and-jumpsuit", "Dresses/Skirts", "Women"),
#         ("https://outfitters.com.pk/collections/women-activewear", "Activewear", "Women"),
#         ("https://outfitters.com.pk/collections/women-trouser", "Trousers", "Women"),
#         ("https://outfitters.com.pk/collections/women-shorts-sale", "Shorts", "Women"),
#         ("https://outfitters.com.pk/collections/women-sweaters-sale", "Sweaters/Cardigans", "Women"),

    ]

    # One scrape_data coroutine per enabled category, all awaited together.
    category_jobs = [scrape_data(*category) for category in urls]
    await asyncio.gather(*category_jobs)

    
# Top-level await works in a notebook cell; a plain script would need
# asyncio.run(scrape_all_categories()) instead.
await scrape_all_categories()

# Report how many products were collected, then dump the records.
print(len(all_cleaned_product_data))
print(all_cleaned_product_data)

Scrolling to load all products...
Scrolling to load all products...
Scrolling to load all products...
Scrolling to load all products...
Scrolling to load all products...
Scrolling to load all products...
Found 4 product links.
Scraping 0 new product links.
Found 1 product links.
Scraping 0 new product links.
Found 41 product links.
Scraping 0 new product links.
Found 37 product links.
Scraping 0 new product links.
Found 57 product links.
Scraping 0 new product links.
Found 92 product links.
Scraping 0 new product links.
0
[]


In [55]:
# AWS S3 Configuration
# Credentials are read from the standard AWS environment variables instead of
# being stored in the notebook.
# NOTE(review): the access key pair that used to be hard-coded here should be
# treated as leaked and deactivated/rotated in IAM.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
BUCKET_NAME = "shop-savvy"
REGION = "eu-north-1"

# Initialize S3 client
# When the variables are unset, None is passed for both keys, which is the
# same as omitting them: boto3 then resolves credentials on its own.
s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
    region_name=REGION,
)

def optimize_and_upload_image(image_url, product_name, idx):
    """Download an image, shrink and re-encode it, and upload it to S3.

    Args:
        image_url: Source URL; ``https://`` is prepended when it does not
            start with ``http``.
        product_name: Slugified to form the S3 key prefix.
        idx: Index of the image within the product, used in the file name.

    Returns:
        str | None: URL of the uploaded object
        (``https://<bucket>.s3.<region>.amazonaws.com/<slug>/image_<idx>.<ext>``),
        or None if any step failed (the error is printed).

    NOTE(review): the key depends only on the product name, so two products
    with the same name write to the same keys and the later upload replaces
    the earlier one — confirm names are unique or add a distinguishing prefix.
    """
    try:
        # Ensure the image URL has the proper scheme (https://)
        if not image_url.startswith("http"):
            image_url = "https://" + image_url

        # Download the image; the timeout keeps one stalled request from
        # hanging the whole run.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)

        image = Image.open(io.BytesIO(response.content))
        source_format = image.format

        # Resize in place to fit within 1200x1200
        image.thumbnail((1200, 1200))

        # JPEG sources stay JPEG; everything else is written as PNG
        image_format = "JPEG" if source_format == "JPEG" else "PNG"

        compressed_image = io.BytesIO()
        image.save(compressed_image, format=image_format, quality=75, optimize=True)
        compressed_image.seek(0)  # rewind so the upload reads from the start

        # Generate a product-based S3 key
        product_slug = slugify(product_name)  # Converts "Loose Pleated Trousers" -> "loose-pleated-trousers"
        s3_key = f"{product_slug}/image_{idx}.{image_format.lower()}"

        # Upload with an explicit Content-Type so the object is stored as an
        # image rather than with a generic binary type.
        s3.upload_fileobj(
            compressed_image,
            BUCKET_NAME,
            s3_key,
            ExtraArgs={"ContentType": f"image/{image_format.lower()}"},
        )

        # Generate the new public URL for the image
        return f"https://{BUCKET_NAME}.s3.{REGION}.amazonaws.com/{s3_key}"

    except Exception as e:
        # Best effort: a failed image is reported and skipped by the caller.
        print(f"Error processing {image_url}: {e}")
        return None  # Return None if an image fails

# Re-host every product's images on S3, printing progress along the way.
# Products are counted from 1 in the progress messages.
for product_idx, product in enumerate(all_cleaned_product_data, start=1):
    print(f"Processing product {product_idx}/{len(all_cleaned_product_data)}: {product['Product']}")

    uploaded_urls = []

    for idx, image_url in enumerate(product['Images']):
        print(f"Processing image {idx + 1}/{len(product['Images'])} for product {product['Product']}")

        s3_url = optimize_and_upload_image(image_url, product["Product"], idx)
        if not s3_url:
            continue  # a failed upload is left out of the product's image list
        uploaded_urls.append(s3_url)

    # Swap the scraped source URLs for the S3 URLs.
    product['Images'] = uploaded_urls

    print(f"Updated product {product_idx}/{len(all_cleaned_product_data)}: {product['Product']}")


Processing product 1/1: Basic Sweatshirt
Processing image 1/7 for product Basic Sweatshirt
Processing image 2/7 for product Basic Sweatshirt
Processing image 3/7 for product Basic Sweatshirt
Processing image 4/7 for product Basic Sweatshirt
Processing image 5/7 for product Basic Sweatshirt
Processing image 6/7 for product Basic Sweatshirt
Processing image 7/7 for product Basic Sweatshirt
Updated product 1/1: Basic Sweatshirt


In [56]:
# Inspect the records after their image URLs were replaced with S3 URLs.
print(len(all_cleaned_product_data))
print(all_cleaned_product_data)

1
[{'Product': 'Basic Sweatshirt', 'Price': 1990, 'Colors': ['Black', 'Off White', 'Light Grey Marl'], 'Sizes': ['M', 'L', 'XL', 'S'], 'Primary Color': 'Black', 'Link': 'https://outfitters.com.pk/collections/men-sweatshirts/products/f0484-107?variant=43703865016511', 'Images': ['https://shop-savvy.s3.eu-north-1.amazonaws.com/basic-sweatshirt/image_0.jpeg', 'https://shop-savvy.s3.eu-north-1.amazonaws.com/basic-sweatshirt/image_1.jpeg', 'https://shop-savvy.s3.eu-north-1.amazonaws.com/basic-sweatshirt/image_2.jpeg', 'https://shop-savvy.s3.eu-north-1.amazonaws.com/basic-sweatshirt/image_3.jpeg', 'https://shop-savvy.s3.eu-north-1.amazonaws.com/basic-sweatshirt/image_4.jpeg', 'https://shop-savvy.s3.eu-north-1.amazonaws.com/basic-sweatshirt/image_5.jpeg', 'https://shop-savvy.s3.eu-north-1.amazonaws.com/basic-sweatshirt/image_6.jpeg'], 'Type': 'Hoodies/Sweatshirts', 'Gender': 'Men'}]


In [57]:
# Load the scraped records into a DataFrame for clean-up.
products_copy = pd.DataFrame(all_cleaned_product_data)

# Title-case the product names so they are stored consistently.
products_copy['Product'] = products_copy['Product'].str.title()

# Size labels to rename; sizes not listed here are kept unchanged.
size_mapping = {'XXL': '2XL'}


def _normalize_sizes(size_list):
    """Apply ``size_mapping`` to each size, leaving unmapped sizes as they are."""
    return [size_mapping.get(size, size) for size in size_list]


products_copy['Sizes'] = products_copy['Sizes'].apply(_normalize_sizes)

# Function to update 'Type' based on 'Product'
def update_bottom_type(row):
    """Classify a bottom-wear row as 'Jeans', 'Shorts' or 'Trousers'.

    The decision is a case-insensitive substring check on ``row['Product']``:
    'jean' wins over 'shorts', and anything else is 'Trousers'.
    """
    name = row['Product'].lower()

    # 'jean' also covers 'jeans', so one substring test is enough.
    if 'jean' in name:
        return 'Jeans'
    if 'shorts' in name:
        return 'Shorts'
    return 'Trousers'

# Scraper category labels -> display labels. Labels that are not listed here
# (for example 'Jeans') keep their original value.
type_mappings = {
    'Hoodies/Sweatshirts': 'Hoodies & Sweatshirts',
    'Sweaters/Cardigans': 'Sweaters & Cardigans',
    'Jackets/Coats': 'Jackets & Coats',
    'Shirt': 'Shirts',
    'Dresses/Skirts': 'Dresses & Skirts',
    'CO-ORD': 'Co-ords',
}

# map() leaves NaN where a label has no mapping; fillna() restores the
# original label in those rows.
mapped_types = products_copy['Type'].map(type_mappings)
products_copy['Type'] = mapped_types.fillna(products_copy['Type'])

# Store color names grouped by the filter family they collapse into.
# Grouping by family keeps each shade listed exactly once (the flat literal
# this replaces listed 'Blush Coral' twice) and makes adding a shade a
# one-word change.
_COLOR_FAMILIES = {
    'Black': (
        'Black', 'All Black', 'Charcoal Black', 'Black Matte',
    ),
    'White': (
        'White', 'Skin', 'Off White', 'Ivory', 'Oat White', 'Cream', 'Acru',
        'Ecru', 'Antique White',
    ),
    'Grey': (
        'Grey', 'Charcoal', 'Metal', 'Dark Grey', 'Anthracite Grey',
        'Light Grey', 'Heather Charcoal', 'Melange Grey', 'Heather Grey',
        'Slate Grey', 'Dark Grey Marl', 'Medium Grey Marl', 'Light Grey Marl',
        'Pale Grey', 'Mid Grey',
    ),
    'Red': (
        'Red', 'Dark Red', 'Burgundy', 'Maroon', 'Wine', 'Rust', 'Brick Red',
        'Salmon', 'Crimson', 'Cherry Red', 'Deep Maroon',
    ),
    'Blue': (
        'Blue', 'Dusty Blue', 'Persian Blue', 'Skyway', 'Navy',
        'Midnight Blue', 'Crystal Blue', 'Royal Blue', 'Cobalt', 'Cobalt Blue',
        'Sky Blue', 'Light Aqua', 'Ice Blue', 'Denim Blue', 'Indigo Blue',
        'Mid Blue', 'Light Blue', 'Dark Blue', 'Deep Blue', 'Blue Ice', 'Teal',
        'Navy Blue', 'Melange Navy', 'Pale Blue', 'Light Navy', 'Grey Blue',
    ),
    'Green': (
        'Green', 'Lime', 'Antique Moss', 'Dark Green', 'Olive', 'Dark Olive',
        'Mid Olive', 'Olive Green', 'Forest Green', 'Mint Green',
        'Matcha Green', 'Peacock', 'Emerald', 'Grass Green', 'Apple Green',
        'Sea Green', 'Light Olive',
    ),
    'Brown': (
        'Brown', 'Slate Brown', 'Mocha', 'Dark Brown', 'Chocolate',
        'Chocolate Brown', 'Coffee', 'Caramel', 'Mushroom', 'Coconut Milk',
        'Peanut', 'Honey', 'Tan', 'Camel', 'Spice', 'Mink', 'Taupe',
        'Brown Grey',
    ),
    'Pink': (
        'Pink', 'Light Pink', 'Vanilla Ice', 'Dirty Pink', 'Blush Pink',
        'Tea Pink', 'Baby Coral', 'Blush Coral', 'Pale Pink', 'Pale Coral',
        'Dusty Mauve', 'Rusty Pink',
    ),
    'Purple': (
        'Purple', 'Plum', 'Plum Purple', 'Lavender Blue', 'Bright Purple',
        'Violet', 'Dusty Purple', 'Cloudy Violet', 'Grape', 'Twilight',
        'Smoky Grape',
    ),
    'Yellow': (
        'Yellow', 'Pale Yellow', 'Mustard', 'Golden', 'Yellow Beige', 'Stone',
        'Cyberlime',
    ),
    'Beige': (
        'Beige', 'Khaki', 'Khaaki', 'Sand', 'Oatmeal', 'Light Khaki',
    ),
    'Orange': (
        'Orange', 'Italian Clay',
    ),
    'Multi-color': (
        'Multi-color', 'Multi Color', 'Multi Colour', 'Multi',
    ),
}

# Flat shade -> family lookup. Keys are matched exactly (case-sensitive).
color_individual_mapping = {
    shade: family
    for family, shades in _COLOR_FAMILIES.items()
    for shade in shades
}

# Function to map a single color to its category using the provided color_individual_mapping
def map_single_color(color, color_individual_mapping):
    """Return the family ``color`` belongs to, or 'Other' for unknown colors."""
    if color in color_individual_mapping:
        return color_individual_mapping[color]
    return 'Other'

# Collapse each product's primary color into its filter family
# ('Other' when the color is not in the mapping).
products_copy['FilterColor'] = products_copy['Primary Color'].apply(
    lambda shade: map_single_color(shade, color_individual_mapping)
)
# Every row scraped by this notebook belongs to the Outfitters brand.
products_copy['Brand'] = 'Outfitters'

In [58]:
# Remove rows where any of the critical columns is missing.
# The scraper reports a missing price as None (NaN once it is in the
# DataFrame) and can return None for the primary color, so null values are
# treated the same as the "Not Available" placeholder. Without this, a null
# price survives the filter and fails later in int(float(price)).
price_missing = products_copy['Price'].isna() | (products_copy['Price'] == 'Not Available')
primary_color_missing = (
    products_copy['Primary Color'].isna()
    | (products_copy['Primary Color'] == 'Not Available')
)
# An empty color list means no colors were found on the page.
colors_missing = products_copy['Colors'].apply(lambda x: not x).astype(bool)

products_copy = products_copy[~(price_missing | primary_color_missing | colors_missing)]

In [59]:
# MongoDB Configuration
# The connection string carries credentials, so it comes from the MONGODB_URI
# environment variable rather than the notebook.
# NOTE(review): the URI previously hard-coded here should be rotated.
MONGODB_URI = os.environ.get("MONGODB_URI", "")
DATABASE_NAME = "test"
COLLECTION_NAME = "products"

# Convert DataFrame to MongoDB format
records = products_copy.to_dict(orient="records")

# Format the records to match MongoDB schema. This happens before connecting
# so that a bad record cannot leave an open connection behind.
formatted_records = [
    {
        "product": record["Product"],
        "price": int(float(record["Price"])),
        "colors": record["Colors"],
        "sizes": record["Sizes"],
        "primary_color": record["Primary Color"],
        "link": record["Link"],
        "images": record["Images"],
        "type": record["Type"],
        "gender": record["Gender"],
        "filtercolor": record["FilterColor"],
        "brand": record["Brand"],
        "status": "valid",  # Assuming all new entries are valid
    }
    for record in records
]

# Connect to MongoDB
client = MongoClient(MONGODB_URI)
try:
    db = client[DATABASE_NAME]
    collection = db[COLLECTION_NAME]

    # Insert into MongoDB
    if formatted_records:
        collection.insert_many(formatted_records)
        print(f"Inserted {len(formatted_records)} new records successfully.")
    else:
        print("No records to insert.")
finally:
    # Close the connection even when the insert fails
    client.close()

Inserted 1 new records successfully.


In [64]:
# The connection string carries credentials, so it comes from the MONGODB_URI
# environment variable rather than the notebook.
# NOTE(review): the URI previously hard-coded here should be rotated.
MONGODB_URI = os.environ.get("MONGODB_URI", "")
DATABASE_NAME = "test"
COLLECTION_NAME = "products"

# MongoDB Utility Functions
def get_mongo_client():
    """Build and return a new ``MongoClient`` for ``MONGODB_URI``.

    The caller owns the returned client and is responsible for closing it.
    """
    mongo_client = MongoClient(MONGODB_URI)
    return mongo_client

# Function to extract all products and track duplicate IDs based on the 'link' attribute
def extract_and_remove_duplicates():
    """Delete products whose ``link`` duplicates an earlier product's link.

    Documents are read in the order the query returns them; the first document
    seen for a link is kept and every later one with the same link is deleted
    by ``_id``. Documents without a (truthy) ``link`` are ignored.
    """
    client = get_mongo_client()
    try:
        collection = client[DATABASE_NAME][COLLECTION_NAME]

        seen_links = set()
        duplicate_product_ids = []

        # Only 'link' is needed for the comparison ('_id' is included by
        # default), so the rest of each document is not fetched.
        for product in collection.find({}, {"link": 1}):
            product_link = product.get('link')
            if not product_link:
                continue

            if product_link in seen_links:
                # Link already seen: this document is a duplicate
                duplicate_product_ids.append(product.get('_id'))
            else:
                seen_links.add(product_link)

        # Remove duplicate products based on the collected IDs
        if duplicate_product_ids:
            collection.delete_many({'_id': {'$in': duplicate_product_ids}})
            print(f"Removed {len(duplicate_product_ids)} duplicate products based on their IDs.")
        else:
            print("No duplicates found.")
    finally:
        # The client was never closed before; release it on every path.
        client.close()

# Call the function to extract duplicates and remove them from MongoDB.
# This runs after the insert cell, which adds records without checking
# whether their link already exists.
extract_and_remove_duplicates()

No duplicates found.
