# Set-Up

In [1]:
# Install requirements

#!pip install -r requirements.txt

In [2]:
# Import packages

import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import plotly.express as px
import plotly.graph_objects as go
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import urlencode
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
from bs4 import BeautifulSoup

# Item Scraping

In [3]:
# Lookup from size label to the numeric ID passed as the `size_ids[]` search filter

size_ids = {
    'XXS': 1226,
    'XS': 2,
    'S': 3,
    'M': 4,
    'L': 5,
    'XL': 6,
    'XXL': 7,
    'XXXL': 310,
}

In [4]:
# Define function to extract item details from the alt text

def parse_alt_text(alt_text):
    """
    Parses the alt text of an image to extract item details.

    The patterns below look for a leading title, the labelled fields
    ``brand:``, ``condition:`` and ``size:``, a ``£x.xx`` item price that is
    followed by a comma, and a ``£x.xx includes Buyer Protection`` total.
    Any field that is not found is returned as None.
    Args:
        alt_text (str): The alt text of the image. None (or any other
            non-string value) is treated as text with no recognisable fields.
    Returns:
        dict: A dictionary with the keys 'title', 'brand', 'condition',
            'size', 'price' and 'total_price'. 'price' keeps its '£' prefix and
            'total_price' keeps the ' includes Buyer Protection' suffix.
    """
    # A missing alt attribute must not crash the scrape; fall through with no fields
    if not isinstance(alt_text, str):
        alt_text = ""

    # Initialize a dictionary to hold the extracted details
    item_details = {}

    # The title ends where the first labelled field or price begins, so a title
    # that itself contains commas is kept whole instead of being cut at its
    # first comma.
    boundary = re.search(
        r',\s*(?:(?:brand|condition|size):|£\d+\.\d{2})',
        alt_text,
        re.IGNORECASE,
    )
    if boundary:
        title = alt_text[:boundary.start()]
    else:
        # No recognisable field: fall back to the text before the first comma
        head, comma, _ = alt_text.partition(',')
        title = head if comma else None
    item_details['title'] = title.strip() if title is not None else None

    # Labelled fields share the same "<label>: <value up to the next comma>" shape
    for field in ('brand', 'condition', 'size'):
        field_match = re.search(field + r':\s*([^,]+)', alt_text, re.IGNORECASE)
        item_details[field] = field_match.group(1).strip() if field_match else None

    # Item price: the first £ amount that is directly followed by a comma
    price_match = re.search(r'£\d+\.\d{2}(?=,)', alt_text)
    item_details['price'] = price_match.group(0) if price_match else None

    # Total price including buyer protection (suffix kept in the returned text)
    total_price_match = re.search(r'£\d+\.\d{2}\s*includes Buyer Protection', alt_text)
    item_details['total_price'] = total_price_match.group(0) if total_price_match else None

    return item_details

In [5]:
def search_vinted_items(query, max_pages=1, filters=None, get_description=True):
    """
    Searches Vinted for items matching the query and extracts metadata.

    Each results page is loaded in a headless Chrome session, parsed with
    BeautifulSoup, and every ``div.feed-grid__item`` is turned into one record
    via ``parse_alt_text``. The browser is always shut down before returning,
    including when a page load or parse step raises.
    Args:
        query (str): The search query.
        max_pages (int): The maximum number of pages to scrape.
        filters (dict): Additional filters to apply to the search; merged into
            the URL query parameters.
        get_description (bool): Whether to get the seller's description from the item page.
    Returns:
        list: A list of dictionaries containing the extracted metadata: the
            fields from ``parse_alt_text`` plus 'seller_description',
            'item_url' and 'image_url'. Values that could not be read are "N/A"
            ('seller_description' is "Error" when the item page failed to load).
    """
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)

    base_url = "https://www.vinted.co.uk/catalog"

    # Initialize an empty list to hold the extracted details
    results = []

    # try/finally guarantees the Chrome process is closed even if scraping fails
    try:
        for page in range(1, max_pages + 1):
            params = {
                "search_text": query,
                "page": page
            }

            # Add additional filters to the parameters
            if filters:
                params.update(filters)

            # Construct the URL with parameters (doseq expands list-valued filters)
            url = base_url + "?" + urlencode(params, doseq=True)
            driver.get(url)
            time.sleep(10)  # Wait for JS

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            items = soup.find_all("div", class_="feed-grid__item")

            if not items:
                print(f"No items found on page {page}")
                continue

            for item in items:
                # Extract the item details
                link_tag = item.find("a", href=True)
                img_tag = item.find("img")
                item_url = link_tag["href"] if link_tag else "N/A"
                # .get avoids a KeyError when the <img> has no src/alt attribute
                image_url = img_tag.get("src", "N/A") if img_tag else "N/A"
                alt_text = img_tag.get("alt", "N/A") if img_tag else "N/A"
                parsed_details = parse_alt_text(alt_text)

                # Go to detail page for each item to extract seller description
                seller_description = "N/A"
                if get_description and item_url != "N/A":
                    cleaned_url = item_url.strip()
                    try:
                        driver.get(cleaned_url)
                        time.sleep(2)
                        detail_soup = BeautifulSoup(driver.page_source, 'html.parser')
                        desc_span = detail_soup.find(
                            "span",
                            class_="web_ui__Text__text web_ui__Text__body web_ui__Text__left web_ui__Text__format"
                        )
                        if desc_span:
                            seller_description = desc_span.get_text(strip=True)
                    except Exception as e:
                        # Best effort: one failing item page must not abort the whole scrape
                        print(f"Error loading {cleaned_url}: {e}")
                        seller_description = "Error"

                parsed_details['seller_description'] = seller_description

                # Append the urls to the parsed details dictionary
                parsed_details['item_url'] = item_url
                parsed_details['image_url'] = image_url
                results.append(parsed_details)
    finally:
        driver.quit()

    return results


In [27]:
# Run the Vinted search

# Extra query parameters merged into the search URL
filters = {
    #"size_ids[]": size_ids['XS'],
    "price_to": 30
}

# Scrape 10 result pages for "y2k" without visiting each item's detail page
y2k_0508 = search_vinted_items(
    query="y2k",
    max_pages=10,
    filters=filters,
    get_description=False,
)

In [28]:
# Load the scraped records (one dict per listing) into a DataFrame and show it

df = pd.DataFrame(data=y2k_0508)
df

Unnamed: 0,title,brand,condition,size,price,total_price,seller_description,item_url,image_url
0,Boho style necklace,NO LABEL,Very good,,£2.00,£2.80 includes Buyer Protection,,https://www.vinted.co.uk/items/6286088724-boho...,https://images1.vinted.net/t/04_0013c_jHR7akFQ...
1,Bang on the door groovy chick mug vintage sant...,Groovy Chick,Good,,£1.99,£2.79 includes Buyer Protection,,https://www.vinted.co.uk/items/6288373201-bang...,https://images1.vinted.net/t/04_02051_89wpwhgw...
2,Y2K style green embellished sequin scarf,NO LABEL,New without tags,,£1.00,£1.75 includes Buyer Protection,,https://www.vinted.co.uk/items/6289774101-y2k-...,https://images1.vinted.net/t/04_016d4_3gjpmGTT...
3,Laura Ashley Bag,Laura Ashley,Very good,,£4.50,£5.43 includes Buyer Protection,,https://www.vinted.co.uk/items/6287420013-laur...,https://images1.vinted.net/t/01_01aa8_dauDFoBr...
4,y2k fitted blouse short-sleeved,Topshop,Good,XS / 6,£3.00,£3.85 includes Buyer Protection,,https://www.vinted.co.uk/items/6285846256-y2k-...,https://images1.vinted.net/t/02_00fea_uFy3ww51...
...,...,...,...,...,...,...,...,...,...
995,y2k coquette polka dot baby yellow sheer linge...,silver ox,Good,S / 8,£15.00,£16.45 includes Buyer Protection,,https://www.vinted.co.uk/items/6290742416-y2k-...,https://images1.vinted.net/t/04_02373_JKz4LSLw...
996,Vintage Y2K ruffle mohair wool ribbon tie card...,Made In Italy,Very good,M / 10,£23.00,£24.85 includes Buyer Protection,,https://www.vinted.co.uk/items/6287019620-vint...,https://images1.vinted.net/t/04_000f9_Hs4MtMo8...
997,Vintage Y2K low rise denim mini skirt,Vintage Dressing,Very good,M / 10,£24.00,£25.90 includes Buyer Protection,,https://www.vinted.co.uk/items/6290807981-vint...,https://images1.vinted.net/t/04_01245_Srh8yQQP...
998,Y2K vintage sporty style graphic print racer v...,y2k,Very good,M / 10,£21.00,£22.75 includes Buyer Protection,,https://www.vinted.co.uk/items/6281210013-y2k-...,https://images1.vinted.net/t/02_017b6_aFwkGd6y...


In [29]:
# Data Cleaning

def clean_df(df):
    """
    Returns a tidied version of the scraped listings.

    Duplicate rows and rows containing any NaN are discarded, the text
    columns are lowercased, and the two price columns are converted from
    '£'-prefixed strings to floats.
    Args:
        df (DataFrame): The scraped listings to clean.
    Returns:
        DataFrame: The cleaned listings; surviving rows keep their original
            index labels.
    """
    text_columns = ['title', 'brand', 'condition', 'size', 'seller_description']

    # Discard exact duplicate rows, then every row that has a missing value
    kept = df.drop_duplicates().dropna()

    # Lowercased version of each free-text column
    lowered = {name: kept[name].str.lower() for name in text_columns}

    # Strip the currency symbol (and the Buyer Protection suffix) before casting
    item_price = kept['price'].str.replace('£', '').astype(float)
    total_price = (
        kept['total_price']
        .str.replace(' includes Buyer Protection', '')
        .str.replace('£', '')
        .astype(float)
    )

    return kept.assign(**lowered, price=item_price, total_price=total_price)


In [30]:
# Display the cleaned DataFrame

clean_df = clean_df(df)
clean_df

Unnamed: 0,title,brand,condition,size,price,total_price,seller_description,item_url,image_url
4,y2k fitted blouse short-sleeved,topshop,good,xs / 6,3.0,3.85,,https://www.vinted.co.uk/items/6285846256-y2k-...,https://images1.vinted.net/t/02_00fea_uFy3ww51...
5,women’s y2k denim jeans,denim,satisfactory,other,1.5,2.28,,https://www.vinted.co.uk/items/6283612810-wome...,https://images1.vinted.net/t/04_007e7_5g3SLLmS...
6,y2k top,shein,very good,s / 8,1.0,1.75,,https://www.vinted.co.uk/items/6286188391-y2k-...,https://images1.vinted.net/t/04_0062c_PpjBb8ZC...
9,black and silver floral knitted embroidered sw...,new look,very good,m / 10,2.5,3.33,,https://www.vinted.co.uk/items/6288480073-blac...,https://images1.vinted.net/t/04_01bfb_qUoNdGgm...
10,cute y2k silk style lace baby blue / turquoise...,y2k,very good,s / 8,2.0,2.80,,https://www.vinted.co.uk/items/6284909987-cute...,https://images1.vinted.net/t/04_0172d_qipjx8FH...
...,...,...,...,...,...,...,...,...,...
995,y2k coquette polka dot baby yellow sheer linge...,silver ox,good,s / 8,15.0,16.45,,https://www.vinted.co.uk/items/6290742416-y2k-...,https://images1.vinted.net/t/04_02373_JKz4LSLw...
996,vintage y2k ruffle mohair wool ribbon tie card...,made in italy,very good,m / 10,23.0,24.85,,https://www.vinted.co.uk/items/6287019620-vint...,https://images1.vinted.net/t/04_000f9_Hs4MtMo8...
997,vintage y2k low rise denim mini skirt,vintage dressing,very good,m / 10,24.0,25.90,,https://www.vinted.co.uk/items/6290807981-vint...,https://images1.vinted.net/t/04_01245_Srh8yQQP...
998,y2k vintage sporty style graphic print racer v...,y2k,very good,m / 10,21.0,22.75,,https://www.vinted.co.uk/items/6281210013-y2k-...,https://images1.vinted.net/t/02_017b6_aFwkGd6y...


In [32]:
# Export to CSV
clean_df.to_csv('y2k_0508.csv', index=False)