In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

BASE_URL = "https://priceoye.pk"
START_URL = "https://priceoye.pk/wireless-earbuds"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

all_products = []

def get_soup(url):
    """Fetch a page and return BeautifulSoup"""
    r = requests.get(url, headers=HEADERS)
    if r.status_code != 200:
        print(f"⚠️ Failed to load {url}")
        return None
    return BeautifulSoup(r.text, "html.parser")


def extract_product_links(page_soup):
    """Extract product info from listing page"""
    products = []
    product_cards = page_soup.select("div.productBox.b-productBox")
    print(f"Found {len(product_cards)} product cards on this page.")

    for card in product_cards:
        # --- Product Name ---
        name_tag = card.select_one(".p-title")
        name = name_tag.get_text(strip=True) if name_tag else "N/A"

        # --- Product Price ---
        price_tag = card.select_one(".price-box span")
        price = price_tag.get_text(strip=True) if price_tag else "N/A"

        # --- Old Price and Discount ---
        old_price_tag = card.select_one(".price-diff-retail span")
        old_price = old_price_tag.get_text(strip=True) if old_price_tag else None

        discount_tag = card.select_one(".price-diff-saving")
        discount = discount_tag.get_text(strip=True) if discount_tag else None

        # --- Rating and Reviews ---
        rating_tag = card.select_one(".user-rating-content .h6.bold")
        rating = rating_tag.get_text(strip=True) if rating_tag else "N/A"

        reviews_tag = card.select_one(".user-rating-content .rating-h7.space")
        reviews = reviews_tag.get_text(strip=True) if reviews_tag else "N/A"

        # --- Product Link ---
        link_tag = card.select_one("a")
        href = link_tag["href"] if link_tag and "href" in link_tag.attrs else None
        product_url = href if href and href.startswith("http") else BASE_URL + href if href else None

        # --- Image ---
        img_tag = card.select_one("img")
        image_url = img_tag.get("src") if img_tag else None

        products.append({
            "name": name,
            "price": price,
            "old_price": old_price,
            "discount": discount,
            "rating": rating,
            "reviews": reviews,
            "product_url": product_url,
            "image_url": image_url
        })

    return products


def extract_product_details(product_url):
    """Extract detailed specs from product page"""
    soup = get_soup(product_url)
    if not soup:
        return None

    specs = {}
    # Each spec section like General Features, Connectivity, Battery
    sections = soup.select("div.p-spec-table.card")
    for section in sections:
        heading = section.select_one("h6")
        category = heading.get_text(strip=True) if heading else "Other"
        dt_tags = section.select("dt.spec-term")
        dd_tags = section.select("dd.spec-detail")
        for dt, dd in zip(dt_tags, dd_tags):
            key = f"{category} - {dt.get_text(strip=True)}"
            value = dd.get_text(strip=True)
            specs[key] = value

    return {"specifications": specs}


# Scrape all pages
page_number = 1
while True:
    page_url = f"{START_URL}?page={page_number}"
    print(f"\n🔎 Scraping listing page {page_number}: {page_url}")
    soup = get_soup(page_url)
    if not soup:
        break

    products = extract_product_links(soup)
    if not products:
        print("No more products found. Stopping.")
        break

    for p in products:
        if not p["product_url"]:
            continue

        print(f"➡️ {p['name']}")
        details = extract_product_details(p["product_url"])
        if details:
            p.update(details)

        all_products.append(p)
        time.sleep(random.uniform(1, 2))  # polite delay

    # Stop if there's no next page
    next_btn = soup.select_one("li.page-item a[aria-label='Next']")
    if not next_btn or "disabled" in next_btn.get("class", []):
        break

    page_number += 1
    time.sleep(random.uniform(1, 2))

# Save to CSV
df = pd.DataFrame(all_products)
df.to_csv("airbuds_data.csv", index=False, encoding="utf-8-sig")
print(f"\n✅ Done! {len(all_products)} products saved to airbuds_data.csv")



🔎 Scraping listing page 1: https://priceoye.pk/wireless-earbuds?page=1
Found 36 product cards on this page.
➡️ M10 TWS Wireless Bluetooth Earbuds
➡️ A6s Wireless Bluetooth Earbuds
➡️ Airs pro 2 Tws wireless Earbuds
➡️ Sound Style T10 Stereo Earbuds
➡️ A9 Pro Airpods
➡️ Lenovo HE05X Neckband Wireless
➡️ Itel BudsNeo 3 Wireless Earbuds
➡️ M25 Gaming Wireless Earbuds
➡️ Faster S5 ANC Special Edition Wireless Headphones
➡️ Air 39 Gaming Earbuds
➡️ N35 Gaming Wireless Earbuds
➡️ Ronin R-190 Earbuds
➡️ M90 Pro TWS Gaming Earbuds
➡️ Realme Buds Air 7
➡️ M28 TWS Wireless Earbuds
➡️ Lenovo HE05 TWS Wireless Earbuds
➡️ i12 Tws Touch Sensor Airpods
➡️ Redmi Buds 6 Active Wireless Earbuds
➡️ Audionic Trance Airbud 850 with 6 Mics ANC
➡️ Audionic Battlebuds Wireless Earbuds
➡️ Ronin R-7080 NOX Earbuds
➡️ Zero Wave Pro Earbuds
➡️ Anker Soundcore Space One Headphones
➡️ QCY MeloBuds Neo T31
➡️ M30 TWS Wireless Earbuds
➡️ Audionic Airbud 625 Pro Wireless Earbuds
➡️ Nothing Buds Pro 2
➡️ M20 TWS Wirel

In [2]:
import pandas as pd
import numpy as np
import ast  # to safely convert string to dictionary

# Load your CSV file
df = pd.read_csv("airbuds_data.csv")

# Identify the last column (the one containing dictionary-like strings)
last_col = df.columns[-1]
print(f"Expanding column: {last_col}")

# --- Step 1: Convert string dictionaries to real Python dicts safely ---
df[last_col] = df[last_col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else {})

# --- Step 2: Normalize/expand these dicts into individual columns ---
expanded_df = pd.json_normalize(df[last_col])

# --- Step 3: Concatenate expanded features with the original dataset ---
df_final = pd.concat([df.drop(columns=[last_col]), expanded_df], axis=1)

# --- Step 4: Save and preview ---
print("✅ New DataFrame created with expanded feature columns!")
print(df_final.head())

# Save to new CSV file
df_final.to_csv("airbuds_data_expanded.csv", index=False)


Expanding column: specifications
✅ New DataFrame created with expanded feature columns!
                                 name    price old_price discount  rating  \
0  M10 TWS Wireless Bluetooth Earbuds    Rs749   Rs2,499  70% OFF     4.7   
1      A6s Wireless Bluetooth Earbuds    Rs699   Rs2,999  77% OFF     5.0   
2     Airs pro 2 Tws wireless Earbuds    Rs749   Rs2,000  63% OFF     4.6   
3      Sound Style T10 Stereo Earbuds  Rs1,699   Rs2,999  43% OFF     5.0   
4                      A9 Pro Airpods  Rs1,649   Rs5,000  67% OFF     4.7   

   reviews                                        product_url  \
0   2614.0  https://priceoye.pk/wireless-earbuds/assorted/...   
1      3.0  https://priceoye.pk/wireless-earbuds/assorted/...   
2    148.0  https://priceoye.pk/wireless-earbuds/assorted/...   
3      3.0  https://priceoye.pk/wireless-earbuds/assorted/...   
4    179.0  https://priceoye.pk/wireless-earbuds/assorted/...   

                                           image_url  \
0 

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import joblib # To save the best model

# Load the dataset
df = pd.read_csv('airbuds_data_expanded.csv')

# --- Data Preprocessing ---

# 1. Drop irrelevant columns
# 'old_price' and 'discount' are highly correlated with 'price' and might introduce data leakage
# 'product_url' and 'image_url' are not useful for price prediction
# 'reviews' and 'rating' might be useful but often have missing values for newer products.
# For simplicity, and given your request for price prediction based on features,
# we'll exclude them for now, but they could be explored as features later.
df_cleaned = df.drop(columns=['old_price', 'discount', 'product_url', 'image_url', 'reviews', 'rating'])

# Convert 'price' to a numerical format and handle missing/non-numeric values
df_cleaned['price'] = df_cleaned['price'].astype(str).str.replace('Rs', '').str.replace(',', '').str.strip()
df_cleaned['price'] = pd.to_numeric(df_cleaned['price'], errors='coerce')

# Drop rows where 'price' is NaN after conversion (these are products with no price listed)
df_cleaned.dropna(subset=['price'], inplace=True)

# 2. Feature Engineering & Cleaning for specific columns

# Function to extract numeric value from string (e.g., '5.1', 'v5.0', '10mm', '30mAh', '2-3 Hrs')
def extract_numeric(text):
    if pd.isna(text):
        return np.nan
    text = str(text).lower()
    if 'v' in text: # for Bluetooth version
        text = text.replace('v', '')
    if 'hrs' in text: # for playtime/charging time
        parts = text.split('-')
        if len(parts) > 1:
            try:
                return (float(parts[0].strip()) + float(parts[1].replace('hrs', '').strip())) / 2
            except ValueError:
                return np.nan
        else:
            try:
                return float(text.replace('hrs', '').strip())
            except ValueError:
                return np.nan
    if 'hours' in text: # for playtime/charging time
        parts = text.split('-')
        if len(parts) > 1:
            try:
                return (float(parts[0].strip()) + float(parts[1].replace('hours', '').strip())) / 2
            except ValueError:
                return np.nan
        else:
            try:
                return float(text.replace('hours', '').strip())
            except ValueError:
                return np.nan
    if 'm' in text and 'mm' not in text: # for Bluetooth Range (e.g., '10m')
        try:
            return float(text.replace('m', '').strip())
        except ValueError:
            return np.nan
    if 'ft' in text: # for Bluetooth Range (e.g., '30ft')
        try:
            # Convert feet to meters (1 ft = 0.3048 m)
            return float(text.replace('ft', '').strip()) * 0.3048
        except ValueError:
            return np.nan
    if 'mah' in text: # for Battery Capacity
        try:
            return float(text.replace('mah', '').strip())
        except ValueError:
            return np.nan
    if 'mm' in text: # for Driver Size
        try:
            return float(text.replace('mm', '').strip())
        except ValueError:
            return np.nan
    if 'no' == text or 'n/a' == text or 'null' == text or 'na' == text or '' == text:
        return np.nan
    try:
        return float(text)
    except ValueError:
        return np.nan

# Apply extraction to relevant columns
df_cleaned['General Features - Driver Size'] = df_cleaned['General Features - Driver Size'].apply(extract_numeric)
df_cleaned['Connectivity - Bluetooth Version'] = df_cleaned['Connectivity - Bluetooth Version'].apply(extract_numeric)
df_cleaned['Connectivity - Bluetooth Range'] = df_cleaned['Connectivity - Bluetooth Range'].apply(extract_numeric)
df_cleaned['Battery - Capacity for buds'] = df_cleaned['Battery - Capacity for buds'].apply(extract_numeric)
df_cleaned['Battery - Capacity for Case'] = df_cleaned['Battery - Capacity for Case'].apply(extract_numeric)
df_cleaned['Battery - Playtime'] = df_cleaned['Battery - Playtime'].apply(extract_numeric)
df_cleaned['Battery - Charging Time'] = df_cleaned['Battery - Charging Time'].apply(lambda x: extract_numeric(str(x).split(',')[0].strip() if pd.notna(x) else x))


# Clean up boolean-like columns to consistent 'Yes'/'No'
def clean_yes_no(value):
    if pd.isna(value):
        return 'Unknown'
    s_value = str(value).lower().strip()
    if s_value in ['yes', 'y', 'true', 'anc', 'ai call noise cancelation', 'enc', 'dual-mic noise reduction', 'dust, sweat, and water resistant5']:
        return 'Yes'
    elif s_value in ['no', 'n', 'false']:
        return 'No'
    else:
        return 'Unknown' # Use 'Unknown' for N/A, null, etc. or keep as is if it's a specific feature type

df_cleaned['General Features - Noise Cancellation'] = df_cleaned['General Features - Noise Cancellation'].apply(clean_yes_no)
df_cleaned['General Features - Water Resistant'] = df_cleaned['General Features - Water Resistant'].apply(clean_yes_no)
df_cleaned['General Features - Auto Pairing'] = df_cleaned['General Features - Auto Pairing'].apply(clean_yes_no)
df_cleaned['General Features - Mic'] = df_cleaned['General Features - Mic'].apply(clean_yes_no)
df_cleaned['Connectivity - Microphone'] = df_cleaned['Connectivity - Microphone'].apply(clean_yes_no)


# Further clean 'General Features - Charging Interface'
def clean_charging_interface(value):
    if pd.isna(value):
        return 'Unknown'
    s_value = str(value).lower().strip()
    if 'type c' in s_value or 'usb-c' in s_value or 'c-type' in s_value:
        return 'Type-C'
    elif 'micro usb' in s_value or 'micro' in s_value:
        return 'Micro USB'
    elif 'lightning' in s_value:
        return 'Lightning'
    else:
        return 'Unknown' # Group others or specific non-standard ones

df_cleaned['General Features - Charging Interface'] = df_cleaned['General Features - Charging Interface'].apply(clean_charging_interface)

# Simplify 'General Features - Compatibility' - keep only common platforms
def clean_compatibility(value):
    if pd.isna(value):
        return 'Unknown'
    s_value = str(value).lower()
    if 'android' in s_value and 'ios' in s_value:
        return 'Android & iOS'
    elif 'android' in s_value:
        return 'Android Only'
    elif 'ios' in s_value:
        return 'iOS Only'
    elif 'windows' in s_value: # If only windows is mentioned without android/ios
        return 'Windows Compatible'
    else:
        return 'Unknown'

df_cleaned['General Features - Compatibility'] = df_cleaned['General Features - Compatibility'].apply(clean_compatibility)


# Separate features (X) and target (y)
X = df_cleaned.drop(columns=['price', 'name']) # 'name' is also not a feature for price prediction
y = df_cleaned['price']

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=np.number).columns

# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # Impute missing numerical values with the mean
    ('scaler', StandardScaler())                  # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')), # Impute missing categorical with 'Unknown'
    ('onehot', OneHotEncoder(handle_unknown='ignore')) # One-hot encode categorical features
])

# Create a preprocessor using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# --- Model Training ---

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models to evaluate
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

best_model = None
best_r2_score = -np.inf
model_performance = {}

print("Training and evaluating models...")

for name, model in models.items():
    # Create a full pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', model)])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    model_performance[name] = {'MSE': mse, 'RMSE': rmse, 'R2 Score': r2}

    print(f"\n--- {name} ---")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"R-squared (R2) Score: {r2:.2f}")

    # Track the best model based on R2 score
    if r2 > best_r2_score:
        best_r2_score = r2
        best_model = pipeline # Store the entire pipeline
        best_model_name = name

print(f"\n--- Best Model: {best_model_name} with R2 Score: {best_r2_score:.2f} ---")

# Save the best model and preprocessor (as part of the pipeline)
model_filename = 'best_airbuds_price_predictor.joblib'
joblib.dump(best_model, model_filename)
print(f"Best model saved as {model_filename}")

print("\nHere's a summary of all model performances:")
for name, metrics in model_performance.items():
    print(f"- {name}: R2 Score = {metrics['R2 Score']:.2f}, RMSE = {metrics['RMSE']:.2f}")

Training and evaluating models...

--- Linear Regression ---
Mean Squared Error (MSE): 909114552.85
Root Mean Squared Error (RMSE): 30151.53
R-squared (R2) Score: -2.01

--- Decision Tree Regressor ---
Mean Squared Error (MSE): 629901075.00
Root Mean Squared Error (RMSE): 25097.83
R-squared (R2) Score: -1.08





--- Random Forest Regressor ---
Mean Squared Error (MSE): 428973148.07
Root Mean Squared Error (RMSE): 20711.67
R-squared (R2) Score: -0.42

--- Gradient Boosting Regressor ---
Mean Squared Error (MSE): 544069590.74
Root Mean Squared Error (RMSE): 23325.30
R-squared (R2) Score: -0.80

--- Support Vector Regressor ---
Mean Squared Error (MSE): 391655807.31
Root Mean Squared Error (RMSE): 19790.30
R-squared (R2) Score: -0.30

--- Best Model: Support Vector Regressor with R2 Score: -0.30 ---
Best model saved as best_airbuds_price_predictor.joblib

Here's a summary of all model performances:
- Linear Regression: R2 Score = -2.01, RMSE = 30151.53
- Decision Tree Regressor: R2 Score = -1.08, RMSE = 25097.83
- Random Forest Regressor: R2 Score = -0.42, RMSE = 20711.67
- Gradient Boosting Regressor: R2 Score = -0.80, RMSE = 23325.30
- Support Vector Regressor: R2 Score = -0.30, RMSE = 19790.30


