In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [2]:
# Functions to extract details
# -----------------------------

def get_title(soup):
    """Return the product title text from a parsed product page.

    Looks up the first ``<span class="VU-ZEz">`` element and returns its
    whitespace-stripped text. If any step of the lookup raises
    ``AttributeError`` (for example because ``find`` returned ``None``),
    an empty string is returned instead.
    """
    try:
        title_tag = soup.find("span", attrs={"class": 'VU-ZEz'})
        return title_tag.text.strip()
    except AttributeError:
        # No matching element (or no usable text): report "no title".
        return ""

def get_price(soup):
    """Return the product price text from a parsed product page.

    Selects the first ``<div>`` that carries both the ``Nx9bqj`` and
    ``CxhGGd`` classes and returns its whitespace-stripped text, or an
    empty string when no such element (or no usable text) is found.

    A CSS selector is used instead of ``find(..., {"class": "Nx9bqj CxhGGd"})``
    because the latter only matches when the element's class attribute is
    exactly that string; a price element with an additional class, or with
    the classes in another order, would be missed and the price silently
    recorded as "".
    """
    try:
        price_tag = soup.select_one("div.Nx9bqj.CxhGGd")
        return price_tag.text.strip()
    except AttributeError:
        # select_one returned None (or the node has no text): no price found.
        return ""

def get_rating(soup):
    """Return the product rating text from a parsed product page.

    Looks up the first ``<div class="XQDdHH">`` element and returns its
    whitespace-stripped text. If any step of the lookup raises
    ``AttributeError`` (for example because ``find`` returned ``None``),
    an empty string is returned instead.
    """
    try:
        rating_tag = soup.find("div", attrs={"class": "XQDdHH"})
        return rating_tag.text.strip()
    except AttributeError:
        # No matching element (or no usable text): report "no rating".
        return ""


In [6]:
# ---------------------------------------
# Main script to scrape Flipkart data
# ---------------------------------------

if __name__ == '__main__':

    # Browser-like headers sent with every request.
    HEADERS = ({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    })

    # Search-results URL; the page number is appended after the trailing "page=".
    BASE_URL = "https://www.flipkart.com/search?q=mobiles&as=on&as-show=on&otracker=AS_Query_OrganicAutoSuggest_3_7_na_na_na&otracker1=AS_Query_OrganicAutoSuggest_3_7_na_na_na&as-pos=3&as-type=RECENT&suggestionId=mobiles&requestId=7d1de872-4bfb-4cb1-bde2-ad006c4cf1ea&as-backfill=on&page="

    NUM_PAGES = 200
    BACKUP_EVERY = 50  # write a checkpoint CSV every this many pages

    # Column-oriented accumulator; the three lists must always have equal length,
    # otherwise building a DataFrame from it fails.
    data = {"title": [], "price": [], "rating": []}

    for page in range(1, NUM_PAGES + 1):
        print(f"\nScraping Page {page}...")
        url = BASE_URL + str(page)

        try:
            page_response = requests.get(url, headers=HEADERS, timeout=10)
            # Treat HTTP error statuses as failures instead of silently parsing
            # an error page that contains no product links.
            page_response.raise_for_status()
            soup = BeautifulSoup(page_response.content, "html.parser")

            # Extract product links
            links = soup.find_all("a", attrs={'class': 'CGtC98'})
            links_list = ["https://www.flipkart.com" + link.get('href') for link in links if link.get('href')]

            for link in links_list:
                try:
                    product_response = requests.get(link, headers=HEADERS, timeout=10)
                    product_response.raise_for_status()
                    product_soup = BeautifulSoup(product_response.content, "html.parser")

                    # Extract all three fields first and append afterwards, so a
                    # failure part-way through cannot leave the columns misaligned.
                    title = get_title(product_soup)
                    price = get_price(product_soup)
                    rating = get_rating(product_soup)

                    data['title'].append(title)
                    data['price'].append(price)
                    data['rating'].append(rating)

                    time.sleep(1)  # Delay between each product request

                except Exception as e:
                    # Best effort: log the failing product and move on to the next one.
                    print(f"Product error at {link}: {e}")
                    continue

            time.sleep(2)  # Delay between each page

        except Exception as e:
            # Best effort: log the failing page. No `continue` here, so that the
            # periodic backup below still runs when a checkpoint page fails.
            print(f"Page {page} failed: {e}")

        # Save progress periodically so a long run can be recovered if interrupted
        if page % BACKUP_EVERY == 0:
            temp_df = pd.DataFrame.from_dict(data)
            temp_df.to_csv(f"flipkart_mobile_backup_page_{page}.csv", index=False)
            print(f"Saved backup at page {page}")

    # Final DataFrame and saving
    flipkart_df = pd.DataFrame.from_dict(data)
    # Assign the result back rather than calling replace(..., inplace=True) on the
    # selected column: chained in-place mutation is deprecated and does not update
    # the parent frame under pandas copy-on-write.
    flipkart_df['title'] = flipkart_df['title'].replace('', np.nan)
    # Rows without a title are treated as failed extractions and dropped.
    flipkart_df = flipkart_df.dropna(subset=['title'])

    # Derive the file name from NUM_PAGES so it cannot drift from the actual run size.
    output_path = f"flipkart_mobile_data_all_{NUM_PAGES}_pages.csv"
    flipkart_df.to_csv(output_path, index=False)
    print(f"\nScraping complete. Data saved to {output_path}")


Scraping Page 1...

Scraping Page 2...

Scraping Page 3...

Scraping Page 4...

Scraping Page 5...

Scraping Page 6...

Scraping Page 7...

Scraping Page 8...

Scraping Page 9...

Scraping Page 10...

Scraping Page 11...

Scraping Page 12...

Scraping Page 13...

Scraping Page 14...

Scraping Page 15...

Scraping Page 16...

Scraping Page 17...

Scraping Page 18...

Scraping Page 19...

Scraping Page 20...

Scraping Page 21...

Scraping Page 22...

Scraping Page 23...

Scraping Page 24...

Scraping Page 25...

Scraping Page 26...

Scraping Page 27...

Scraping Page 28...

Scraping Page 29...

Scraping Page 30...

Scraping Page 31...

Scraping Page 32...

Scraping Page 33...

Scraping Page 34...

Scraping Page 35...

Scraping Page 36...

Scraping Page 37...

Scraping Page 38...

Scraping Page 39...

Scraping Page 40...

Scraping Page 41...

Scraping Page 42...

Scraping Page 43...

Scraping Page 44...

Scraping Page 45...

Scraping Page 46...

Scraping Page 47...

Scraping Page 48...



In [7]:
# Preview the scraped products that remain after rows with an empty title were dropped.
flipkart_df

Unnamed: 0,title,price,rating
0,"MOTOROLA g05 (Plum Red, 64 GB) (4 GB RAM)",,4.2
1,"POCO C71 (Desert Gold, 128 GB) (6 GB RAM)","₹6,999",4.1
2,"REDMI A3X (Olive Green, 128 GB) (4 GB RAM)",,4.1
3,"Samsung Galaxy F05 (Twilight Blue, 64 GB) (4 ...",,4.2
4,"POCO C71 (Cool Blue, 128 GB) (6 GB RAM)",,4.1
...,...,...,...
859,"vivo Y29 5G (Glacier Blue, 128 GB) (6 GB RAM)","₹15,499",4.4
860,Nokia 105 Classic without Charger (Charcoal),₹947,3.8
861,"REDMI Note-14 Pro+ 5G (Titan Black, 512 GB) (...",,4.6
862,"hmd 110 4G DS (Purple, Blue)","₹2,436",4.1


In [8]:
import os
# Show the kernel's working directory; the relative CSV/Excel paths used in
# this notebook are resolved against it.
print(os.getcwd())

C:\Users\vpriy


In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Load one of the periodic backup CSVs written by the scraping cell.
# NOTE(review): this reads the page-100 checkpoint, not the final
# flipkart_mobile_data_all_200_pages.csv, and the scraping log shown above
# stops at page 48 - presumably this file comes from an earlier run; confirm
# it is the intended input.
df = pd.read_csv("flipkart_mobile_backup_page_100.csv")

In [11]:
print(df.columns)

Index(['title', 'price', 'rating'], dtype='object')


In [12]:
# Strip the rupee sign and thousands separators from the price text, then
# parse the result as a number; anything that still cannot be parsed
# becomes NaN.
df['price'] = pd.to_numeric(
    df['price'].replace(to_replace='[₹,]', value='', regex=True),
    errors='coerce',
)


In [13]:
# Keep only rows with a numeric price; rows whose price is NaN after the
# numeric conversion are discarded. The original row index is preserved.
df = df.dropna(subset=['price'])

In [14]:
# Create price categories after ensuring numeric prices:
#   Low    -> (0, 10000]
#   Medium -> (10000, 25000]
#   High   -> above 25000
# The top edge is open-ended (inf) rather than df['price'].max(): pd.cut needs
# strictly increasing bin edges, so using the maximum raised a ValueError
# whenever the most expensive product cost 25000 or less.
df['price_category'] = pd.cut(
    df['price'],
    bins=[0, 10000, 25000, float('inf')],
    labels=['Low', 'Medium', 'High'],
)


In [15]:
# Bucket ratings into right-inclusive bands:
#   Low -> (0, 3], Medium -> (3, 4], High -> (4, 5]
df['rating_category'] = pd.cut(
    x=df['rating'],
    bins=[0, 3, 4, 5],
    labels=['Low', 'Medium', 'High'],
)


In [16]:
# Inspect the cleaned frame together with the derived price_category and
# rating_category columns.
df

Unnamed: 0,title,price,rating,price_category,rating_category
1,"POCO C71 (Desert Gold, 128 GB) (6 GB RAM)",6999.0,4.1,Low,High
5,"MOTOROLA g35 5G (Guava Red, 128 GB) (4 GB RAM)",9999.0,4.2,Low,High
6,"POCO C71 (Cool Blue, 64 GB) (4 GB RAM)",6399.0,3.9,Low,Medium
7,"MOTOROLA g05 (Forest Green, 64 GB) (4 GB RAM)",6999.0,4.2,Low,High
8,"REDMI A5 (Pondicherry Blue, 64 GB) (3 GB RAM)",6499.0,4.3,Low,High
...,...,...,...,...,...
857,"Tecno Spark 30C 5G (Azure Sky, 128 GB) (4 GB ...",10499.0,4.1,Medium,High
859,"vivo Y29 5G (Glacier Blue, 128 GB) (6 GB RAM)",15499.0,4.4,Medium,High
860,Nokia 105 Classic without Charger (Charcoal),947.0,3.8,Low,Medium
862,"hmd 110 4G DS (Purple, Blue)",2436.0,4.1,Low,High


In [17]:
import os
# Show the kernel's working directory, i.e. where the relative output path
# used for the Excel export is resolved.
print(os.getcwd())

C:\Users\vpriy


In [18]:

# Export the cleaned, categorised frame to an Excel workbook in the working
# directory; index=False leaves the row index out of the sheet.
df.to_excel("cleaned_flipkart_data.xlsx", index=False)

In [19]:
# Show the frame that was just written to cleaned_flipkart_data.xlsx.
df

Unnamed: 0,title,price,rating,price_category,rating_category
1,"POCO C71 (Desert Gold, 128 GB) (6 GB RAM)",6999.0,4.1,Low,High
5,"MOTOROLA g35 5G (Guava Red, 128 GB) (4 GB RAM)",9999.0,4.2,Low,High
6,"POCO C71 (Cool Blue, 64 GB) (4 GB RAM)",6399.0,3.9,Low,Medium
7,"MOTOROLA g05 (Forest Green, 64 GB) (4 GB RAM)",6999.0,4.2,Low,High
8,"REDMI A5 (Pondicherry Blue, 64 GB) (3 GB RAM)",6499.0,4.3,Low,High
...,...,...,...,...,...
857,"Tecno Spark 30C 5G (Azure Sky, 128 GB) (4 GB ...",10499.0,4.1,Medium,High
859,"vivo Y29 5G (Glacier Blue, 128 GB) (6 GB RAM)",15499.0,4.4,Medium,High
860,Nokia 105 Classic without Charger (Charcoal),947.0,3.8,Low,Medium
862,"hmd 110 4G DS (Purple, Blue)",2436.0,4.1,Low,High
