In [None]:
import os
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def fetch_with_splash(url, wait=2, timeout=60):
    """Render a page through the local Splash service and return its HTML.

    Args:
        url: Address of the page to render.
        wait: Value forwarded to Splash as the ``wait`` query parameter.
        timeout: Seconds ``requests`` waits for Splash to answer before
            giving up. Should stay comfortably above ``wait`` so a normal
            render is not cut short.

    Returns:
        The response body as text when Splash answers with status 200.
        ``None`` when the status is anything else, or when the request
        itself fails (connection refused, timeout, ...). A one-line error
        is printed in every failure case.
    """
    params = {
        'url': url,
        'wait': wait
    }
    try:
        r = requests.get(
            'http://localhost:8050/render.html',
            params=params,
            timeout=timeout,
        )
    except requests.exceptions.RequestException as exc:
        # Callers already treat None as "skip this page", so a transport
        # failure is reported the same way instead of aborting the scrape.
        print(f"Error fetching {url} | {exc}")
        return None

    if r.status_code == 200:
        return r.text

    print(f"Error fetching {url} | Status: {r.status_code}")
    return None

In [None]:
# Load the scrape list. The scraping loop consumes `lines` in pairs:
# a product URL followed by that product's display name.
with open('flipkart_urls.txt', 'r') as f:
    # Blank lines are dropped: a stray empty line would otherwise shift
    # every following URL/name pair by one position.
    lines = [stripped for stripped in (line.strip() for line in f) if stripped]

In [None]:
BASE_URL = 'https://www.flipkart.com'
MAX_REVIEW_PAGES = 10  # review pages requested per product

# A review date is accepted in one of two shapes: "Jul, 2024" or "3 days ago".
DATE_PATTERNS = (
    re.compile(r'^[a-zA-Z]{3}, \d{4}$'),
    re.compile(r'^\d+\s+\w+\s+ago$'),
)


def _extract_review(block):
    """Pull the text, date and rating out of one review block.

    Args:
        block: A BeautifulSoup tag for a single ``div.ZmyHeo`` element.

    Returns:
        A ``(review_text, date_text, rating_text)`` tuple. Each item is a
        stripped string, or ``None`` when the expected markup is missing.
    """
    # Review text lives in the first <div> nested inside the block's first <div>.
    review_text = None
    outer_div = block.find("div")
    if outer_div:
        inner_div = outer_div.find("div")
        if inner_div:
            review_text = inner_div.text.strip()

    # Several <p class="_2NsDsF"> tags may exist; keep the first one that
    # actually looks like a date.
    date_text = None
    for p_tag in block.find_all("p", {'class': "_2NsDsF"}):
        candidate = p_tag.text.strip()
        if any(pattern.match(candidate) for pattern in DATE_PATTERNS):
            date_text = candidate
            break

    # NOTE(review): the rating and date lookups only search descendants of
    # `block`; confirm those elements really are nested inside div.ZmyHeo.
    rating_text = None
    col_div = block.find("div", {'class': "col EPCmJX Ma1fCG"})
    if col_div:
        row_div = col_div.find("div", {'class': "row"})
        if row_div:
            rating_div = row_div.find("div", {'class': "XQDdHH"})
            if rating_div:
                rating_text = rating_div.text.strip()

    return review_text, date_text, rating_text


all_reviews_data = []

if len(lines) % 2:
    # The list is consumed as (URL, name) pairs; an unpaired last line
    # cannot be scraped, so report it rather than crash on a missing name.
    print(f"Warning: odd number of lines in URL list; ignoring unpaired line: {lines[-1]}")

# zip() over the even/odd slices yields (URL, name) pairs and naturally
# stops before an unpaired trailing line.
for product_url, product_name in zip(lines[0::2], lines[1::2]):
    print(f"\nStarting scraping reviews for: {product_name}")

    # Step 1: open the product page and locate the link to its reviews.
    html = fetch_with_splash(product_url)
    if not html:
        print(f"Failed to fetch product page for {product_name}. Skipping.")
        continue
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.find_all('div', class_="col pPAw9M")
    if not divs:
        print(f"No review link container found for {product_name}. Skipping.")
        continue
    a_tags = divs[-1].find_all('a')
    if not a_tags:
        print(f"No review links found for {product_name}. Skipping.")
        continue
    href = a_tags[-1].get('href')
    if not href:
        # An <a> without href would make the URL construction below fail.
        print(f"Review link has no href for {product_name}. Skipping.")
        continue
    # urljoin handles both site-relative and already-absolute hrefs.
    review_base_url = urljoin(BASE_URL, href)
    # Append the page number with the right separator whether or not the
    # review link already carries a query string.
    page_sep = '&' if '?' in review_base_url else '?'

    reviews_data = []

    # Step 2: walk the paginated review listing.
    for page_num in range(1, MAX_REVIEW_PAGES + 1):
        page_url = f"{review_base_url}{page_sep}page={page_num}"
        html = fetch_with_splash(page_url)
        if not html:
            print(f"Skipping page {page_num} due to fetch error.")
            continue
        soup = BeautifulSoup(html, 'html.parser')

        review_blocks = soup.find_all("div", {'class': "ZmyHeo"})
        if not review_blocks:
            # An empty page is taken to mean the listing has ended.
            print(f"No reviews found on page {page_num}. Stopping pagination.")
            break

        page_reviews = []
        for block in review_blocks:
            review_text, date_text, rating_text = _extract_review(block)
            # Blocks without any review text are not recorded.
            if review_text:
                page_reviews.append({
                    'Product': product_name,
                    'Review': review_text,
                    'Date': date_text,
                    'Rating': rating_text
                })
        reviews_data.extend(page_reviews)

        # Report what was actually stored, not the raw block count.
        print(f"Scraped {len(page_reviews)} reviews from page {page_num}")
        time.sleep(1)  # brief pause between page requests

    print(f"Finished scraping {len(reviews_data)} reviews for {product_name}")
    all_reviews_data.extend(reviews_data)

In [None]:
output_file = '../data/flipkart.csv'

# Build one frame from the accumulated review records and persist it as CSV.
df = pd.DataFrame(all_reviews_data)
if not df.empty:
    # to_csv does not create missing folders, so make sure the target
    # directory exists before writing.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} reviews to {output_file}")
else:
    # Nothing is written for an empty result; say so instead of ending silently.
    print("No reviews were scraped; nothing written.")