In [1]:
from dputils.scrape import extract_many, get_webpage_data

In [22]:
url = "https://www.flipkart.com/search?q=mobiles&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off"
soup = get_webpage_data(url)

In [47]:
target = {
    'tag': 'div',
    'attrs':{'class':'_1YokD2 _3Mn1Gg'}
}
items = {
    'tag': 'div',
    'attrs':{'class':'_1AtVbE col-12-12'}
}
title = {
    'tag': 'div',
    'attrs':{'class':'_4rR01T'}
}
price = {
    'tag': 'div',
    'attrs':{'class':'_30jeq3 _1_WHN1'}
}
rating = {
    'tag': 'div',
    'attrs':{'class':'_3LWZlK'}
}
link = {
    'tag': 'a',
    'attrs':{'class':'_1fQZEK'},
    'output':'href'
}


In [48]:
from bs4 import BeautifulSoup

In [50]:
def extract_many_1(soup : BeautifulSoup, **selectors) -> list:
    """
    Extracts several data items from given BeautifulSoup object
    Output will be a list containing dicts of {title : data_stored_in_selectors}
    
    Args:
    soup (BeautifulSoup): Contains entire page data as BeautifulSoup object. Data items will be extracted from this object
    **selectors (dict): dict of {key : info}
        info must be written in following manner:
            first key is 'tag' with value as tag from where data is obtained
            second key is 'attrs' with value as dict containing id or class information
        Valid Examples:
            title =  {'tag' : 'h2', 'attrs' : {'class':'a-size-mini a-spacing-none a-color-base s-line-clamp-2'}}
            priceDict = {'tag' : 'span', 'attrs' : {'class' : 'a-price-whole'}}
        
        target is a special selector and can be added to specify which section of html code should data be extracted from
        Example of target use:
            target = {'tag' : 'div', 'attrs' : {'class':'s-main-slot s-result-list s-search-results sg-row'}}
        
        items is a mandatory selector which refers to the repeating blocks of html code from soup object
        Example of item use:
            items =  {'tag' : 'div', 'attrs' : {'class':'s-result-item'}}

        Valid call: 
            soup = get_webpage_data("https://www.amazon.com/s?k=headphones&crid=1DUUWW6PEVAJ1&sprefix=headphones%2Caps%2C161&ref=nb_sb_noss_1")
            extract_many(soup, 
                target = {'tag' : 'div', 'attrs' : {'class':'s-main-slot s-result-list s-search-results sg-row'}},
                items =  {'tag' : 'div', 'attrs' : {'class':'s-result-item'}},
                title =  {'tag' : 'h2', 'attrs' : {'class':'a-size-mini a-spacing-none a-color-base s-line-clamp-2'}})
    """
    if 'target' in selectors:
        tag = selectors['target'].get('tag')
        attrs = selectors['target'].get('attrs')
        if tag is None:
            print("Please give valid selectors")
            print("Example: target = {'tag' : 'div', 'attrs' : {...}")
            return None
        else:
            target = soup.find(tag, attrs)
            if target is None:
                print(f"Could not find target section with this {tag} and {attrs}")
                return None
    else:
        target = soup
    print(len(target.text))
    data_list = []
    if 'items' in selectors:
        print('tag',selectors['items'].get('tag'))
        print('attrs',selectors['items'].get('attrs'))
        items = target.find_all(selectors['items'].get('tag'), attrs = selectors['items'].get('attrs'))
        items_count = len(items)
        if items_count == 0:
            print("No data found")
            return data_list
        else:
            print(f"{items_count} items found")
            selectors.pop('target')
            selectors.pop('items')
            for idx, item in enumerate(items):
                data = {}
                try:
                    for key,info in selectors.items():
                        tag = info.get('tag', 'div')
                        attrs = info.get('attrs', None) 
                        output = info.get('output', 'text')
                        if output == 'text':
                            data[key] = item.find(tag, attrs = attrs).text.strip()
                        elif output == 'href':
                            data[key] = item.find(tag, attrs = attrs).attrs.get('href') 
                        elif output == 'src': 
                            data[key] = item.find(tag, attrs = attrs).attrs.get('src')     
                        else:
                            print('Not suitable output')
                    print(data)
                    data_list.append(data)
                except:
                    print("Item skipped at index:", idx)
            else:
                print("All items extracted")
            return data_list
    else:
        print("items is required as a parameter containing dict containing tag, attrs as keys")
        print("Example: items = {'tag' : 'div', 'attrs' : {...}")

In [51]:
out= extract_many_1(soup, target=target, items=items, title=title, price=price, rating=rating, link=link)

13757
9281
tag div
attrs {'class': '_1AtVbE col-12-12'}
26 items found
{'title': 'POCO C31 (Royal Blue, 64 GB)', 'price': '₹9,999', 'rating': '4.4', 'link': '/poco-c31-royal-blue-64-gb/p/itm19effae969b86?pid=MOBG73E7GKQK4KZP&lid=LSTMOBG73E7GKQK4KZPR5ICMK&marketplace=FLIPKART&q=mobiles&store=tyy%2F4io&srno=s_1_1&otracker=search&otracker1=search&fm=organic&iid=dc1256a9-9052-42f4-a50a-954ce272eaf4.MOBG73E7GKQK4KZP.SEARCH&ppt=None&ppn=None&ssid=bkcsh4mqdc0000001655357261831&qH=eb4af0bf07c16429'}
{'title': 'realme C20 (Cool Blue, 32 GB)', 'price': '₹7,499', 'rating': '4.4', 'link': '/realme-c20-cool-blue-32-gb/p/itmea1903897436b?pid=MOBGF4894MEWZJGV&lid=LSTMOBGF4894MEWZJGVW425N5&marketplace=FLIPKART&q=mobiles&store=tyy%2F4io&srno=s_1_2&otracker=search&otracker1=search&fm=organic&iid=dc1256a9-9052-42f4-a50a-954ce272eaf4.MOBGF4894MEWZJGV.SEARCH&ppt=None&ppn=None&ssid=bkcsh4mqdc0000001655357261831&qH=eb4af0bf07c16429'}
{'title': 'realme C20 (Cool Grey, 32 GB)', 'price': '₹7,499', 'rating': '4.

In [52]:
import pandas as pd

In [53]:
df  = pd.DataFrame(out)

In [54]:
df

Unnamed: 0,title,price,rating,link
0,"POCO C31 (Royal Blue, 64 GB)","₹9,999",4.4,/poco-c31-royal-blue-64-gb/p/itm19effae969b86?...
1,"realme C20 (Cool Blue, 32 GB)","₹7,499",4.4,/realme-c20-cool-blue-32-gb/p/itmea1903897436b...
2,"realme C20 (Cool Grey, 32 GB)","₹7,499",4.4,/realme-c20-cool-grey-32-gb/p/itmea1903897436b...
3,"SAMSUNG Galaxy F22 (Denim Blue, 64 GB)","₹10,499",4.3,/samsung-galaxy-f22-denim-blue-64-gb/p/itmce0a...
4,"SAMSUNG Galaxy F22 (Denim Black, 64 GB)","₹10,499",4.3,/samsung-galaxy-f22-denim-black-64-gb/p/itm6f4...
5,"SAMSUNG Galaxy F12 (Sea Green, 64 GB)","₹9,499",4.3,/samsung-galaxy-f12-sea-green-64-gb/p/itm4d358...
6,"REDMI Note 10S (Frost White, 64 GB)","₹12,999",4.4,/redmi-note-10s-frost-white-64-gb/p/itm2059d1d...
7,"REDMI Note 10S (Shadow Black, 64 GB)","₹12,999",4.4,/redmi-note-10s-shadow-black-64-gb/p/itmf77ca8...
8,"REDMI Note 10S (Cosmic Purple, 64 GB)","₹12,999",4.4,/redmi-note-10s-cosmic-purple-64-gb/p/itm2a1b4...
9,"REDMI Note 10S (Deep Sea Blue, 64 GB)","₹12,999",4.4,/redmi-note-10s-deep-sea-blue-64-gb/p/itmdd3b9...
