# Web Scraper using Selenium for intakes.in

### Importing Required Packages

In [6]:
import os
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

### Browser

In [10]:
# Entry page for the scrape: the Baby Products category listing.
intake_url = "https://intakes.in/product-category/baby-products/"

# Start a Selenium-driven Firefox session and load the entry page in it.
browser = webdriver.Firefox()
browser.get(intake_url)

### Extracting Product categories

In [11]:
# Click the toggle button of the first sidebar entry before reading the list.
# NOTE(review): presumably this collapses the sub-category list of the first
# category so that only top-level names remain in the text - confirm on the live page.
# `find_element(By.XPATH, ...)` is used because the `find_element_by_xpath`
# shortcut was removed in Selenium 4.3; this form also works on older releases.
dropup = browser.find_element(By.XPATH, '/html/body/div[1]/main/div/div[1]/div/aside[2]/ul/li[1]/button/i')
dropup.click()

# The rendered text of the sidebar list holds one category name per line.
prodcat = browser.find_element(By.XPATH, '/html/body/div[1]/main/div/div[1]/div/aside[2]/ul')
prod_cats = prodcat.text.split('\n')
print(len(prod_cats))

11


### Product Category Dictionary

In [12]:
# Number the scraped category names from 1, keeping their scraped order:
# {1: first name, 2: second name, ...}.
prod_cats_dict = dict(enumerate(prod_cats, start=1))

prod_cats_dict

{1: 'BABY PRODUCTS',
 2: 'BEVERAGES',
 3: 'DAIRY & FROZEN',
 4: 'HEALTH CARE & NUTRITION',
 5: 'HOME CARE',
 6: 'ORAL CARE',
 7: 'PACKAGED FOODS',
 8: 'PERSONAL CARE',
 9: 'STAPLES',
 10: 'Sugar',
 11: 'Uncategorized'}

### url_cat_part

In [13]:
def _category_slug(name):
    """Return the URL path segment for a category display name.

    The name is lower-cased and every run of characters other than a-z / 0-9
    is collapsed into a single hyphen, e.g. 'DAIRY & FROZEN' -> 'dairy-frozen'
    and 'HEALTH CARE & NUTRITION' -> 'health-care-nutrition'.
    """
    return re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')


# Same numbering as prod_cats_dict. Slugs are derived from the names themselves,
# so no per-index fix-ups are needed if the category order ever changes.
url_cat_part = {n: _category_slug(name) for n, name in prod_cats_dict.items()}

url_cat_part

{1: 'baby-products',
 2: 'beverages',
 3: 'dairy-frozen',
 4: 'health-care-nutrition',
 5: 'home-care',
 6: 'oral-care',
 7: 'packaged-foods',
 8: 'personal-care',
 9: 'staples',
 10: 'sugar',
 11: 'uncategorized'}

# Bot

(for all categories except Staples & Sugar)

In [277]:
%%time
### Data Dictionary

# One list per output column. A product is appended to all five lists in one
# step (see the `else` branch of the item loop), so the lists stay equal in length.
data_dict = {
    'Product Name': [],
    'MRP': [],
    'Price': [],
    'SKU': [],
    'Sub-Category': [],
}

std_url = "https://intakes.in/product-category/"

# n - represents the category number, a key of url_cat_part / prod_cats_dict (1 - 11):
#   range(1, 2)   --> Baby Products
#   range(2, 3)   --> Beverages
#   ...
#   range(11, 12) --> Uncategorized
# Always better to run 1 category at a time.
CATEGORY_NUMBERS = range(1, 12)

# i - represents the page number in the url and runs from 1 to N_PAGES,
# where N_PAGES is the total number of pages under the category being scraped.
# (The earlier `range(1, )` evaluated to range(1) and therefore requested page 0.)
N_PAGES = 1

# Number of listing slots tried on each page.
ITEMS_PER_PAGE = 12

# Link of the q-th product on a listing page; it is clicked through JavaScript
# to open the product's detail pop-up.
QUICK_VIEW_LINK = '/html/body/div[1]/main/div/div[2]/div/div[2]/div[{q}]/div/div[2]/div[1]/div[4]/a'

# Common XPath prefix of every field read from the detail pop-up.
MODAL = '/html/body/div[2]/div/div[1]/div/div/div/div[2]/div'


def _modal_text(suffix):
    """Return the text of the pop-up element at MODAL + suffix (raises if it is absent)."""
    return browser.find_element(By.XPATH, MODAL + suffix).text


def _to_amount(text):
    """Convert a displayed price to float.

    Drops the first character (the code assumes it is the currency symbol)
    and any thousands separators.
    """
    return float(text[1:].replace(',', ''))


def _read_prices():
    """Return (mrp, price); when there is no struck-through MRP, both are the single shown price."""
    try:
        mrp = _to_amount(_modal_text('/div[2]/p/del/span'))
        price = _to_amount(_modal_text('/div[2]/p/ins/span'))
    except Exception:
        mrp = price = _to_amount(_modal_text('/div[2]/p/span'))
    return mrp, price


def _read_sku():
    """Return the SKU text, or '-' when the pop-up shows none."""
    try:
        return _modal_text('/div[3]/span[1]/span')
    except Exception:
        return '-'


def _read_sub_category(category):
    """Return the category line with the label and the parent category removed, or '-'."""
    for suffix in ('/div[3]/span[2]', '/div[3]/span'):
        try:
            categories = _modal_text(suffix)
        except Exception:
            continue
        return (categories.replace(', ' + category, '')
                          .replace(category + ',', '')
                          .replace(category, '')
                          .replace('Categories:', '')
                          .strip())
    return '-'


#Product
for n in CATEGORY_NUMBERS:
    #Page Number
    for i in range(1, N_PAGES + 1):
        # rstrip avoids the doubled slash that plain concatenation produced.
        url = f"{std_url.rstrip('/')}/{url_cat_part[n]}/page/{i}/"
        browser.get(url)
        time.sleep(2)
        #Item Number
        for q in range(1, ITEMS_PER_PAGE + 1):
            try:
                qv = browser.find_element(By.XPATH, QUICK_VIEW_LINK.format(q=q))
                browser.execute_script("arguments[0].click();", qv)
                time.sleep(2)

                # Read every field first; nothing is stored until all of them succeed.
                prod_name = _modal_text('/a/h1')
                mrp, price = _read_prices()
                sku = _read_sku()
                sub_category = _read_sub_category(prod_cats_dict[n])
            except Exception:
                # Missing slot or unreadable product: report it and store nothing for it.
                print(url, '>>>', q)
            else:
                data_dict['Product Name'].append(prod_name)
                data_dict['MRP'].append(mrp)
                data_dict['Price'].append(price)
                data_dict['SKU'].append(sku)
                data_dict['Sub-Category'].append(sub_category)

            #Back to main
            webdriver.ActionChains(browser).send_keys(Keys.ESCAPE).perform()
            time.sleep(3)

https://intakes.in/product-category//uncategorized/page/2/ >>> 6
https://intakes.in/product-category//uncategorized/page/2/ >>> 7
https://intakes.in/product-category//uncategorized/page/2/ >>> 8
https://intakes.in/product-category//uncategorized/page/2/ >>> 9
https://intakes.in/product-category//uncategorized/page/2/ >>> 10
https://intakes.in/product-category//uncategorized/page/2/ >>> 11
https://intakes.in/product-category//uncategorized/page/2/ >>> 12
Wall time: 1min 42s


(for Staples & Sugar)

In [None]:
### Data Dictionary

# One list per output column. All rows of a product are appended together
# after the product has been read completely, so the lists stay equal in length.
data_dict = {
    'Product Name': [],
    'MRP': [],
    'Price': [],
    'SKU': [],
    'Sub-Category': [],
}

std_url = "https://intakes.in/product-category/"

# Link of the q-th product on a listing page (clicked to open the detail pop-up).
LISTING_LINK = '/html/body/div[1]/main/div/div[2]/div/div[2]/div[{q}]/div/div[2]/div[1]/div[4]/a'

# Common XPath prefix of every field read from the detail pop-up.
POPUP = '/html/body/div[2]/div/div[1]/div/div/div/div[2]/div'


def _popup_element(suffix):
    """Return the pop-up element at POPUP + suffix (raises if it is absent)."""
    return browser.find_element(By.XPATH, POPUP + suffix)


def _as_float(text):
    """Convert a displayed price to float: drop the first character and thousands separators."""
    return float(text[1:].replace(',', ''))


def _popup_prices(plain_price_suffix):
    """Return (mrp, price); fall back to the element at plain_price_suffix for both values."""
    try:
        mrp = _as_float(_popup_element('/div[2]/p/del/span').text)
        price = _as_float(_popup_element('/div[2]/p/ins/span').text)
    except Exception:
        mrp = price = _as_float(_popup_element(plain_price_suffix).text)
    return mrp, price


def _popup_sku():
    """Return the SKU text, or '-' when the pop-up shows none."""
    try:
        return _popup_element('/div[3]/span[1]/span').text
    except Exception:
        return '-'


def _variant_rows():
    """Return one row per quantity option of the open product.

    Raises when the pop-up has no quantity selector or any option cannot be
    read, so the caller can fall back to the single-row reader without
    partial data having been stored.
    """
    # Existence probe: raises for products without a quantity selector.
    _popup_element('/form/table/tbody/tr/td[1]/label')
    # The first text line of the selector row is skipped; each remaining line is one option.
    n_grams = len(_popup_element('/form/table/tbody/tr').text.split('\n')[1:])

    rows = []
    for option in range(1, n_grams + 1):
        prod_name = _popup_element('/a/h1').text
        gms = _popup_element(f'/form/table/tbody/tr/td[2]/div/div[2]/div[{option}]/span').text
        # NOTE(review): when a struck-through MRP is present, every option gets the
        # same MRP/price pair; check these rows against the site and correct by hand.
        mrp, price = _popup_prices(f'/div[2]/p/span[{option}]')
        try:
            categories = _popup_element('/div[3]/span[2]').text
        except Exception:
            categories = _popup_element('/div[3]/span').text
        rows.append({
            'Product Name': prod_name + '-' + gms,
            'MRP': mrp,
            'Price': price,
            'SKU': _popup_sku(),
            'Sub-Category': categories.replace('Categories: ', '').replace('Category: ', '').replace('STAPLES', '').replace(',', '').strip(),
        })
    return rows


def _simple_row(category):
    """Return the single row of a product that has no quantity options."""
    prod_name = _popup_element('/a/h1').text
    mrp, price = _popup_prices('/div[2]/p/span')
    sku = _popup_sku()

    sub_category = '-'
    for suffix in ('/div[3]/span[2]', '/div[3]/span'):
        try:
            categories = _popup_element(suffix).text
        except Exception:
            continue
        sub_category = (categories.replace(', ' + category, '')
                                  .replace(category + ',', '')
                                  .replace(category, '')
                                  .replace('Categories:', '')
                                  .strip())
        break

    return {
        'Product Name': prod_name,
        'MRP': mrp,
        'Price': price,
        'SKU': sku,
        'Sub-Category': sub_category,
    }


for n in range(10, 11):
    for i in range(1, 2):
        # rstrip avoids the doubled slash that plain concatenation produced.
        url = f"{std_url.rstrip('/')}/{url_cat_part[n]}/page/{i}/"
        browser.get(url)
        time.sleep(3)
        for q in range(1, 13):
            try:
                qv = browser.find_element(By.XPATH, LISTING_LINK.format(q=q))
                browser.execute_script("arguments[0].click();", qv)
                time.sleep(3)
                try:
                    rows = _variant_rows()
                except Exception:
                    # No quantity selector (or it could not be read): treat as a plain product.
                    rows = [_simple_row(prod_cats_dict[n])]
            except Exception:
                # Missing slot or unreadable product: report it and store nothing for it.
                print(url, '>>>', q)
            else:
                for row in rows:
                    for column, value in row.items():
                        data_dict[column].append(value)

            #Back to main
            webdriver.ActionChains(browser).send_keys(Keys.ESCAPE).perform()
            time.sleep(2)

### Checking Counts

In [278]:
# Print the number of collected values per column; all counts should match.
for column, values in data_dict.items():
    print(column, len(values))

Product Name 17
MRP 17
Price 17
SKU 17
Sub-Category 17


### Checking by name

In [270]:
# Show the index and all stored fields of every product whose name contains the search text.
search_text = 'SUGAR( சர்க்கரை)INTAKES'
for idx, name in enumerate(data_dict['Product Name']):
    if search_text not in name:
        continue
    print(idx)
    print(name,
          data_dict['MRP'][idx],
          data_dict['Price'][idx],
          data_dict['SKU'][idx],
          data_dict['Sub-Category'][idx])

4
SUGAR( சர்க்கரை)INTAKES-1kg 39.0 39.0 N/A Sugar
5
SUGAR( சர்க்கரை)INTAKES-500gm 20.0 20.0 N/A Sugar


### DELETE Product Name

Some items have only a name and none of the other required details, such as:
- SKU Number
- MRP
- Price
- Categories and Sub-Categories

Such items get added to Product Name, while the corresponding entries in the other lists of the dictionary are filled with the next item's details. This results in a different number of elements in the Product Name list than in the rest.

We have to delete them by identifying the index of that item in Product Name and entering it as the index in the cell below.

In [250]:
# Remove the orphaned name at the manually identified position.
del data_dict['Product Name'][360]

### MRP & Price updater

In some cases an item has more than two quantity options with different prices, and those prices might get mapped to the wrong quantity. To assign the correct prices to the respective quantities, run the code below.

(We need to give the index)

In [269]:
# Manual corrections: (row index, amount) - MRP and Price are both set to the amount.
for idx, amount in ((4, 39.), (5, 20.)):
    data_dict['MRP'][idx] = amount
    data_dict['Price'][idx] = amount

### Verify

In [251]:
# Print every collected row with its index so misaligned entries can be spotted by eye.
other_columns = ('MRP', 'Price', 'SKU', 'Sub-Category')
for row, name in enumerate(data_dict['Product Name']):
    print(row, name, *(data_dict[column][row] for column in other_columns))

0 Aachi Biryani Masala 50g 45.0 30.15 8906021120272 MASALA & SPICES
1 Aachi Chicken Kabab/65 Masala 50g 28.0 19.6 8906021120418 MASALA & SPICES
2 Aachi Chicken Masala 50g 28.0 18.9 8906021120463 MASALA & SPICES
3 Aachi Chilli Chicken Masala 50g 25.0 19.5 8906021120517 MASALA & SPICES
4 Aachi Chilli Powder 50g 22.0 14.3 8906021120562 MASALA & SPICES
5 Aachi Compounded Asafoetida-50g 26.0 23.4 8906021120128 MASALA & SPICES
6 Aachi Coriander Powder 50g 15.0 11.7 8906021120661 MASALA & SPICES
7 Aachi Curry Masala 50g 27.0 17.55 8906021120876 MASALA & SPICES
8 Aachi Egg Curry Masala 50g 31.0 22.01 8906021120968 MASALA & SPICES
9 Aachi Fish Curry Masala 50g 26.0 17.68 8906021121064 MASALA & SPICES
10 Aachi Garam Masala 50g 35.0 25.55 8906021121163 MASALA & SPICES
11 Aachi Idly Chilli Powder 50g 18.0 14.04 8906021121705 MASALA & SPICES
12 Aachi Kulambu Chilly Masala 100g 35.0 30.1 8906021121866 MASALA & SPICES
13 Aachi Kulambu Chilly Masala 50g 15.0 13.5 8906021121859 MASALA & SPICES
14 Aachi

119 FORTUNE BASMATI RICE 1kg FULL GRAIN 149.0 90.89 8906007287012 RICE, PUFFED RICE & POHA
120 FORTUNE BIRIYANI SPECIAL BASMATI RICE 1Kg 175.0 117.25 8906007283205 RICE, PUFFED RICE & POHA
121 FORTUNE Kachi Ghani Pure Mustard Oil-500ml 83.0 83.0 8906007280945 COOKING OILS
122 Fortune Sunflower Oil 1 Lit 142.0 112.0 8906007280242 COOKING OILS
123 Fortune Sunflower Oil 500 ml 65.0 55.0 8906007280235 COOKING OILS
124 FRIED YELLOW PEAS(வறுத்த மஞ்சள் பட்டாணி)INTAKES-100gm 12.0 12.0 N/A DALS & PULSES
125 FRIED YELLOW PEAS(வறுத்த மஞ்சள் பட்டாணி)INTAKES-200gm 22.0 22.0 N/A DALS & PULSES
126 Ganapathy Refined Groundnut Oil 1lit 212.0 170.0 8908006221226 COOKING OILS
127 Gold Winner Sunflower 1 Lit 134.0 134.0 8906010261078 COOKING OILS
128 Gold Winner Sunflower 500ml 68.0 68.0 8906010261047 COOKING OILS
129 Gold Winner Sunflower Oil 5Lit 634.0 634.0 8906010261139 COOKING OILS
130 Gold Winner Vanaspathi 100 ml 13.0 11.44 8906010262020 COOKING OILS
131 Gold Winner Vanaspathi 200 ml 23.0 21.39 890

266 SAKTHI CORIANDER POWDER 50g 15.0 10.05 8906002080212 MASALA & SPICES
267 SAKTHI CUMIN POWDER 50g 34.0 20.4 8906002082216 MASALA & SPICES
268 SAKTHI CURRY POWDER 100g 60.0 34.8 8906002080427 MASALA & SPICES
269 SAKTHI CURRY POWDER 50g 30.0 18.6 8906002080410 MASALA & SPICES
270 SAKTHI DHALL RICE POWDER 100g 42.0 27.3 8906002080816 MASALA & SPICES
271 SAKTHI EGG KURMA MASALA 50g 34.0 22.1 8906002081912 MASALA & SPICES
272 SAKTHI FISH CURRY MASALA 50g 22.0 15.4 8906002081776 MASALA & SPICES
273 SAKTHI FISH FRY MASALA 50g 26.0 17.16 8906002081714 MASALA & SPICES
274 SAKTHI Garam Masala-50g 35.0 23.8 8906002081813 MASALA & SPICES
275 SAKTHI GARLIC RICE POWDER 100g 48.0 30.72 8906002080915 MASALA & SPICES
276 SAKTHI MIXED MASALA KULAMBU CHILLI POWDER 50g 18.0 11.7 8906002082612 MASALA & SPICES
277 SAKTHI MUTTON MASALA 100g 62.0 37.82 8906002081424 MASALA & SPICES
278 SAKTHI MUTTON MASALA 50g 31.0 19.84 8906002081417 MASALA & SPICES
279 SAKTHI PEPPER CHICKEN MASALA 50g 40.0 25.6 890600208

#### Delete elements of Dictionary that need to be deleted.

Uncomment and give proper index

In [None]:
# Template for manually dropping one entry from the column lists of data_dict.
# Uncomment the lines that apply and replace the indices before running.
# NOTE(review): the data_dict built in the scraping cells has no 'Category' key,
# so the 'Category' line would raise KeyError if uncommented as-is.
#del(data_dict['Product Name'][308])
#del(data_dict['MRP'][120])
#del(data_dict['Price'][120])
#del(data_dict['SKU'][120])
#del(data_dict['Category'][120])
#del(data_dict['Sub-Category'][120])

### Dictionary to DataFrame

In [279]:
# Turn the column lists into a table: one column per key of data_dict.
df = pd.DataFrame(data=data_dict)
df.head(n=5)

Unnamed: 0,Product Name,MRP,Price,SKU,Sub-Category
0,BRITANNIA 50-50 TIMEPASS Classic salted – 78g,10.0,9.6,8901063018280,"BISCUITS, PACKAGED FOODS"
1,BRITANNIA CAKE GOBBLES-55g( CHOCO CHILLS),15.0,14.4,8901063362772,"CAKE AND CUP CAKES, PACKAGED FOODS"
2,BRITANNIA CAKE ROLL YO Choco Swiss roll,10.0,9.6,8901063362956,"CAKE AND CUP CAKES, PACKAGED FOODS"
3,BRITANNIA GOOD DAY Cashew Cookies – 100g,20.0,19.3,8901063093089,"BISCUITS, PACKAGED FOODS"
4,BRITANNIA LITTLE HEARTS Strawberry,10.0,10.0,8901063019119,"CEREAL MIXES, PACKAGED FOODS"
5,BRITANNIA MARIE GOLD – 120g,15.0,14.32,8901063162136,"BISCUITS, PACKAGED FOODS"
6,BRITANNIA TIGER KREEMZ Choco,5.0,5.0,8901063164130,"BISCUITS, PACKAGED FOODS"
7,BRITANNIA TIGER KREEMZ Choco vanilla,5.0,5.0,8901063164185,"BISCUITS, PACKAGED FOODS"
8,BRITANNIA TIGER KREEMZ Elaichi – 86g,10.0,10.0,8901063164239,"BISCUITS, PACKAGED FOODS"
9,BRITANNIA TREAT Creme Wafers Choco – 33g,10.0,5.0,8901063032231,PACKAGED FOODS


### Respective DataFrame names for each Category

#### Changing DataFrame to respective named df

In [283]:
# Keep this category's rows under their own name. A copy is taken (rather than
# a second name for the same object) so that later edits to the category frame
# do not silently change `df`.
df_Uncategorized = df.copy()

#### Adding Category Column and assigning value

In [284]:
# Label every row of this frame with its parent category.
category_label = 'UNCATEGORIZED'
df_Uncategorized['Category'] = category_label

In [285]:
# Preview the first five rows, now including the Category column.
df_Uncategorized.head(n=5)

Unnamed: 0,Product Name,MRP,Price,SKU,Sub-Category,Category
0,BRITANNIA 50-50 TIMEPASS Classic salted – 78g,10.0,9.6,8901063018280,"BISCUITS, PACKAGED FOODS",UNCATEGORIZED
1,BRITANNIA CAKE GOBBLES-55g( CHOCO CHILLS),15.0,14.4,8901063362772,"CAKE AND CUP CAKES, PACKAGED FOODS",UNCATEGORIZED
2,BRITANNIA CAKE ROLL YO Choco Swiss roll,10.0,9.6,8901063362956,"CAKE AND CUP CAKES, PACKAGED FOODS",UNCATEGORIZED
3,BRITANNIA GOOD DAY Cashew Cookies – 100g,20.0,19.3,8901063093089,"BISCUITS, PACKAGED FOODS",UNCATEGORIZED
4,BRITANNIA LITTLE HEARTS Strawberry,10.0,10.0,8901063019119,"CEREAL MIXES, PACKAGED FOODS",UNCATEGORIZED


### List of all the 11 DataFrames

In [None]:
# The eleven per-category frames, in category-number order.
[
    df_Baby_Products,
    df_Beverages,
    df_DAIRYnFROZEN,
    df_Health_Care,
    df_Home_Care,
    df_Oral_Care,
    df_Packaged_Foods,
    df_Personal_Care,
    df_Staples,
    df_Sugar,
    df_Uncategorized,
]

### Concatenating all dfs into one

In [287]:
# Stack the per-category frames into one table, in category-number order.
# ignore_index=True renumbers the rows 0..N-1; without it every category keeps
# its own 0..k index and the combined frame carries duplicate row labels.
df_Intake = pd.concat(
    [
        df_Baby_Products,
        df_Beverages,
        df_DAIRYnFROZEN,
        df_Health_Care,
        df_Home_Care,
        df_Oral_Care,
        df_Packaged_Foods,
        df_Personal_Care,
        df_Staples,
        df_Sugar,
        df_Uncategorized,
    ],
    ignore_index=True,
)

In [288]:
# (number of rows, number of columns) of the combined table.
n_rows, n_cols = df_Intake.shape
(n_rows, n_cols)

(2916, 6)

In [289]:
# Preview the first five rows of the combined table.
df_Intake.head(n=5)

Unnamed: 0,Product Name,MRP,Price,SKU,Sub-Category,Category
0,Abbott Pediasure Oats and Almond Chocolate Fla...,395.0,394.9,8904145912049,BABY FOODS,BABY PRODUCTS
1,Abbott Pediasure Premium Chocolate Flavour 200g,295.0,283.2,8904145911912,BABY FOODS,BABY PRODUCTS
2,Abbott Pediasure Vanilla Delight Falvour 400g,545.0,510.94,8904145912278,BABY FOODS,BABY PRODUCTS
3,Abbott Pediasure Vanilla Flavour 400g,395.0,394.9,8904145912025,BABY FOODS,BABY PRODUCTS
4,BABY DOVE Baby Bathing Bar 75g,52.0,50.44,8901030764882,OTHERS,BABY PRODUCTS


### Saving to Excel Sheets

In [290]:
# Sheet name -> frame, in the order the sheets are written to the workbook:
# the combined table first, then one sheet per category.
sheets = {
    'All Categories': df_Intake,
    'BABY PRODUCTS': df_Baby_Products,
    'BEVERAGES': df_Beverages,
    'DAIRY & FROZEN': df_DAIRYnFROZEN,
    'HEALTH CARE & NUTRITION': df_Health_Care,
    'HOME CARE': df_Home_Care,
    'ORAL CARE': df_Oral_Care,
    'PACKAGED FOODS': df_Packaged_Foods,
    'PERSONAL CARE': df_Personal_Care,
    'STAPLES': df_Staples,
    'SUGAR': df_Sugar,
    'UNCATEGORIZED': df_Uncategorized,
}

# The context manager saves and closes the workbook when the block ends.
# It replaces the explicit `writer.save()` call, which was removed in pandas 2.0.
with pd.ExcelWriter("Intakes_data.xlsx", engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)