In [None]:
# importing libraries
from bs4 import BeautifulSoup
import requests
import json
import re
import urllib
import time
import logging

# Named logger for this notebook.
# NOTE(review): the code visible in this notebook logs through the module-level
# `logging.error(...)` calls (root logger) rather than through this object —
# confirm whether `logger` is still needed.
logger = logging.getLogger('scraper')
# Log line layout: source file, line number, function name padded to 20 columns, then the message.
FORMAT = "[%(filename)s:%(lineno)s - %(funcName)20s() ] %(message)s"
# Install FORMAT on the root logger's default handler.
logging.basicConfig(format=FORMAT)
# Uncomment the next two lines to emit debug-level messages through `logger`:
#logger.setLevel(logging.DEBUG)
#logger.debug('your message')

class web_scraping:
    """Download one product page and expose its fields through accessor methods.

    The page is fetched and parsed once in ``__init__``; every accessor then
    reads from ``self.soup``. The selectors (``productTitle``,
    ``priceblock_ourprice``, ...) target Amazon-style product markup.

    Scalar accessors return the string ``'NA'`` when the element is absent.
    List/dict accessors return an empty container when the element is absent
    and, for historical reasons, the string ``'NA'`` when an unexpected
    ``AttributeError`` is raised while parsing (``icons`` returns ``[]``).
    """

    def __init__(self, URL, data, timeout=30):
        """Fetch ``URL`` and build the BeautifulSoup tree.

        Args:
            URL: address of the product page to download.
            data: dict that ``Data()`` fills in and returns.
            timeout: seconds ``requests`` waits for the server before raising.
                Without it a single stalled connection would block a batch
                run forever.
        """
        self.URL = URL
        self.data = data

        # Browser-like headers; any other user agent string can be substituted.
        self.HEADERS = ({'User-Agent':
                'Mozilla/5.0 (X11; Linux x86_64)  AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
                                'Accept-Language': 'en-US, en;q=0.5'})

        # Make the HTTP request.
        self.webpage = requests.get(self.URL, headers=self.HEADERS, timeout=timeout)
        if self.webpage.status_code != 200:
            # The body is still parsed (best effort), but make it visible that
            # the resulting all-'NA' record comes from a failed response.
            logging.warning("Unexpected HTTP status %s for %s",
                            self.webpage.status_code, self.URL)

        # Soup object holding the whole parsed page.
        self.soup = BeautifulSoup(self.webpage.content, "lxml")

    def product_title(self):
        """Return the product title with commas removed, or ``'NA'``."""
        try:
            title = self.soup.find("span", attrs={"id": 'productTitle'})
            return title.string.strip().replace(',', '')
        except AttributeError:
            # Raised when the span is missing or has no single string child.
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def brand(self):
        """Return the text of the first ``a-size-base`` span inside the first
        ``td.a-span9`` cell, or ``'NA'``."""
        try:
            cell = self.soup.find("td", attrs={"class": 'a-span9'})
            if cell:
                return cell.find("span", attrs={'class': 'a-size-base'}).text
            return 'NA'
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def product_price(self):
        """Return the price as a digits-and-dot string, or ``'NA'``.

        The regular price element is preferred; the deal price element is the
        fallback.
        """
        try:
            for price_id in ('priceblock_ourprice', 'priceblock_dealprice'):
                tag = self.soup.find("span", attrs={'id': price_id})
                if tag:
                    price = tag.string.strip().replace(',', '')
                    # Drop the currency symbol and any other non-numeric text.
                    return re.sub(r'[^\d.,]+', '', price)
            return 'NA'
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return 'NA'

    def offers(self):
        """Return the first integer found in the ``saving-prompt`` span as a
        string, or ``'NA'``."""
        try:
            prompt = self.soup.find("span", attrs={'class': 'saving-prompt'})
            if prompt:
                # re.search returns None when there are no digits; the
                # resulting AttributeError is mapped to 'NA' below.
                return str(int(re.search(r'\d+', prompt.text).group()))
            return 'NA'
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def strike_product_price(self):
        """Return the struck-through (list) price as a float-formatted string,
        or ``'NA'``.

        Thousands separators are removed and the decimal part is kept, so
        ``"1,950.00"`` yields ``'1950.0'`` rather than ``'1.0'``.
        """
        try:
            tag = self.soup.find("span", attrs={'class': 'priceBlockStrikePriceString a-text-strike'})
            if tag:
                cleaned = tag.text.replace(',', '')
                return str(float(re.search(r'\d+(?:\.\d+)?', cleaned).group()))
            return 'NA'
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def fullfilled(self):
        """Return the text of the ``a-icon-text-fba`` span, or ``'NA'``."""
        try:
            tag = self.soup.find("span", attrs={'class': 'a-icon-text-fba'})
            if tag:
                return tag.text
            return 'NA'
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def rating(self):
        """Return the rating text (e.g. ``'4.5 out of 5 stars'``), or ``'NA'``.

        The 4.5-star icon is tried first; any page without it falls back to
        the first ``a-icon-alt`` span.
        """
        try:
            return self.soup.find("i", attrs={'class': 'a-icon a-icon-star a-star-4-5'}).string.strip().replace(',', '')
        except AttributeError:
            try:
                return self.soup.find("span", attrs={'class': 'a-icon-alt'}).string.strip().replace(',', '')
            except Exception:
                logging.error("Exception occurred", exc_info=True)
                return "NA"

    def reviews(self):
        """Return the review-count text with commas removed, or ``'NA'``."""
        try:
            tag = self.soup.find("span", attrs={'id': 'acrCustomerReviewText'})
            if tag:
                return tag.string.strip().replace(',', '')
            return 'NA'
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def availability(self):
        """Return the highlighted availability message, or ``'NA'``."""
        try:
            block = self.soup.find("div", attrs={'id': 'availability'})
            # block is None when the div is missing; the AttributeError raised
            # by the lookup below is what maps that case to 'NA'.
            message = block.find("span", attrs={'class': 'a-size-medium a-color-state'})
            if message:
                return message.string.strip().replace(',', '')
            return 'NA'
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def categories(self):
        """Return the breadcrumb category names as a list (empty if absent)."""
        try:
            categories = []
            breadcrumbs_div = self.soup.find("div", attrs={'id': 'showing-breadcrumbs_div'})
            if breadcrumbs_div:
                for item in breadcrumbs_div.find_all("span", attrs={'class': 'a-list-item'}):
                    # len() of a tag counts its child nodes; items with a single
                    # child are skipped (presumably the '›' separators — verify
                    # against live markup).
                    if len(item) > 1:
                        categories.append(item.text.strip().replace('›', ''))
            return categories
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def icons(self):
        """Return the labels found in the icon-farm row as a list.

        Returns ``[]`` (and logs the error) when the row is missing.
        """
        try:
            farm = self.soup.find("div", attrs={'class': 'a-row icon-farm-wrapper'})
            labels = farm.text.strip().replace('\n', '').split('  ')
            # Splitting on double spaces leaves empty / one-character fragments.
            return [label for label in labels if len(label) > 1]
        except Exception:
            logging.error("Exception occurred", exc_info=True)
            return []

    def product_details(self):
        """Return the detail bullets as a ``{label: value}`` dict.

        Only bullets that split into exactly two parts on ``':'`` are kept.
        Returns ``{}`` when the section is missing.
        """
        try:
            features_div = self.soup.find("div", attrs={'id': 'detailBullets_feature_div'})
            if features_div:
                features_ordered = features_div.find_all("span", attrs={'class': 'a-list-item'})
                prod_feat = {}
                try:
                    for item in features_ordered:
                        feat = item.text.strip().replace("\n", "").split(":")
                        if len(feat) == 2:
                            prod_feat[feat[0]] = feat[1]
                    return prod_feat
                except Exception:
                    # Best effort: a malformed bullet discards the section
                    # and falls through to the empty result.
                    logging.error("Exception occurred", exc_info=True)
            return {}
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def important_infos(self):
        """Return the flattened text of the important-information section,
        or ``'NA'``."""
        try:
            section = self.soup.find("div", attrs={'id': 'important-information'})
            if section:
                return section.text.strip().replace("\n", "")
            return 'NA'
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def bought_together(self):
        """Return the non-empty image alt texts from the
        ``sims-consolidated-1_feature_div`` block (empty list if absent)."""
        try:
            bought_together = []
            bought_together_div = self.soup.find("div", attrs={'id': 'sims-consolidated-1_feature_div'})
            if bought_together_div:
                for img in bought_together_div.find_all("img", alt=True):
                    if img['alt'] != '':
                        bought_together.append(img['alt'])
            return bought_together
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def subscription_discount(self):
        """Return the text of the ``discountTextLeft`` span, or ``'NA'``."""
        try:
            discount = self.soup.find("span", attrs={'class': 'discountTextLeft'})
            if discount:
                return discount.text
            return 'NA'
        except Exception:
            logging.error("Exception occurred", exc_info=True)
            return 'NA'

    def variations(self):
        """Return the alt text of each variation swatch image (empty list if
        the variation block is absent).

        Swatches without an ``alt`` attribute are skipped instead of raising
        ``KeyError``.
        """
        try:
            variations = []
            variations_div = self.soup.find("div", attrs={'id': 'variation_pattern_name'})
            if variations_div:
                for img in variations_div.find_all("img", attrs={'class': 'imgSwatch'}):
                    alt = img.get('alt')
                    if alt is not None:
                        variations.append(alt)
            return variations
        except AttributeError:
            logging.error("Exception occurred", exc_info=True)
            return "NA"

    def Data(self):
        """Fill ``self.data`` with every scraped field and return it.

        The dict passed to ``__init__`` is mutated in place.
        """
        self.data["Url"] = self.URL
        self.data["product Title"] = self.product_title()
        self.data["brand"] = self.brand()
        self.data["Products price"] = self.product_price()
        self.data["Offer_Count"] = self.offers()
        self.data["StrikeThrough_Products_price"] = self.strike_product_price()
        self.data["Fullfilled"] = self.fullfilled()
        self.data["Overall rating"] = self.rating()
        self.data["Total reviews"] = self.reviews()
        self.data["Availability"] = self.availability()
        self.data["Category Tree"] = self.categories()
        self.data["icons"] = self.icons()
        self.data["Product_Details"] = self.product_details()
        self.data["misc_info"] = self.important_infos()
        self.data["bought_together"] = self.bought_together()
        self.data["subscription_discount"] = self.subscription_discount()
        self.data["variations"] = self.variations()
        return self.data

    def complete_data(self):
        """Return the scraped fields as a list, in the same order ``Data()``
        stores them but without the URL."""
        return [
            self.product_title(),
            self.brand(),
            self.product_price(),
            self.offers(),
            self.strike_product_price(),
            self.fullfilled(),
            self.rating(),
            self.reviews(),
            self.availability(),
            self.categories(),
            self.icons(),
            self.product_details(),
            self.important_infos(),
            self.bought_together(),
            self.subscription_discount(),
            self.variations(),
        ]

In [None]:
import pandas as pd
# Load the list of product pages to scrape from a CSV in the working directory.
# The file must provide a 'URL' column; `urls` is the Series the scraping loop iterates over.
df = pd.read_csv("sh_url.csv")
urls = df['URL']

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive (Colab-only module).
# NOTE(review): nothing visible in this notebook reads from or writes to the mount —
# confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Scrape one batch of product pages, collecting one dict per page that parsed.
# Only rows 80-159 of `urls` are processed; adjust the slice to run another batch.
complete_data = []
for links in urls[80:160]:
    print("-"*50)
    data = {}
    print(links)
    try:
        product_info = web_scraping(links,data).Data()
        print(product_info)
        complete_data.append(product_info)
    except Exception as e:
        # Keep going with the next URL; the failure is logged with its traceback.
        logging.error("Exception occurred", exc_info=True)
        print("-"*20 + " Exception " + "-"*20)
        print(links)
        print(e)
    finally:
        # Throttle after every request, including failed ones: an error is
        # exactly when hitting the server again immediately is least wanted.
        time.sleep(5)

--------------------------------------------------
https://www.amazon.in/LOccitane-Aromachologie-Repairing-Shampoo-300ml/dp/B00E6KILTC/ref=sxin_10?asc_contentid=amzn1.osa.616a1218-209a-4c6e-b84c-511fbc004b38.A21TJRUUN4KGV.en_IN&asc_contenttype=article&ascsubtag=amzn1.osa.616a1218-209a-4c6e-b84c-511fbc004b38.A21TJRUUN4KGV.en_IN&creativeASIN=B00E6KILTC&cv_ct_cx=shampoo&cv_ct_id=amzn1.osa.616a1218-209a-4c6e-b84c-511fbc004b38.A21TJRUUN4KGV.en_IN&cv_ct_pg=search&cv_ct_we=asin&cv_ct_wn=osp-single-source-pecos-desktop&dchild=1&keywords=shampoo&linkCode=oas&pd_rd_i=B00E6KILTC&pd_rd_r=2dc6b676-5de4-46d9-965d-d5f5ec455d15&pd_rd_w=t9M3P&pd_rd_wg=av0AI&pf_rd_p=6567d3a4-28dd-430b-a035-b0af9623dfa4&pf_rd_r=1SK879FYV6GQC83CRS5N&qid=1622109910&sr=1-4-c84eb971-91f2-4a4d-acce-811265c24254&tag=timeszoom-21
{'Url': 'https://www.amazon.in/LOccitane-Aromachologie-Repairing-Shampoo-300ml/dp/B00E6KILTC/ref=sxin_10?asc_contentid=amzn1.osa.616a1218-209a-4c6e-b84c-511fbc004b38.A21TJRUUN4KGV.en_IN&asc_contenttype

In [None]:
# Display the records collected by the scraping loop for a quick visual check.
complete_data

[{'Availability': 'NA',
  'Category Tree': [],
  'Fullfilled': 'NA',
  'Offer_Count': 'NA',
  'Overall rating': '4.5 out of 5 stars',
  'Product_Details': {'ASIN\u200f': '\u200eB00E6KILTC',
   'Best Sellers Rank': '#30,285 in Beauty (See Top 100 in Beauty) #1,946 in Shampoos (Beauty)',
   'Country of Origin\u200f': '\u200eFrance',
   'Date First Available\u200f': '\u200e22 August 2014',
   'Is Discontinued By Manufacturer\u200f': '\u200eNo',
   'Item Dimensions LxWxH\u200f': '\u200e5.9 x 5.9 x 16.7 Centimeters',
   'Item Weight\u200f': '\u200e300 g',
   'Item model number\u200f': '\u200e17SH300G18',
   'Manufacturer\u200f': "\u200eL'Occitane",
   'Net Quantity\u200f': '\u200e300.0 millilitre',
   'Product Dimensions\u200f': '\u200e5.89 x 5.89 x 16.69 cm; 300 Grams'},
  'Products price': '1950.00',
  'StrikeThrough_Products_price': 'NA',
  'Total reviews': '737 ratings',
  'Url': 'https://www.amazon.in/LOccitane-Aromachologie-Repairing-Shampoo-300ml/dp/B00E6KILTC/ref=sxin_10?asc_content

In [None]:
import json
# Persist the scraped records as JSON. The context manager closes the file
# even if serialisation raises, so no handle (or half-flushed buffer) leaks.
with open("output_data_1.json", "w") as out_file:
    json.dump(complete_data, out_file)