### Scraping Gsmarena.com.bd using BeautifulSoup

---
##### Required Libraries


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import unicodedata
import re
import time
import timeit
import datetime

# Browser-like request headers sent with every page fetch in this notebook.
# Built from ordered pairs so the header order stays exactly as listed.
headers = dict((
    ('authority', 'www.gsmarena.com.bd'),
    ('pragma', 'no-cache'),
    ('cache-control', 'no-cache'),
    ('dnt', '1'),
    ('upgrade-insecure-requests', '1'),
    ('user-agent', 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'),
    ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
    ('sec-fetch-site', 'none'),
    ('sec-fetch-mode', 'navigate'),
    ('sec-fetch-dest', 'document'),
    ('accept-language', 'en-GB,en-US;q=0.9,en;q=0.8'),
))

### Function Definitions

---
1. **`get_brands()`** returns a list of all the brands' URLs.
2. **`get_products_link(url)`** takes the URL of a particular brand and returns the URLs of all its phones, following pagination.
3. **`get_products_spec(all_product_link)`** takes a list of phones' URLs and returns a list of the phones' specifications, discarding smart watches and feature phones.







In [None]:
# Get the Brands Urls
def get_brands():
  """Return the URL of every brand page linked from the brands index.

  Fetches https://www.gsmarena.com.bd/brands/ using the module-level
  ``headers`` and collects the ``href`` of each brand thumbnail link.

  Returns:
    list: brand page URLs in page order. Empty when the page contains no
      matching links or parsing fails with ``AttributeError``.

  Raises:
    requests.RequestException: if the request fails or times out.
  """
  url = "https://www.gsmarena.com.bd/brands/"
  # Bound before the try so the final return can never hit an unbound name.
  brand_list = []
  try:
    # A timeout keeps a stalled connection from hanging the run forever.
    req = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(req.text, 'html.parser')

    for b in soup.select('div.product-thumb div.image a'):
      href = b.get("href")
      # Skip anchors without an href: a None entry is not a fetchable URL.
      if href:
        brand_list.append(href)

  except AttributeError as e:
    print(e)

  return brand_list


# Get all the Phones' Urls
def get_products_link(url):
  """Return the product links found on a brand page and its later pages.

  Each page is fetched with the module-level ``headers``; every
  ``div.product-thumb a`` link that carries an ``href`` is collected. The
  last link inside ``ul.pagination`` is taken as the next page to visit.
  Crawling stops when there is no pagination, when that link is ``"#"``
  or has no ``href``, or when it points at a page that was already
  visited.

  Args:
    url: URL of the first page of a brand. An empty or ``None`` value
      yields an empty list without any request being made.

  Returns:
    list: product ``href`` values in the order they were encountered.

  Raises:
    requests.RequestException: if a request fails or times out.
  """
  product = []
  # Remember fetched pages so a pagination cycle cannot loop forever.
  visited = set()
  while url and url not in visited:
    visited.add(url)
    # A timeout keeps a stalled connection from hanging the crawl.
    req = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(req.text, 'html.parser')

    for l in soup.select('div.product-thumb a'):
      href = l.get("href")
      # Anchors without an href are not fetchable product pages.
      if href:
        product.append(href)

    # Pagination: an empty result covers both "no pagination block" and
    # "pagination block without links", so page[-1] is always safe.
    page = soup.select("ul.pagination li a")
    if not page:
      break
    url = page[-1].get("href")
    if url == "#":
      break

  return product


# Get the specifications of all the phones
def get_products_spec(all_product_link):
  """Fetch each product page and return the specifications of the phones.

  Every link is requested with the module-level ``headers`` and its
  specification tables are parsed into a dict. Products whose ``Category``
  is "smart watch" or "feature phone" (case-insensitive) are discarded.
  A link whose request or parsing fails, or whose page has no
  specification rows, is reported with ``print`` and skipped so that one
  bad page does not abort the whole run. Progress counters are printed as
  products are kept or discarded.

  Args:
    all_product_link: iterable of product page URLs.

  Returns:
    list: one dict per kept product, mapping specification headings to a
      string (or a list of strings when a heading spans several rows),
      plus a ``"url"`` key holding the product link.
  """
  product_count = 1
  discarded_count = 1
  all_product = []
  for link in all_product_link:
    try:
      # A timeout keeps a stalled connection from hanging the run.
      req = requests.get(link, headers=headers, timeout=30)
      soup = BeautifulSoup(req.text, 'html.parser')
      spec = _parse_spec_tables(soup)
    except (AttributeError, requests.RequestException) as e:
      print(e)
      continue

    if not spec:
      print("No specification found: ", link)
      continue

    # A missing Category must not abort the scrape; a heading that spans
    # several rows is stored as a list, so compare its first value.
    category = spec.get('Category', '')
    if isinstance(category, list):
      category = category[0] if category else ''
    category = category.lower()

    # Removing Smart Watches and Feature Phones (Non-Smart Phones) from the list
    if category == 'smart watch':
      print("Smart Watch: ", discarded_count)
      discarded_count = discarded_count + 1
    elif category == 'feature phone':
      print("Feature phone Discarged: ", discarded_count)
      discarded_count = discarded_count + 1
    else:
      spec["url"] = link
      all_product.append(spec)
      print(product_count)
      product_count = product_count + 1

  return all_product


def _parse_spec_tables(soup):
  """Collect heading -> value pairs from the spec tables of a parsed page."""
  spec = {}
  for table in soup.find_all('table', 'table table-striped'):
    # Heading of the most recent row in this table that had one.
    last_head = None
    # Walk the <tr> rows explicitly: a table's direct children can also be
    # whitespace text nodes, which have no th/td.
    for row in table.find_all('tr'):
      th, td = row.th, row.td
      if th is None or td is None:
        continue
      # NFKD turns non-breaking spaces into plain spaces.
      head = unicodedata.normalize('NFKD', th.text)
      body = unicodedata.normalize('NFKD', td.text).strip()

      if head.strip():
        # Headings are stored as found (not stripped) to keep the keys as is.
        spec[head] = body
        last_head = head
      elif last_head is not None:
        # A blank heading means the row continues the previous heading.
        if not isinstance(spec[last_head], list):
          spec[last_head] = [spec[last_head]]
        spec[last_head].append(body)
      # A blank heading with nothing before it has no key to attach to.
  return spec


# Time the full pipeline: brands -> product links -> specifications -> JSON.
start = timeit.default_timer()

brands = get_brands()
all_product_link = []
for brand in brands:
  # In-place concatenation, one brand's product links at a time.
  all_product_link += get_products_link(brand)

all_product = get_products_spec(all_product_link)

# Persist the collected specifications as indented JSON.
with open('all_product_spec_06_09_21.json', 'w') as outfile:
    json.dump(obj=all_product, fp=outfile, indent=4)

stop = timeit.default_timer()
print('Time: ', stop - start)