SCRAPING JUMIA FOR DISCOUNTED ITEMS

In [1]:
import requests
from bs4 import BeautifulSoup
import re

#Define your website --Jumia--
base_url = "https://www.jumia.co.ke/"
all_products = "https://www.jumia.co.ke/all-products/"

#Send a HTTP GET request to all_products url using requests library
response = requests.get(all_products)

#Create a BeautifulSoup object by parsing the HTML content retrieved by requests library
soup = BeautifulSoup(response.text, "html.parser")

- Get products in the current page

In [2]:
products = soup.find_all('article', class_ = "prd _fb col c-prd")
print(len(products))

40


- Get the link to each product in that page

In [3]:
links = []
for product in products:
    links.append(base_url+product.find('a', class_ = "core")['href'])

links

['https://www.jumia.co.ke//nivea-radiant-beauty-advanced-care-lotion-for-women-400ml-pack-of-2-94993120.html',
 'https://www.jumia.co.ke//vitron-htc4388fs-43-smart-android-frameless-tv-black-93348943.html',
 'https://www.jumia.co.ke//nivea-perfect-radiant-even-tone-day-and-night-cream-for-women-50ml-68528030.html',
 'https://www.jumia.co.ke//ailyons-fk-0301-stainless-steel-1.8l-electric-kettle-black-34080919.html',
 'https://www.jumia.co.ke//nunix-r5-hot-normal-free-standing-water-dispenser-white-160082072.html',
 'https://www.jumia.co.ke//garnier-anti-blemish-charcoal-serum-with-aha-bha-niacinamide-for-acne-prone-skin-119519704.html',
 'https://www.jumia.co.ke//tecno-spark-10-6.6-fhd128gb-rom8gb-ram-5000mahblue-146192465.html',
 'https://www.jumia.co.ke//ailyons-fk-0301-stainless-steel-1.8l-electric-kettle-silver.-187952560.html',
 'https://www.jumia.co.ke//nokia-c32-64gb-4gb-upto-7gb-ram-5000mah-android-13-black-charcoal-150536966.html',
 'https://www.jumia.co.ke//ailyons-elp2404k-2.

- Loop through each product and get the name, price, discount, image*

In [4]:
product_response = requests.get('https://www.jumia.co.ke//fashion-couple-canvas-low-top-lace-up-shoes-classic-casual-sneakers-black-154042530.html')
product_soup = BeautifulSoup(product_response.text, "html.parser")
sku = product_soup.find('li', class_="-pvxs").text.split(':')[1].strip()

name = product_soup.find('h1', class_="-fs20 -pts -pbxs").text.strip()
price = product_soup.find('span', class_="-b -ubpt -tal -fs24 -prxs").text
price =  int(re.search(r'(\d[\d,]*)', price).group(1).replace(',', ''))
discount = int(product_soup.find('span', class_="bdg _dsct _dyn -mls").text.replace('%',''))



details = {
    'sku' : sku,
    'name' : name,
    'price' : price,
    'discount': discount
}
details

{'sku': 'FA113FS1PMVNXNAFAMZ',
 'name': 'Fashion Couple Canvas Low Top Lace-up Shoes Classic Casual Sneakers Black',
 'price': 999,
 'discount': 33}

- Loop through pages collecting links

In [5]:
products_list = []
for page in range(1,10):
    response = requests.get(f'https://www.jumia.co.ke/all-products/?page={page}#catalog-listing')
    products = soup.find_all('article', class_ = "prd _fb col c-prd")
    for product in products:
        products_list.append(product)
    print(f"Getting products for page {page}")
    

Getting products for page 1
Getting products for page 2
Getting products for page 3
Getting products for page 4
Getting products for page 5
Getting products for page 6
Getting products for page 7
Getting products for page 8
Getting products for page 9


- Combine Link collection and data collection

- Create an SQL connection to help save the data into a database

In [6]:
import mysql.connector
try:
    connection = mysql.connector.connect(
        host = 'localhost',
        port = 3306,
        user = 'root',
        database = 'jumia',
        password = 'qwerty'
    )
except Exception as e:
    print(e)
else:
    print('Successfully connected')

Successfully connected


- Create a cursor for executing queries

In [10]:
cursor = connection.cursor()
data_entry_query = f'INSERT INTO jumia_products (SKU, name, price, old_price, discount) VALUES (%s, %s,%s,%s,%s) ON DUPLICATE KEY UPDATE name = VALUES(name), price = VALUES(price), old_price = VALUES(old_price), discount = VALUES(discount);'

In [13]:
products_list = []
product_count = 0
for page in range(1,3000):
    response = requests.get(f'https://www.jumia.co.ke/all-products/?page={page}#catalog-listing')
    products = soup.find_all('article', class_ = "prd _fb col c-prd")
    for product in products:
        link = base_url+product.find('a', class_ = "core")['href']
        product_response = requests.get(link)
        product_soup = BeautifulSoup(product_response.text, "html.parser")

        sku = product_soup.find('li', class_="-pvxs").text.split(':')[1].strip()
        name = product_soup.find('h1', class_="-fs20 -pts -pbxs").text.strip()
        current_price = product_soup.find('span', class_="-b -ubpt -tal -fs24 -prxs").text
        current_price =  int(re.search(r'(\d[\d,]*)', current_price).group(1).replace(',', ''))
        old_price = product_soup.find('span', class_="-tal -gy5 -lthr -fs16 -pvxs -ubpt")
        if old_price is not None:
            old_price = old_price = int(re.search(r'(\d[\d,]*)', old_price.text).group(1).replace(',', ''))
        else:
            old_price = None
        discount = product_soup.find('span', class_="bdg _dsct _dyn -mls")
        if discount is not None:
            discount = int(discount.text.replace('%',''))
        else:
            discount = None
        

        details = {
            'sku' : sku,
            'name' : name,
            'price' : current_price,
            'old_price': old_price,
            'discount': discount
        }
        values = (sku, name, current_price, old_price, discount)

        print(f"Saving Page:{page} product: {product_count}")
        cursor.execute(data_entry_query, values)
        connection.commit()
        product_count += 1
   
        
        
    

Saving Page:1 product: 0
Saving Page:1 product: 1
Saving Page:1 product: 2
Saving Page:1 product: 3
Saving Page:1 product: 4
Saving Page:1 product: 5
Saving Page:1 product: 6
Saving Page:1 product: 7
Saving Page:1 product: 8
Saving Page:1 product: 9
Saving Page:1 product: 10
Saving Page:1 product: 11
Saving Page:1 product: 12
Saving Page:1 product: 13
Saving Page:1 product: 14
Saving Page:1 product: 15
Saving Page:1 product: 16
Saving Page:1 product: 17
Saving Page:1 product: 18
Saving Page:1 product: 19
Saving Page:1 product: 20
Saving Page:1 product: 21
Saving Page:1 product: 22
Saving Page:1 product: 23
Saving Page:1 product: 24
Saving Page:1 product: 25
Saving Page:1 product: 26
Saving Page:1 product: 27
Saving Page:1 product: 28
Saving Page:1 product: 29
Saving Page:1 product: 30
Saving Page:1 product: 31
Saving Page:1 product: 32
Saving Page:1 product: 33
Saving Page:1 product: 34
Saving Page:1 product: 35
Saving Page:1 product: 36
Saving Page:1 product: 37
Saving Page:1 product: