In [1]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

import sys
import os
from pypdf import PdfReader
from pypdf import PdfWriter
from pathlib import Path

In [2]:
pd.set_option('display.max.columns', 7)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max.rows', 2000)
pd.set_option("display.float_format", lambda x: "%.2f" % x )

# Scraping Jumia Website

Scraping the jumia website by cateory

## Core variables

In [3]:
#The host url
host_url = "https://www.jumia.com.ng/"


categoryName = "laptops"
categoryTitle = categoryName.replace("-", " ").title()
outputFileName = categoryName.replace("-", "_")

#The host category url
host_category_url = f"{host_url}/{categoryName}"

#Total PageCount
pagesCount = 50

## Core functions

In [4]:
def loadSoup(host_url):
    #Set headers User-Agent variables for the get request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
    }
    
    try:
        # Make the HTTP GET request with the specified headers
        response = requests.get(host_url, headers=headers)
        response.raise_for_status()  # Raise an exception for bad status codes
    
        # Parse the HTML content using Beautiful Soup
        soup = BeautifulSoup(response.content, 'html.parser')
    
        # Now you can work with the 'soup' object to extract data
        # print(soup.prettify())

        return(soup)
    
    except requests.exceptions.RequestException as e:
        return(False)
        print(f"Error making request: {e}")

In [5]:
#A function that accepts the itemsList and returns a list of structured dicts
# containing 'name', 'price', 'currency', 'rating', 'link', 'img'
def loadItem(itemsList, categoryTitle):
    #the array that contains all objects
    thisItemsObjs = []

    #loop through the item list
    for thisItem in itemsList:
        #This thisItem Link
        thisItem_link = thisItem.find("a", class_="core")['href']

        #This thisItem img
        thisItem_img = thisItem.find("div", class_="img-c").find("img", class_="img")['data-src']

        #This thisItem name
        thisItem_name = thisItem.find("div", class_="info").find("h3", class_="name").text.strip()

        #This thisItem price
        thisItem_price = thisItem.find("div", class_="info").find("div", class_="prc").text.split(" ")

        #This thisItem rating
        thisItem_rating = thisItem.find("div", class_="info").find("div", class_="rev").find("div", class_="stars _s").text if (thisItem.find("div", class_="info").find("div", class_="rev")) else "No Rating"

        #Setting up this ItemObj dict
        thisItemObj = {
            "name": f"{thisItem_name}",
            "price": f"{thisItem_price[1]}".replace(",", ""),
            "currency": "NGN" if (thisItem_price[0].strip() == "₦") else "NGN",
            "rating": f"{thisItem_rating}".split(" out of ")[0],
            "link": f"{host_url}{thisItem_link}",
            "img": f"{thisItem_img}", 
            "categpry": f"{categoryTitle}"
        }
        # Append the dict to the thisItemsObjs list
        thisItemsObjs.append(thisItemObj)
    
    #return thisItemsObjs list
    return(thisItemsObjs)

## Load Items

In [7]:
#itemsObjs that hold all itemObj
itemsObjs = []

# a lost of pageLinks
pageLinks = []


#Loop through the pages
print("Loading...Please wait...")
for thisPageIdx in np.arange(pagesCount):
    #set up the page_url 
    page_url = "{}/?page={}".format(host_category_url, thisPageIdx+1) if (thisPageIdx+1 > 1) else "{}/".format(host_category_url)

    #load the soup using the loadSoup function above
    soup = loadSoup(page_url)

    #if the soup is loaded
    if (soup != False):
        #Get the itemSections
        itemSections = soup.find(class_="-pvs col12").find("div", class_="-phs -pvxs row _no-g _4cl-3cm-shs")

        #get the itemsList
        itemsList = itemSections.find_all("article")

        #load the ItemObj and append to itemsObjs
        itemsObjs.extend(loadItem(itemsList, categoryTitle))

    # THe page url obj
    page_url_obj = {}
    page_url_obj['page_url'] = page_url
    page_url_obj['items_found'] = len(itemsList)

    #Append the page url obj to the global pageLinks
    pageLinks.append(page_url_obj)


    # Completion Message
    if (thisPageIdx == pagesCount - 1): print(len(itemsObjs), " records found!", "Proceed")

Loading...Please wait...
2000  records found! Proceed


## Dataframe for the pageLinks

In [8]:
#Dataframe of the list of pageLinks extracted from
df1_pageLinks = pd.DataFrame(pageLinks)
df1_pageLinks

Unnamed: 0,page_url,items_found
0,https://www.jumia.com.ng//laptops/,40
1,https://www.jumia.com.ng//laptops/?page=2,40
2,https://www.jumia.com.ng//laptops/?page=3,40
3,https://www.jumia.com.ng//laptops/?page=4,40
4,https://www.jumia.com.ng//laptops/?page=5,40
5,https://www.jumia.com.ng//laptops/?page=6,40
6,https://www.jumia.com.ng//laptops/?page=7,40
7,https://www.jumia.com.ng//laptops/?page=8,40
8,https://www.jumia.com.ng//laptops/?page=9,40
9,https://www.jumia.com.ng//laptops/?page=10,40


## Dataframe for the itemObjs

In [25]:
#Dataframe of the itemObjs
df1_itemsObjs = pd.DataFrame(itemsObjs)
df1_itemsObjs = df1_itemsObjs[['name', 'price', 'categpry', 'currency', 'rating', 'img', 'link']]
df1_itemsObjs.head()

Unnamed: 0,name,price,categpry,currency,rating,img,link
0,Ace Elec 14.1'' Intel(R)Pentium(R) CPU N3700 16GB+128GB - Silver,213999,Laptops,NGN,3.6,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/19/8487963/1.jpg?3957,https://www.jumia.com.ng//ace-elec-14.1-intelrpentiumr-cpu-n3700-16gb128gb-silver-369784891.html
1,Macbook PRO Laptop A1278 13.3 Inch Core I5 2.5GHz 8GB RAM 500GB EN/AR Keyboard,185999,Laptops,NGN,3.5,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/54/8858693/1.jpg?9433,https://www.jumia.com.ng//renewed-macbook-pro-laptop-a1278-13.3-inch-core-i5-2.5ghz-8gb-ram-500gb-enar-keyboard-396858845.html
2,"Blueing 15.6"" Laptop J4125 8GB+256GB SSD Student Portable Computer",234061,Laptops,NGN,3.9,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/10/8071473/1.jpg?0416,https://www.jumia.com.ng//blueing-15.6-laptop-j4125-8gb256gb-ssd-student-portable-computer-374170801.html
3,DELL Latitude 11 Intel Celeron 4GB RAM- 64GB HDD WIN 10+ BAG,135000,Laptops,NGN,3.7,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/12/4395104/1.jpg?7396,https://www.jumia.com.ng//stream-11-pro-intel-celeron-4gb-ram-64gb-hdd-win-10-bag-hp-mpg11437537.html
4,Hp Hp Hp EliteBook 840 G7 10th Gen Intel Core I5 Touchscreen 16GB RAM/512GB SSD Win 11 Pro,529200,Laptops,NGN,4.1,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/38/6886101/1.jpg?2059,https://www.jumia.com.ng//elitebook-840-g6-intel-core-i5-16gb-ram512gb-ssdbacklit-keyboardfp-reader-windows-11-pro-bag-hp-mpg4091619.html


## Cleansing the df1_itemsObjs

In [26]:
df1_itemsObjs.shape

(2000, 7)

In [27]:
df1_itemsObjs_clickable = df1_itemsObjs.style.format(hyperlinks = "html")

In [35]:
df1_itemsObjs_clickable

Unnamed: 0,name,price,categpry,currency,rating,img,link
0,Ace Elec 14.1'' Intel(R)Pentium(R) CPU N3700 16GB+128GB - Silver,213999,Laptops,NGN,3.6,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/19/8487963/1.jpg?3957,https://www.jumia.com.ng//ace-elec-14.1-intelrpentiumr-cpu-n3700-16gb128gb-silver-369784891.html
1,Macbook PRO Laptop A1278 13.3 Inch Core I5 2.5GHz 8GB RAM 500GB EN/AR Keyboard,185999,Laptops,NGN,3.5,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/54/8858693/1.jpg?9433,https://www.jumia.com.ng//renewed-macbook-pro-laptop-a1278-13.3-inch-core-i5-2.5ghz-8gb-ram-500gb-enar-keyboard-396858845.html
2,"Blueing 15.6"" Laptop J4125 8GB+256GB SSD Student Portable Computer",234061,Laptops,NGN,3.9,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/10/8071473/1.jpg?0416,https://www.jumia.com.ng//blueing-15.6-laptop-j4125-8gb256gb-ssd-student-portable-computer-374170801.html
3,DELL Latitude 11 Intel Celeron 4GB RAM- 64GB HDD WIN 10+ BAG,135000,Laptops,NGN,3.7,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/12/4395104/1.jpg?7396,https://www.jumia.com.ng//stream-11-pro-intel-celeron-4gb-ram-64gb-hdd-win-10-bag-hp-mpg11437537.html
4,Hp Hp Hp EliteBook 840 G7 10th Gen Intel Core I5 Touchscreen 16GB RAM/512GB SSD Win 11 Pro,529200,Laptops,NGN,4.1,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/38/6886101/1.jpg?2059,https://www.jumia.com.ng//elitebook-840-g6-intel-core-i5-16gb-ram512gb-ssdbacklit-keyboardfp-reader-windows-11-pro-bag-hp-mpg4091619.html
5,Hp EliteBook 840 G6 TOUCHSCREEN Core I5-16GB RAM/1TB SSD/Backlit Keyboard/Windows 11 Pro+BAG,585000,Laptops,NGN,3.5,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/72/7080982/1.jpg?3106,https://www.jumia.com.ng//hp-elitebook-840-g6-touchscreen-core-i5-16gb-ram1tb-ssdbacklit-keyboardwindows-11-probag-289080727.html
6,Hp Stream 11 - Intel Celeron- Education Edition - 4GB RAM - 64GB HDD Windows 10 Pro+ Keyboard USB Light,160000,Laptops,NGN,4.1,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/83/7448603/1.jpg?3935,https://www.jumia.com.ng//hp-stream-11-intel-celeron-education-edition-4gb-ram-64gb-hdd-windows-10-pro-keyboard-usb-light-306844738.html
7,Hp Stream 11 Intel Celeron Quad Core - 64GB SSD 4GB RAM Windows 10 PRO+ Mouse &USB Light For Keyboard,140000,Laptops,NGN,4,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/49/7488463/1.jpg?9385,https://www.jumia.com.ng//stream-11-intel-celeron-quad-core-64gb-ssd-4gb-ram-windows-10-pro-mouse-usb-light-for-keyboard-hp-mpg6919499.html
8,"Hp CHROMEBOOK 11 , INTEL CELERON, 4GB RAM,16GB EMMC + usb light",98000,Laptops,NGN,3.8,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/24/3136404/1.jpg?9439,https://www.jumia.com.ng//hp-chromebook-11-intel-celeron-4gb-ram16gb-emmc-usb-light-404631342.html
9,"Blueing 14"" Laptop N3350 6GB+192GB SSD Portable Computer Student Pc",223775,Laptops,NGN,4,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/14/5971473/1.jpg?4360,https://www.jumia.com.ng//blueing-14-laptop-n3350-6gb192gb-ssd-portable-computer-student-pc-374179541.html


In [36]:
df1_itemsObjs.head()

Unnamed: 0,name,price,categpry,currency,rating,img,link
0,Ace Elec 14.1'' Intel(R)Pentium(R) CPU N3700 16GB+128GB - Silver,213999,Laptops,NGN,3.6,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/19/8487963/1.jpg?3957,https://www.jumia.com.ng//ace-elec-14.1-intelrpentiumr-cpu-n3700-16gb128gb-silver-369784891.html
1,Macbook PRO Laptop A1278 13.3 Inch Core I5 2.5GHz 8GB RAM 500GB EN/AR Keyboard,185999,Laptops,NGN,3.5,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/54/8858693/1.jpg?9433,https://www.jumia.com.ng//renewed-macbook-pro-laptop-a1278-13.3-inch-core-i5-2.5ghz-8gb-ram-500gb-enar-keyboard-396858845.html
2,"Blueing 15.6"" Laptop J4125 8GB+256GB SSD Student Portable Computer",234061,Laptops,NGN,3.9,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/10/8071473/1.jpg?0416,https://www.jumia.com.ng//blueing-15.6-laptop-j4125-8gb256gb-ssd-student-portable-computer-374170801.html
3,DELL Latitude 11 Intel Celeron 4GB RAM- 64GB HDD WIN 10+ BAG,135000,Laptops,NGN,3.7,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/12/4395104/1.jpg?7396,https://www.jumia.com.ng//stream-11-pro-intel-celeron-4gb-ram-64gb-hdd-win-10-bag-hp-mpg11437537.html
4,Hp Hp Hp EliteBook 840 G7 10th Gen Intel Core I5 Touchscreen 16GB RAM/512GB SSD Win 11 Pro,529200,Laptops,NGN,4.1,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/38/6886101/1.jpg?2059,https://www.jumia.com.ng//elitebook-840-g6-intel-core-i5-16gb-ram512gb-ssdbacklit-keyboardfp-reader-windows-11-pro-bag-hp-mpg4091619.html


In [37]:
#Check for duplicates
duplicateCount = df1_itemsObjs.duplicated().sum()
df1_itemsObjs_duplicate = False
if (duplicateCount > 0):
    #Save duplicates to a seperate dataframe
    df1_itemsObjs_duplicate = df1_itemsObjs[df1_itemsObjs.duplicated()]

    #Filter the duplicates off the main dataframe
    df1_itemsObjs = df1_itemsObjs[~df1_itemsObjs.duplicated()]
    print(duplicateCount, "Duplicate Removed!")
else: print("No Duplicate Found!")

No Duplicate Found!


In [47]:
#Check the datatype of the columns
df1_itemsObjs.dtypes

name         object
price       float64
categpry     object
currency     object
rating      float64
img          object
link         object
dtype: object

In [39]:
#Clean the rating column change no rating to 0
df1_itemsObjs['rating'] = df1_itemsObjs['rating'].apply(lambda x: x.replace("No Rating", "0"))

In [40]:
#Convert the price and rating column to float
df1_itemsObjs['price'] = df1_itemsObjs['price'].astype("float")
df1_itemsObjs['rating'] = df1_itemsObjs['rating'].astype("float")

In [41]:
df1_itemsObjs.dtypes

name         object
price       float64
categpry     object
currency     object
rating      float64
img          object
link         object
dtype: object

In [42]:
#Check for na
df1_itemsObjs.isna().sum()

name        0
price       0
categpry    0
currency    0
rating      0
img         0
link        0
dtype: int64

In [43]:
#Check for nulls
df1_itemsObjs.isnull().sum()

name        0
price       0
categpry    0
currency    0
rating      0
img         0
link        0
dtype: int64

In [44]:
df1_itemsObjs.sort_values(['price'], ascending = False)

Unnamed: 0,name,price,categpry,currency,rating,img,link
929,"Asus ROG STRIX G815 (G815LW-G18.U95080) INTEL CORE ULTRA 9 32GB DDR5 RAM 2TB SSD 18"" WQXGA SCREEN DISPLAY 16GB NVIDIA RTX 5080 GRAPHIC CARD",5500000.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/39/3767814/1.jpg?2968,https://www.jumia.com.ng//asus-rog-strix-g815-g815lw-g18.u95080-intel-core-ultra-9-32gb-ddr5-ram-2tb-ssd-18-wqxga-screen-display-16gb-nvidia-rtx-5080-graphic-card-418767393.html
554,Samsung GALAXY BOOK5 PRO 360 (NP960QHA),4000000.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/91/7257814/1.jpg?0961,https://www.jumia.com.ng//samsung-galaxy-book5-pro-360-np960qha-418752719.html
1479,"Asus ROG ZEPHYRUS GAMING G16 14TH GEN INTEL CORE ULTRA 9 16GB RAM 1TB SSD 16""OLED NVIDIA RTX 4070 (8GB) BACKLIT KEYBOARD WINDOWS 11",3900000.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/56/2005814/1.jpg?5349,https://www.jumia.com.ng//rog-zephyrus-gaming-g16-14th-gen-intel-core-ultra-9-16gb-ram-1tb-ssd-16oled-nvidia-rtx-4070-8gb-backlit-keyboard-windows-11-asus-mpg11487751.html
468,"Alienware M18 R2 GAMING 14TH GEN,CORE I7-14700HX,1TB SSD/16GB RAM, RTX 4070,18"" WIN 11",3900000.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/00/2261283/1.jpg?1208,https://www.jumia.com.ng//m18-r2-gaming-14th-gencore-i7-14700hx1tb-ssd16gb-ram-rtx-407018-win-11-alienware-mpg7383361.html
848,"Asus ROG ZEPHYRUS GAMING G16 14TH GEN INTEL CORE ULTRA 9 16GB RAM 1TB SSD 16""OLED NVIDIA RTX 4070 (8GB) BACKLIT KEYBOARD WINDOWS 11",3900000.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/74/6293814/1.jpg?0146,https://www.jumia.com.ng//rog-zephyrus-gaming-g16-14th-gen-intel-core-ultra-9-16gb-ram-1tb-ssd-16oled-nvidia-rtx-4070-8gb-backlit-keyboard-windows-11-asus-mpg11114637.html
1061,"Lenovo Legion 5i 15IAX10 - 15.1"" WQXGA OLED Display - 32GB RAM / 1TB SSD - Nvidia GeForce RTX 5070 (8GB) - Intel Core Ultra 7 255HX - Windows 11",3800000.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/19/7767814/1.jpg?3073,https://www.jumia.com.ng//lenovo-legion-5i-15iax10-15.1-wqxga-oled-display-32gb-ram-1tb-ssd-nvidia-geforce-rtx-5070-8gb-intel-core-ultra-7-255hx-windows-11-418767791.html
638,DELL XPS 9640 14TH GEN INTEL CORE ULTRA 7 16GB RAM 1TB SSD BACKLIT KEYBOARD NVIDIA RTX 4050 (6GB) FP READER WINDOWS 11,3800000.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/15/6293814/1.jpg?7909,https://www.jumia.com.ng//xps-9640-14th-gen-intel-core-ultra-7-16gb-ram-1tb-ssd-backlit-keyboard-nvidia-rtx-4050-6gb-fp-reader-windows-11-dell-mpg11125084.html
895,"DELL XPS 14 9440 Intel Core Ultra 7 155H 1TB SSD 32GB 14.5"" 3.2k OLED Infinity Touchscreen Display (3200 X 2000) WINDOWS 11 FP Reader,Backlit Keyboard, Platinum Silver",3700000.0,Laptops,NGN,5.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/00/0593404/1.jpg?2185,https://www.jumia.com.ng//dell-xps-14-9440-intel-core-ultra-7-155h-1tb-ssd-32gb-14.5-3.2k-oled-infinity-touchscreen-display-3200-x-2000-windows-11-fp-readerbacklit-keyboard-platinum-silver-404395000.html
368,"Hp Omen 16 Gaming - 16GB RAM / 1TB SSD - Nvidia GeForce RTX 5060 (8GB) - Intel Core Ultra 7 255H - 16.0"" 2K (1920 x 1200) IPS LED Display - Windows 11.",3100000.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/67/2217814/1.jpg?1646,https://www.jumia.com.ng//hp-omen-16-gaming-16gb-ram-1tb-ssd-nvidia-geforce-rtx-5060-8gb-intel-core-ultra-7-255h-16.0-2k-1920-x-1200-ips-led-display-windows-11.-418712276.html
843,"Apple MacBook Air 15"" - M4, 512 GB SSD, Silver - 2025",2999999.0,Laptops,NGN,0.0,https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/25/3909614/1.jpg?3161,https://www.jumia.com.ng//apple-macbook-air-15-m4-512-gb-ssd-silver-2025-416909352.html


## Save to File

In [45]:
df1_itemsObjs.shape

(2000, 7)

In [66]:
folder_name = outputFileName
def createDfOutputFiles():
    if (df1_pageLinks is not False): 
        df1_pageLinks.to_csv(f"{folder_name}/{outputFileName}_pagelinks.csv")
        print(f"Created {outputFileName}_pagelinks.csv")
    if (df1_pageLinks is not False): 
        df1_itemsObjs.to_csv(f"{folder_name}/{outputFileName}.csv")
        print(f"Created {outputFileName}.csv")
    
osHasDir = False
if os.path.isdir(folder_name):
    osHasDir = True
else:    
    try:
        if (os.mkdir(folder_name)):
            osHasDir = True
        
    except FileExistsError:
        osHasDir = False
        print(f"Folder '{folder_name}' already exists.")

if (osHasDir == True) :
    createDfOutputFiles()

Created laptops_pagelinks.csv
Created laptops.csv
