In [20]:
import requests
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.common.exceptions import NoSuchElementException
import time

In [4]:
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET and return the body if it looks like HTML.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout `requests` can block indefinitely, so a
            single stalled host would hang the whole crawl.

    Returns:
        The raw response body (`resp.content`, i.e. bytes) when
        `is_good_response` accepts the response, otherwise None. None is
        also returned, after the error is logged, when the request raises
        a `RequestException` (timeouts included).
    """
    try:
        # closing() releases the streamed connection even on early return.
        with closing(requests.get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [5]:
def is_good_response(resp):
    """
    Tell whether a response looks like a successful HTML page.

    Args:
        resp: Response-like object exposing `status_code` and a `headers`
            mapping.

    Returns:
        True when the status code is 200 and the Content-Type header
        mentions "html" (case-insensitive); False otherwise, including
        when the Content-Type header is missing.
    """
    # A missing header must mean "not HTML" rather than a KeyError.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type

In [6]:
def log_error(e):
    """
    Report an error by printing it to standard output.

    Kept as a single hook so the reporting strategy can be swapped
    later without touching the callers.
    """
    print(e)

In [7]:
def getCategory(url='https://www.indonetwork.co.id/categories'):
    """
    Collect the category links listed on the category index page.

    Args:
        url: Address of the category index page. Defaults to the
            indonetwork.co.id category listing.

    Returns:
        A list with the `href` of the first anchor found inside every
        `<div class="sub1cat">`. The list is empty when the page could
        not be fetched or was not HTML.
    """
    page = simple_get(url)
    if page is None:
        # Nothing to parse; BeautifulSoup(None) would raise a TypeError.
        return []

    soup = BeautifulSoup(page, 'html.parser')
    category = []
    for block in soup.find_all('div', class_='sub1cat'):
        anchor = block.find('a', href=True)
        # Skip blocks without a link instead of failing on None['href'].
        if anchor is not None:
            category.append(anchor['href'])

    return category


In [8]:
def filterLinkProvince(category):
    """
    Return the per-province filter links offered on a category page.

    Args:
        category: URL of the category page.

    Returns:
        A list with the `href` of every anchor inside the first
        `<div class="filkat-sub">`, in document order. The list is empty
        when the page could not be fetched or has no such div.
    """
    categoryPage = simple_get(category)
    if categoryPage is None:
        return []

    soup = BeautifulSoup(categoryPage, 'html.parser')
    divProv = soup.find('div', class_='filkat-sub')
    if divProv is None:
        # No province filter on this page.
        return []

    return [a['href'] for a in divProv.find_all('a', href=True)]

In [9]:
def filterTextProvince(category):
    """
    Return the province names shown in a category page's province filter.

    Args:
        category: URL of the category page.

    Returns:
        A list with the text of every anchor (that has an `href`) inside
        the first `<div class="filkat-sub">`, in document order. The list
        is empty when the page could not be fetched or has no such div.
    """
    categoryPage = simple_get(category)
    if categoryPage is None:
        return []

    soup = BeautifulSoup(categoryPage, 'html.parser')
    divProv = soup.find('div', class_='filkat-sub')
    if divProv is None:
        # No province filter on this page.
        return []

    return [a.text for a in divProv.find_all('a', href=True)]

In [10]:
def getLinkCompany(province):
    """
    Collect company profile URLs from a province's company listing.

    Args:
        province: Province-filtered category URL; '/perusahaan' is
            appended to reach the company listing.

    Returns:
        A list of URLs, one per `<div class="list-item-company">` that
        contains an `<a class="link_product">` with an `href`. Each href
        is prefixed with 'https:' (presumably the site emits
        protocol-relative '//host/...' links; verify if the markup
        changes). The list is empty when the page could not be fetched.
    """
    perusahaanPage = simple_get(province + '/perusahaan')
    if perusahaanPage is None:
        return []

    soup = BeautifulSoup(perusahaanPage, 'html.parser')
    linkCompany = []
    for item in soup.find_all('div', class_='list-item-company'):
        # href=True skips anchors without a target instead of raising.
        productInfo = item.find('a', class_='link_product', href=True)
        if productInfo is not None:
            linkCompany.append('https:' + productInfo['href'])

    return linkCompany

In [11]:
def getLinkWhatsapp(url, driver_path='C:/Users/Swift3/Downloads/Scrapper/path/chromedriver'):
    """
    Open a company page in Chrome, click its WhatsApp button and return
    the WhatsApp link that the page then exposes.

    Args:
        url: Company profile URL.
        driver_path: Location of the chromedriver executable. Defaults to
            the path this notebook was written with.

    Returns:
        The `href` of the first `<a class="nobor">` found after the click,
        or None when the page has no 'wa-call' element, no such anchor,
        or the anchor has no `href`.
    """
    # NOTE(review): the positional driver path and find_element_by_* calls
    # belong to the older Selenium API - confirm the installed version.
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(url)
        try:
            driver.find_element_by_class_name('wa-call').click()
        except NoSuchElementException:
            return None

        # Give the page a moment to update after the click.
        time.sleep(1)
        page_source = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Chrome/chromedriver process running.
        driver.quit()

    anchor = BeautifulSoup(page_source, 'html.parser').find('a', class_='nobor')
    if anchor is None:
        return None

    return anchor.get('href')

In [12]:
def getPhoneWA(url):
    """
    Extract the phone number shown on a WhatsApp link page.

    Args:
        url: WhatsApp link as returned by `getLinkWhatsapp`, or None.

    Returns:
        The text of the first `<span>` matched by `class_=''`, or None
        when `url` is None, the page could not be fetched, or no such
        span exists.
    """
    if url is None:
        return None

    whatsapp = simple_get(url)
    if whatsapp is None:
        # Fetch failed or was not HTML; nothing to parse.
        return None

    span = BeautifulSoup(whatsapp, 'html.parser').find('span', class_='')
    return span.text if span is not None else None

In [13]:
# companyProfile = {
#     'Name':'',
#     'City':'',
#     'Province':'',
#     'Phone WA':''
# }

In [14]:
# Fetch every category URL from the site's category index. The bare name on
# the last line makes the notebook display the resulting list.
category = getCategory()
category

['https://www.indonetwork.co.id/perlengkapan-mesin-dan-aksesoris-lainnya',
 'https://www.indonetwork.co.id/alat-uji-ukur',
 'https://www.indonetwork.co.id/pengolahan-limbah-polusi',
 'https://www.indonetwork.co.id/alat-mesin-pertanian',
 'https://www.indonetwork.co.id/ledeng',
 'https://www.indonetwork.co.id/alat-penyaring',
 'https://www.indonetwork.co.id/pompa-perlengkapannya',
 'https://www.indonetwork.co.id/mesin-peralatan-farmasi',
 'https://www.indonetwork.co.id/apar',
 'https://www.indonetwork.co.id/alat-safety',
 'https://www.indonetwork.co.id/mesin-sistem-conveyor',
 'https://www.indonetwork.co.id/mesin-tekstil-pakaian',
 'https://www.indonetwork.co.id/mesin-peralatan-las',
 'https://www.indonetwork.co.id/mesin-cetak-perlengkapannya',
 'https://www.indonetwork.co.id/mesin-bending',
 'https://www.indonetwork.co.id/mesin-pengolahan-makanan-minuman',
 'https://www.indonetwork.co.id/mesin-cnc-mesin-bubut',
 'https://www.indonetwork.co.id/mesin-pengolahan-plastik',
 'https://www.in

In [15]:
# Work on a single category: index 10 is the 'mesin-sistem-conveyor' entry.
# Both helpers download and parse the same category page, so linkProv and
# textProv describe the same anchors in the same order.
linkProv = filterLinkProvince(category[10])
textProv = filterTextProvince(category[10])

# Display the province links in the notebook.
linkProv

['https://www.indonetwork.co.id/kalimantan-timur/mesin-sistem-conveyor',
 'https://www.indonetwork.co.id/banten/mesin-sistem-conveyor',
 'https://www.indonetwork.co.id/jawa-tengah/mesin-sistem-conveyor',
 'https://www.indonetwork.co.id/sulawesi-selatan/mesin-sistem-conveyor',
 'https://www.indonetwork.co.id/jawa-barat/mesin-sistem-conveyor',
 'https://www.indonetwork.co.id/jawa-timur/mesin-sistem-conveyor',
 'https://www.indonetwork.co.id/daerah-istimewa-yogyakarta/mesin-sistem-conveyor',
 'https://www.indonetwork.co.id/dki-jakarta/mesin-sistem-conveyor']

In [16]:
# Map each province name to the company profile URLs listed for it.
# textProv and linkProv are built from the same anchors, so they pair up
# by position.
linkCompany = {
    name: getLinkCompany(link) for name, link in zip(textProv, linkProv)
}

# Print every collected company URL, province by province.
for j in textProv:
    for k in linkCompany[j]:
        print(k)


https://www.indonetwork.co.id/company/pavamandiri
https://www.indonetwork.co.id/company/sentrabuanacipta
https://www.indonetwork.co.id/company/indorayaabadi
https://www.indonetwork.co.id/company/majujayaelectric
https://www.indonetwork.co.id/company/jualpanellistrik
https://www.indonetwork.co.id/company/ptgresindomasutama
https://www.indonetwork.co.id/company/denkowahanasakti
https://www.indonetwork.co.id/company/berliansakaperkasaengineering01
https://www.indonetwork.co.id/company/semarangteknik
https://www.indonetwork.co.id/company/sarana_semarang
https://www.indonetwork.co.id/company/multicontrol
https://www.indonetwork.co.id/company/jaya_putra_alfarisi
https://www.indonetwork.co.id/company/bintangmakmurwiramandiri
https://www.indonetwork.co.id/company/pt-sam
https://www.indonetwork.co.id/company/multiindojaya
https://www.indonetwork.co.id/company/mitrabarugemilang
https://www.indonetwork.co.id/company/mandiriniagamas
https://www.indonetwork.co.id/company/romoratama
https://www.indo

In [21]:
# Visit every company profile and print its name, city, province and
# WhatsApp number, followed by a blank separator line.
for j in textProv:
    for k in linkCompany[j]:
        company = simple_get(k)
        if company is None:
            # One unreachable profile must not abort the whole crawl.
            log_error('Skipping {0}: no HTML content'.format(k))
            continue

        soup = BeautifulSoup(company, 'html.parser')
        nameTag = soup.find('h1', class_='sc-company__title')
        cityTag = soup.find('span', class_='text-capitalize')
        # find() returns None for a missing element; print None for that
        # field rather than failing on `.text`.
        companyName = nameTag.text if nameTag is not None else None
        companyCity = cityTag.text if cityTag is not None else None
        linkWA = getLinkWhatsapp(k)
        phoneWA = getPhoneWA(linkWA)

        print(companyName)
        print(companyCity)
        print(j)
        print(phoneWA)
        print('')

PT. PAVA MANDIRI
TANGERANG 
Banten
+62 813-1778-9949

Sentra Buana Cipta.PT
Tangerang
Banten
+62 813-1035-9238

CV. Indo Raya Abadi 
Tangerang  Selatan
Banten
+62 812-9198-7382

MAJU JAYA ELECTRIC
Bogor
Banten
None

PT. Elshaday Power Control 
Tangerang 
Banten
+62 812-8197-0841

PT GRESINDO MAS UTAMA
Tangerang
Banten
085310025656

PT. DENKO WAHANA SAKTI
Semarang
Jawa Tengah
+62 812-3592-407

CV. Berlian Saka Perkasa Engineering
Semarang
Jawa Tengah
+62 822-2661-7956

TOKO SEMARANG TEKNIK
Semarang
Jawa Tengah
+62 815-2260-1312

PT.SARANA TEKNIK GROUP
Semarang
Jawa Tengah
+62 815-2260-1312

Multi Control Indonesia
Semarang
Jawa Tengah
6585225548633

JAYA PUTRA AL FARISI
tegal
Jawa Tengah
+62 856-8683-354

PT. Bintang Makmur Wiramandiri
Bekasi
Jawa Barat
+62 813-1758-9536

PT. SURYA AGRO MANDIRI
Bekasi
Jawa Barat
+62 812-8562-6610

PT.MULTI INDOJAYA MAKMUR
Bogor
Jawa Barat
+62 812-9107-035

Mitra Baru Gemilang
Bekasi
Jawa Barat
None

PT. Mandiri Niagamas Cemerlang
Bekasi
Jawa Barat
+62 8