In [1]:
import requests
import pandas as pd
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.common.exceptions import NoSuchElementException
import time

In [2]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Parameters
    ----------
    url : str
        Address to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; passed straight to
        `requests.get`. Defaults to 30.

    Returns
    -------
    bytes or None
        The raw response body (`resp.content`) when `is_good_response`
        accepts the response, otherwise None. None is also returned, after
        the error has been logged, when the request itself fails.
    """
    try:
        # Without a timeout requests waits indefinitely on an unresponsive
        # server, which would stall the whole scraping run.
        with closing(requests.get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [3]:
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (compared case-insensitively). A response that
    carries no Content-Type header is rejected instead of raising KeyError.
    """
    # .get() keeps the "header missing" case reachable; indexing with
    # ['Content-Type'] raises before any None check can run.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()

In [4]:
def log_error(e):
    """
    Report an error.

    The value is only written to standard output with `print`; replace the
    body to send it somewhere else.
    """
    print(e)

In [5]:
def getCategory():
    """
    Collect the sub-category links listed on the indonetwork categories page.

    Returns
    -------
    tuple of (list, list)
        `(category, text)`: the `href` of the first anchor inside every
        `div.sub1cat` element and, at the same index, that anchor's text.
        Both lists are empty when the page could not be downloaded.
    """
    category = []
    text = []

    page1 = simple_get('https://www.indonetwork.co.id/categories')
    if page1 is None:
        # simple_get returns None on failure; handing None to BeautifulSoup
        # would raise an unhelpful TypeError.
        log_error('Could not download the categories page')
        return category, text

    soup1 = BeautifulSoup(page1, 'html.parser')
    for block in soup1.find_all('div', class_='sub1cat'):
        anchor = block.find('a', href=True)
        if anchor is None:
            # A block without a link has nothing to record.
            continue
        category.append(anchor['href'])
        text.append(anchor.text)

    return category, text


In [6]:
def filterLinkProvince(category, max_attempts=5):
    """
    Collect the per-province listing links for a category.

    Downloads `category + '/perusahaan'` and reads every anchor that has an
    `href` inside the first `div.filkat-sub` element.

    Parameters
    ----------
    category : str
        Category URL; '/perusahaan' is appended to it.
    max_attempts : int, optional
        How many times to download and parse the page before giving up.

    Returns
    -------
    list of str
        The `href` of each province anchor; empty when every attempt failed.
    """
    url = category + '/perusahaan'

    for _ in range(max_attempts):
        # Download inside the loop: re-parsing one failed download can never
        # succeed, so retrying without fetching again would spin forever.
        categoryPage = simple_get(url)
        if categoryPage is None:
            continue
        divProv = BeautifulSoup(categoryPage, 'html.parser').find('div', class_='filkat-sub')
        if divProv is None:
            continue
        return [a['href'] for a in divProv.find_all('a', href=True)]

    log_error('No province filter found at {0}'.format(url))
    return []

In [7]:
def filterTextProvince(category):
    """
    Return the text of every province link on the category's '/perusahaan'
    page, i.e. each anchor with an `href` inside the first `div.filkat-sub`.
    """
    listingHtml = simple_get(category + '/perusahaan')
    provinceBox = BeautifulSoup(listingHtml, 'html.parser').find('div', class_='filkat-sub')
    return [anchor.text for anchor in provinceBox.find_all('a', href=True)]

In [8]:
def getLinkCompany(province, max_page=9, attempts=2):
    """
    Collect company profile links from a province listing.

    Parameters
    ----------
    province : str
        URL of the first listing page; later pages are requested as
        `province + '?page=N'`.
    max_page : int, optional
        Last page number to visit (pages 1..max_page). Defaults to 9.
    attempts : int, optional
        Download attempts per page before that page is skipped.

    Returns
    -------
    list of str
        `'https:' + href` of the `a.link_product` anchor in every
        `div.list-item-company` element that contains one.
    """
    linkCompany = []

    for page in range(1, max_page + 1):
        url = province if page == 1 else province + '?page=' + str(page)

        # Start every page from an empty result so a failed download can
        # neither raise NameError nor re-append the previous page's companies.
        listCompany = []
        for _ in range(attempts):
            perusahaanPage = simple_get(url)
            if perusahaanPage is not None:
                soup3 = BeautifulSoup(perusahaanPage, 'html.parser')
                listCompany = soup3.find_all('div', class_='list-item-company')
                break
        else:
            # No attempt produced a page.
            print('error')

        for item in listCompany:
            productInfo = item.find('a', class_='link_product')
            if productInfo is not None:
                linkCompany.append('https:' + productInfo['href'])

    return linkCompany

In [9]:
def getLinkWhatsapp(url, driver_path='C:/Users/Swift3/Downloads/Scrapper/path/chromedriver'):
    """
    Open a company page in Chrome, click its WhatsApp button and collect the
    links that appear.

    Parameters
    ----------
    url : str
        Company page to open.
    driver_path : str, optional
        Location of the chromedriver executable. The default is the path the
        notebook was written with; pass another one on a different machine.

    Returns
    -------
    list of str or None
        The `href` of every `a.nobor` anchor found in the page source after
        the click, or None when the page has no element with class
        'wa-call'.
    """
    linkWa = []

    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(url)
        try:
            # NOTE(review): find_element_by_class_name is the Selenium 3 API;
            # confirm the installed Selenium version still provides it.
            driver.find_element_by_class_name('wa-call').click()
        except NoSuchElementException:
            return None

        # Give the page a moment to render the links after the click.
        time.sleep(1)
        page_source = driver.page_source
    finally:
        # Always shut the browser down; this function is called once per
        # company and would otherwise leave a Chrome process behind each time.
        driver.quit()

    for anchor in BeautifulSoup(page_source, 'html.parser').find_all('a', class_='nobor'):
        linkWa.append(anchor['href'])

    return linkWa

In [10]:
def getPhoneWA(url):
    """
    Read the phone number shown on a WhatsApp link page.

    The page is downloaded and parsed up to three times. The text of the
    first `span` matched with `class_=''` is returned from the first attempt
    that does not raise; None is returned when all three attempts raise.
    """
    for _ in range(3):
        try:
            waPage = simple_get(url)
            parsed = BeautifulSoup(waPage, 'html.parser')
            return parsed.find('span', class_='').text
        except Exception:
            continue
    return None

In [11]:
# Dict with the keys 'Name', 'City', 'Province' and 'Phone WA', each mapped
# to an empty string.
companyProfile = dict.fromkeys(('Name', 'City', 'Province', 'Phone WA'), '')

In [12]:
# getCategory() returns a (links, names) pair; element [1] is the list of
# category names, shown here as the cell output.
category = getCategory()
category[1]

['Perlengkapan Mesin Industri',
 'Alat Uji & Ukur',
 'Pengolahan Limbah & Polusi',
 'Mesin Pertanian',
 'Valve',
 'Alat Penyaring',
 'Pompa & Perlengkapannya',
 'Mesin & Peralatan Farmasi',
 'Alat Pemadam Kebakaran',
 'Alat Safety',
 'Mesin & Sistem Conveyor',
 'Mesin Tekstil & Pakaian',
 'Mesin & Peralatan Las',
 'Mesin Cetak & Perlengkapannya',
 'Mesin Bending',
 'Mesin Pengolahan Makanan & Minuman',
 'Mesin CNC & Mesin Bubut',
 'Mesin Pengolahan Plastik',
 'Material Handling',
 'Mesin Konstruksi & Bangunan',
 'Strapping & Sealer',
 'Peralatan Pengeboran',
 'Power Tools',
 'Pengering & Evaporators',
 'Kontainer',
 'Mesin Pengolah Karet',
 'Alat Survey',
 'Magnetic',
 'Valves & Fitting',
 'Mesin Bekas & Service',
 'Alat Penggilingan',
 'Elektronik Rumah Tangga',
 'Peralatan Komponen Listrik',
 'Lampu & Perlengkapannya',
 'Motor Listrik & Komponen',
 'Komponen Elektronik',
 'Perlengkapan AC, Pendingin & Sparepart',
 'CCTV & Sistem Pengamanan',
 'Kabel',
 'Tenaga Surya & Pembaharuan Ene

In [19]:
# Index 12 selects one category; in the names displayed by the category cell
# it is 'Mesin & Peralatan Las'.
# Both helpers download the category's '/perusahaan' page: the first returns
# the province listing URLs, the second the province names.
linkProv = filterLinkProvince(category[0][12])
textProv = filterTextProvince(category[0][12])

textProv

['Kepulauan Riau',
 'Kalimantan Timur',
 'Sumatera Barat',
 'Banten',
 'Sumatera Utara',
 'Jawa Tengah',
 'Papua',
 'Sulawesi Selatan',
 'Jawa Barat',
 'Jawa Timur',
 'Riau',
 'Daerah Istimewa Yogyakarta',
 'Sumatera Selatan',
 'DKI Jakarta',
 'Kalimantan Selatan']

In [20]:
# Display the province listing URLs returned by filterLinkProvince.
linkProv

['https://www.indonetwork.co.id/kepulauan-riau/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/kalimantan-timur/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/sumatera-barat/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/banten/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/sumatera-utara/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/jawa-tengah/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/papua/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/sulawesi-selatan/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/jawa-barat/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/jawa-timur/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/riau/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/daerah-istimewa-yogyakarta/mesin-peralatan-las/perusahaan',
 'https://www.indonetwork.co.id/sumatera-selatan/mesin-peralat

In [None]:
# Map every province name to the company links found on its listing pages.
linkCompany = {}
for position, provName in enumerate(textProv):
    linkCompany[provName] = getLinkCompany(linkProv[position])

# Print every collected company link, province by province.
for provName in textProv:
    for companyLink in linkCompany[provName]:
        print(companyLink)


In [16]:
def _fit_phone_slots(phones, slots=3):
    """Return exactly `slots` entries: extras are dropped, gaps are filled with 'None'."""
    fitted = list(phones[:slots])
    fitted.extend(['None'] * (slots - len(fitted)))
    return fitted


# Scrape every company page collected in linkCompany.
# data_company[i] holds the profile fields of the i-th scraped company and
# phone_wa[i] its three WhatsApp slots; the two lists stay index-aligned.
data_company = []
phone_wa = []

for j in textProv:
    for k in linkCompany[j]:
        # Download again on failure: re-parsing one failed download can never
        # give a different result, so each retry has to fetch the page anew.
        company = None
        for _ in range(3):
            company = simple_get(k)
            if company is not None:
                break
        if company is None:
            print('error')
            continue
        soup = BeautifulSoup(company, 'html.parser')

        try:
            companyName = soup.find('h1', class_='sc-company__title').text
            membershipLvl = soup.find('span', class_='sc-company__lb').text
            companyDesc = soup.find('div', class_='rc-company__description').text
            companyAddr = soup.find('address').text
            companyCity = soup.find('span', class_='text-capitalize').text
        except AttributeError:
            # soup.find() returned None for an expected element; skip this
            # company instead of aborting the whole run.
            print('error: unexpected page layout at ' + k)
            continue
        companyCtgry = category[1][12]
        companyProv = j

        linkWA = getLinkWhatsapp(k)
        if linkWA is not None:
            phone_temp = [getPhoneWA(wa) for wa in linkWA]
        else:
            phone_temp = []
        # Always keep exactly three slots, whatever number of links was
        # found (including zero or more than four).
        phone_temp = _fit_phone_slots(phone_temp)

        data_company.append({
            'Name':companyName,
            'Membership':membershipLvl,
            'Category':companyCtgry,
            'Description':companyDesc,
            'Address':companyAddr,
            'City':companyCity,
            'Province':companyProv
        })

        phone_wa.append(phone_temp)

        print(companyName)
        print(companyCity)
        print(companyProv)
        print(phone_temp)
        print('')

PT. Hanja Prima Loka
Tangerang
Banten
['08558800556', 'None', 'None']

SURVINDO . CV
TANGERANG
Banten
['+62 896-1574-0924', '+62 812-8855-0107', 'None']

JAYA PUTRA AL FARISI
tegal
Jawa Tengah
['+62 856-8683-354', 'None', 'None']

Multi Control Indonesia
Semarang
Jawa Tengah
['6585225548633', 'None', 'None']

PT. SURYA AGRO MANDIRI
Bekasi
Jawa Barat
['+62 812-8562-6610', '+62 812-9714-0113', 'None']

PD RAVIBORDIR Indramayu 
INDRAMAYU - JAWABARAT INDONESIA. 
Jawa Barat
['+62 813-2473-1399', 'None', 'None']

CV. SAMUDERA TEKNIKINDO SELARAS
Cikarang
Jawa Barat
['+62 812-1941-5790', 'None', 'None']

Selatan Jaya Plastik Surabaya
surabaya
Jawa Timur
['+62 813-3135-1580', 'None', 'None']

CAHAYA GARMENT and EMBROIDERY
surabaya
Jawa Timur
['+62 819-3842-5969', 'None', 'None']

UD Dwi Tunggal Perkasa
tulungagung
Jawa Timur
['+81 3-3587-7045', 'None', 'None']

TEKNIK MANDIRI
Jakarta Barat
DKI Jakarta
['+62 812-1214-3333', 'None', 'None']

pt.inti karya karisma
Jakarta
DKI Jakarta
['+62 812-867

In [17]:
# Build the export rows as copies of data_company with the phone slots added
# as 'telp1', 'telp2', ... columns. Copying matters: if tempData were only an
# alias, inserting a row into it would shift data_company out of step with
# phone_wa, and re-running this cell would attach phones to the wrong rows.
tempData = [dict(row) for row in data_company]
for row, phones in zip(tempData, phone_wa):
    for number, phone in enumerate(phones, start=1):
        row['telp' + str(number)] = phone


In [None]:
# Scrape one specific company page by hand, using the same selectors as the
# bulk scraping loop. The province is hard-coded instead of coming from a
# province listing.
# NOTE(review): presumably this company was skipped during the bulk run —
# confirm.
company = simple_get('https://www.indonetwork.co.id/company/cv_herry_jaya_utama')
soup = BeautifulSoup(company, 'html.parser')
companyName = soup.find('h1', class_='sc-company__title').text
membershipLvl = soup.find('span', class_='sc-company__lb').text
companyCtgry = category[1][12]
companyDesc = soup.find('div', class_='rc-company__description').text
companyAddr = soup.find('address').text
companyCity = soup.find('span', class_='text-capitalize').text
companyProv = 'Jawa Barat'


In [None]:
# Insert the manually scraped company at position 17 of the export rows,
# with its phone columns filled in by hand.
# NOTE(review): not idempotent — every re-run of this cell inserts another
# copy of the row; the position and the phone number are hard-coded.
tempData.insert(17, {
    'Name':companyName,
    'Membership':membershipLvl,
    'Category':companyCtgry,
    'Description':companyDesc,
    'Address':companyAddr,
    'City':companyCity,
    'Province':companyProv,
    'telp1':'+62 812-1226-5508',
    'telp2':'None',
    'telp3':'None'
})

In [18]:
# Turn the list of row dicts into a DataFrame and write it to a UTF-8 CSV
# file without the index column.
df = pd.DataFrame(tempData)

df.to_csv('data_company13.csv', index=False, encoding="utf-8")