In [2]:
import requests
import pandas as pd
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.common.exceptions import NoSuchElementException
import time

In [3]:
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET request and return the response body.

    Parameters:
        url: address to request.
        timeout: seconds passed to `requests.get`; without it a stalled
            server would block the scraper indefinitely.

    Returns:
        The raw response body (bytes, `resp.content`) when
        `is_good_response` accepts the response, otherwise None.
        Request failures (including timeouts) are reported through
        `log_error` and also yield None.
    """
    try:
        # closing() guarantees the streamed connection is released.
        with closing(requests.get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [4]:
def is_good_response(resp):
    """
    Return True if the response looks like a successful HTML page.

    Parameters:
        resp: response object exposing `status_code` and a mapping-like
            `headers` attribute.

    Returns:
        True when the status code is 200 and the Content-Type header
        contains 'html' (case-insensitive); False otherwise, including
        when the Content-Type header is missing.
    """
    # .get() avoids a KeyError for responses sent without a Content-Type.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()

In [5]:
def log_error(e):
    """
    Report an error message.

    The message is simply written to standard output; swap the body
    for real logging if something more durable is needed.
    """
    message = str(e)
    print(message)

In [7]:
def getCategory():
    """
    Collect the category links from the product catalog page.

    Returns:
        A pair `(category, text)` of parallel lists: `category` holds the
        `href` of every category anchor found, `text` the anchor's text.
        Both lists are empty when the catalog page could not be fetched.
    """
    category = []
    text = []

    page1 = simple_get('https://www.indotrading.com/productcatalog/')
    if page1 is None:
        # simple_get returns None for failed or non-HTML responses.
        return category, text

    soup1 = BeautifulSoup(page1, 'html.parser')
    findCategory = soup1.find_all('div', class_='idt-head-catalog cat-product')

    for block in findCategory:
        link = block.find('a', class_='span-bold clr-red line-clamp-2', href=True)
        if link is None:
            # Catalog block without the expected anchor: nothing to record.
            continue
        category.append(link['href'])
        text.append(link.text)

    return category, text


In [6]:
def filterLinkProvince(category, max_attempts=3):
    """
    Return the province filter links of a category's company listing.

    Parameters:
        category: category URL; '/perusahaan' is appended to reach the
            company listing page.
        max_attempts: how many times the page is requested before
            giving up.

    Returns:
        List of `href` values of the anchors inside the
        'filkat-sub' div, or an empty list if the page or the div could
        not be obtained within `max_attempts` requests.
    """
    listing_url = category + '/perusahaan'

    # Each attempt downloads the page again; re-parsing the same failed
    # download could never succeed and would loop forever.
    for _ in range(max_attempts):
        categoryPage = simple_get(listing_url)
        if categoryPage is None:
            continue
        soup2 = BeautifulSoup(categoryPage, 'html.parser')
        divProv = soup2.find('div', class_='filkat-sub')
        if divProv is not None:
            return [a['href'] for a in divProv.find_all('a', href=True)]

    return []

In [7]:
def filterTextProvince(category):
    """
    Return the province names of a category's company listing.

    Parameters:
        category: category URL; '/perusahaan' is appended to reach the
            company listing page.

    Returns:
        List with the text of every anchor (having an href) inside the
        'filkat-sub' div, or an empty list when the page could not be
        fetched or contains no such div.
    """
    categoryPage = simple_get(category + '/perusahaan')
    if categoryPage is None:
        return []

    soup2 = BeautifulSoup(categoryPage, 'html.parser')
    divProv = soup2.find('div', class_='filkat-sub')
    if divProv is None:
        return []

    return [a.text for a in divProv.find_all('a', href=True)]

In [8]:
def getLinkCompany(province, max_pages=9, attempts=3):
    """
    Collect company profile links from a province listing.

    Parameters:
        province: URL of the first listing page; following pages are
            requested as `province + '?page=N'`.
        max_pages: number of listing pages to visit (pages 1..max_pages).
        attempts: how many times each page is requested before it is
            skipped.

    Returns:
        List of company URLs, each built as 'https:' + the `href` of the
        'link_product' anchor of a 'list-item-company' entry.
    """
    linkCompany = []

    for page in range(1, max_pages + 1):
        url = province if page == 1 else province + '?page=' + str(page)

        listCompany = None
        for _ in range(attempts):
            try:
                perusahaanPage = simple_get(url)
                if perusahaanPage is None:
                    continue
                soup3 = BeautifulSoup(perusahaanPage, 'html.parser')
                listCompany = soup3.find_all('div', class_='list-item-company')
                break
            except Exception:
                # Best effort: any failure just triggers the next attempt.
                continue

        if listCompany is None:
            print('error')
            continue

        for item in listCompany:
            productInfo = item.find('a', class_='link_product')
            if productInfo is not None:
                linkCompany.append('https:' + productInfo['href'])

    return linkCompany

In [9]:
def getLinkWhatsapp(url, driver_path='C:/Users/Swift3/Downloads/Scrapper/path/chromedriver', wait_seconds=3):
    """
    Open `url` in Chrome, click the 'wa-call' element and collect the
    links that appear.

    Parameters:
        url: page to open.
        driver_path: location of the chromedriver executable.
        wait_seconds: pause after the click before the page source is
            read.

    Returns:
        List of `href` values of all 'nobor' anchors in the page source
        after the click, or None when the page has no 'wa-call' element.
    """
    driver = webdriver.Chrome(driver_path)
    try:
        driver.get(url)
        try:
            driver.find_element_by_class_name('wa-call').click()
        except NoSuchElementException:
            return None

        time.sleep(wait_seconds)
        page_source = driver.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # Chrome process running.
        driver.quit()

    anchors = BeautifulSoup(page_source, 'html.parser').find_all('a', class_='nobor')
    return [anchor['href'] for anchor in anchors]

In [10]:
def getPhoneWA(url, attempts=4):
    """
    Read the phone number shown on a WhatsApp link page.

    Parameters:
        url: address of the WhatsApp link page.
        attempts: how many times the page is requested before giving up.

    Returns:
        Text of the first <span> matched by `find('span', class_='')`,
        or None if no attempt produced one.
    """
    for _ in range(attempts):
        try:
            whatsapp = simple_get(url)
            if whatsapp is None:
                continue
            soup1 = BeautifulSoup(whatsapp, 'html.parser')
            span = soup1.find('span', class_='')
            if span is not None:
                return span.text
        except Exception:
            # Best effort: any failure just triggers the next attempt.
            continue

    return None

In [11]:
# Empty company record template: every field starts as an empty string.
companyProfile = dict.fromkeys(('Name', 'City', 'Province', 'Phone WA'), '')

In [42]:
# getCategory() returns two parallel lists: category[0] holds the category
# URLs and category[1] the matching anchor texts.
category = getCategory()

In [44]:
# For every category URL, record the href of the first 'idt-elipsis' anchor
# on that category's page.
# NOTE: categories whose page cannot be fetched or that lack the anchor are
# skipped, so listLink may be shorter than category[0].
listLink = []
for item in category[0]:
    content = simple_get(item)
    if content is None:
        # simple_get returns None for failed or non-HTML responses.
        log_error('No content for category page ' + str(item))
        continue
    soup = BeautifulSoup(content, 'html.parser')
    list_company = soup.find('a', class_='idt-elipsis')
    if list_company is None:
        log_error('No company list link on ' + str(item))
        continue
    listLink.append(list_company['href'])

https://www.indotrading.com/company_jual-ac-industri-dan-perangkatnya_7705/
https://www.indotrading.com/company_jual-adaptor-saklar-dan-socket_7718/
https://www.indotrading.com/company_jual-aksesoris-genset_15308/
https://www.indotrading.com/company_aksesoriskabel_150/
https://www.indotrading.com/company_peralatanperlengkapanlistrik_867/
https://www.indotrading.com/company_jual-alat-pemanas_19769/
https://www.indotrading.com/company_alat-pengendali-industri_12717/
https://www.indotrading.com/company_jual-tenaga-surya-dan-energi-terbarukan_7692/
https://www.indotrading.com/company_baterai_132/
https://www.indotrading.com/company_produkcctv_667/
https://www.indotrading.com/company_pemutusrangkaian_138/
https://www.indotrading.com/company_jual-dinamo-motor_19754/
https://www.indotrading.com/company_jual-led-display-dan-sign-board_7721/
https://www.indotrading.com/company_home-appliances_4881/
https://www.indotrading.com/company_genset_141/
https://www.indotrading.com/company_gpsdannavigas

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://www.indotrading.com/company_jual-mesin-las_737/
https://www.indotrading.com/company_mesinmakanan_580/
https://www.indotrading.com/company_mesinmaterial_698/
https://www.indotrading.com/company_mesinminuman_666/
https://www.indotrading.com/company_jual-pengaduk-homogenizer_7742/
https://www.indotrading.com/company_jual-mesin-moulding_7743/
https://www.indotrading.com/company_mesinpackaging_581/
https://www.indotrading.com/company_mesinpembersihlantai_1267/
https://www.indotrading.com/company_jual-mesin-pembuat-es_3462/
https://www.indotrading.com/company_mesinpemotong_2824/
https://www.indotrading.com/company_jual-mesin-penanganan-polusi_7983/
https://www.indotrading.com/company_mesinpengolahkacangbiji_1235/
https://www.indotrading.com/company_jual-mesin-produksi-karet-ban_7764/
https://www.indotrading.com/company_jual-mesin-tambang-ekstraksi_7759/
https://www.indotrading.com/company_mesinkaretplastik_604/
https://www.indotrading.com/company_jual-mesin-pengupas_13954/
https://ww

https://www.indotrading.com/company_produkdanperalatanbayi_778/
https://www.indotrading.com/company_alatdapur_411/
https://www.indotrading.com/company_alatlaundry_420/
https://www.indotrading.com/company_jual-bahan-laundry_18066/
https://www.indotrading.com/company_dekorasi-rumah_4853/
https://www.indotrading.com/company_taman_776/
https://www.indotrading.com/company_kompor_772/
https://www.indotrading.com/company_pembersihruangan_409/
https://www.indotrading.com/company_jual-peralatan-kolam-renang_5796/
https://www.indotrading.com/company_peralatanmakan_4243/
https://www.indotrading.com/company_peralatan-rumah-tangga_4880/
https://www.indotrading.com/company_jual-perlengkapan-tidur-bayi_4144/
https://www.indotrading.com/company_produkhewanpeliharaan_849/
https://www.indotrading.com/company_tangga_4887/
https://www.indotrading.com/company_alatkecantikan_431/
https://www.indotrading.com/company_kertastissue_1156/
https://www.indotrading.com/company_jual-minyak-astiri-dan-aromatik_7926/


In [15]:
# Province filter links and names for the category URL at index 1.
# Both helpers request the same '<category>/perusahaan' page, so this
# downloads it twice.
linkProv = filterLinkProvince(category[0][1])
textProv = filterTextProvince(category[0][1])

# Show the province names.
textProv

['Kepulauan Riau',
 'Kalimantan Timur',
 'Sumatera Barat',
 'Banten',
 'Bali',
 'Sumatera Utara',
 'Jawa Tengah',
 'Sulawesi Utara',
 'Aceh',
 'Sulawesi Selatan',
 'Jawa Barat',
 'Jawa Timur',
 'Riau',
 'Kalimantan Barat',
 'Nusa Tenggara Barat',
 'Daerah Istimewa Yogyakarta',
 'Sumatera Selatan',
 'Jambi',
 'Lampung',
 'DKI Jakarta',
 'Kalimantan Selatan']

In [16]:
# Show the province filter links returned by filterLinkProvince.
linkProv

['https://www.indonetwork.co.id/kepulauan-riau/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/kalimantan-timur/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/sumatera-barat/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/banten/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/bali/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/sumatera-utara/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/jawa-tengah/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/sulawesi-utara/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/aceh/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/sulawesi-selatan/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/jawa-barat/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/jawa-timur/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/riau/alat-uji-ukur/perusahaan',
 'https://www.indonetwork.co.id/kalimantan-barat/alat-uji-ukur/perusahaan',
 'https

In [None]:
# Save the category names (category[1]) as a one-column CSV file.
df = pd.DataFrame(data=category[1])
df.to_csv(path_or_buf='list_category.csv', encoding="utf-8", index=False)

In [19]:
# NOTE(review): `list_company` is not defined in this cell; it is the loop
# variable left over from the cell that builds `listLink`, so the value
# printed here depends on which cells ran before this one.
print(list_company['href'])

https://www.indotrading.com/company_jual-ac-industri-dan-perangkatnya_7705/


In [25]:

# Fetch the first category page and pick out its city filter menu.
content2 = simple_get(category[0][0])
soup2 = BeautifulSoup(content2, 'html.parser')
# Query the page parsed in this cell (soup2), not a `soup` object left
# behind by another cell.
filter_city = soup2.find('div', class_='theme_menu cats a')

In [26]:
# Show the city filter menu element found above.
filter_city

<div class="theme_menu cats a"> <div><a href="/jual-ac-industri-dan-perangkatnya/">Semua Kota</a></div> <div><a href="/jakarta/jual-ac-industri-dan-perangkatnya/">Jakarta</a></div> <div><a href="/tangerang/jual-ac-industri-dan-perangkatnya/">Tangerang</a></div> <div><a href="/medan/jual-ac-industri-dan-perangkatnya/">Medan</a></div> <div><a href="/surabaya/jual-ac-industri-dan-perangkatnya/">Surabaya</a></div> </div>

In [40]:
# Print the URL of the first category.
print(category[0][0])

https://www.indotrading.com/jual-ac-industri-dan-perangkatnya/


In [58]:
# Collect company links and address texts from the first company-list page.
# Entries are paired by position: the n-th title anchor with the n-th
# 'd-flex a-center' paragraph.
companyAddress = []
companyList = []

conte = simple_get(listLink[0])
soup4 = BeautifulSoup(conte, 'html.parser')
tempList = soup4.find_all('a', class_='span-bold fs-18 product_title pr-10')
addr = soup4.find_all('p', class_='d-flex a-center')

for i, anchor in enumerate(tempList):
    companyList.append(anchor['href'])
    companyAddress.append(addr[i].text)
    

In [18]:
# Walk the listing pages of one province (index 5 of linkProv) and print
# details of every free-member company whose entry is found on a page that
# exposes a 'mask-phone-button'.
url = linkProv[5]

for page in range(1, 200):
    # Page 1 is the bare province URL; later pages use the ?page= query.
    newUrl = url if page == 1 else url + '?page=' + str(page)
    pageContent = simple_get(newUrl)

    if pageContent is None:
        # simple_get returns None for failed or non-HTML responses.
        continue

    try:
        soup = BeautifulSoup(pageContent, 'html.parser')
    except Exception:
        continue

    listItem = soup.find_all('div', class_='product-info')
    for company in listItem:

        # Entries without a 'freemember' marker are skipped.
        if company.find('div', class_='freemember') is None:
            continue
        membershipLvl = 'Free'

        driver = webdriver.Chrome('C:/Users/Swift3/Downloads/Scrapper/path/chromedriver')
        try:
            driver.get(newUrl)
            driver.find_element_by_class_name('mask-phone-button').click()
        except NoSuchElementException:
            continue
        finally:
            # One browser is started per company; close it every time so
            # Chrome processes do not pile up.
            driver.quit()

        companyName = company.find('h3').text
        companyDesc = company.find('div', class_='desc').text
        temp = company.find('div', class_='seller-name')
        companyAddr = temp.find_all('div')
        companyCity = company.find('div', class_='seller-info')

        print(companyName)
        print(membershipLvl)
        print(companyDesc)
        for i in companyAddr:
            print(i.text)
        print(companyCity.find('small').text)
    

Error during requests to https://www.indonetwork.co.id/sumatera-utara/alat-uji-ukur/perusahaan?page=47 : ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


In [61]:
# Number of company links collected in companyList.
print(len(companyList))

12


In [None]:
# Show the category name stored at index 76.
category[1][76]

In [None]:
def _pad_phones(phones, size=3):
    """Return exactly `size` phone entries: extras are dropped, gaps are filled with 'None'."""
    return (list(phones) + ['None'] * size)[:size]


def _fetch_company_soup(url, attempts=3, delay=1):
    """Download and parse `url`, requesting it up to `attempts` times; None if every try fails."""
    for attempt in range(attempts):
        if attempt:
            time.sleep(delay)
        content = simple_get(url)
        if content is not None:
            return BeautifulSoup(content, 'html.parser')
    return None


# Scrape categories z+1 .. z+9: for each one, visit every province listing,
# open every company profile, and write one CSV per category.
z = 0

for i in range(1, 10):
    linkProv = filterLinkProvince(category[0][z + i])
    textProv = filterTextProvince(category[0][z + i])

    # Province name -> company profile URLs. zip() pairs names with links
    # and stops at the shorter list instead of failing on a mismatch.
    linkCompany = {}
    for provName, provLink in zip(textProv, linkProv):
        linkCompany[provName] = getLinkCompany(provLink)

    for links in linkCompany.values():
        for l in links:
            print(l)

    data_company = []
    phone_wa = []

    for j, links in linkCompany.items():
        for k in links:
            soup = _fetch_company_soup(k)
            if soup is None:
                print('error')
                continue

            try:
                companyName = soup.find('h1', class_='sc-company__title').text
                membershipLvl = soup.find('span', class_='sc-company__lb').text
                companyDesc = soup.find('div', class_='rc-company__description').text
                companyAddr = soup.find('address').text
                companyCity = soup.find('span', class_='text-capitalize').text
            except AttributeError:
                # find() returned None for an expected element; skip this
                # profile rather than abort the whole run.
                print('error')
                continue

            companyCtgry = category[1][z + i]
            companyProv = j

            linkWA = getLinkWhatsapp(k)
            time.sleep(4)
            if linkWA is not None:
                phones = [getPhoneWA(wa) for wa in linkWA]
            else:
                phones = []
            # Always keep exactly three phone slots so every row gets the
            # same telp1..telp3 columns.
            phone_temp = _pad_phones(phones, 3)

            record = {
                'Name': companyName,
                'Membership': membershipLvl,
                'Category': companyCtgry,
                'Description': companyDesc,
                'Address': companyAddr,
                'City': companyCity,
                'Province': companyProv
            }
            for number, phone in enumerate(phone_temp, start=1):
                record['telp' + str(number)] = phone

            data_company.append(record)
            phone_wa.append(phone_temp)

            print(companyName)
            print(companyCity)
            print(companyProv)
            print(phone_temp)
            print('')

    tempData = data_company

    df = pd.DataFrame(tempData)
    df.to_csv('data_company' + str(z + i + 1) + '.csv', index=False, encoding="utf-8")
    time.sleep(1)

In [36]:
# Probe a single company page: pull name, address, profile text and the
# WhatsApp link, then print them one per line.
# NOTE(review): companyProfile and companyAddress are also assigned in other
# cells with different kinds of values; this cell rebinds them to strings.
content = simple_get('https://www.indotrading.com/mahkotamasmandirimulia')
html = BeautifulSoup(content, 'html.parser')

linkWA = html.find('a', class_='hidden realwa mr-0 pr-12')
companyProfile = html.find('div', class_='pb-4').find('div').text
companyAddress = html.find('div', class_='mt-10').text
companyName = html.find('a', class_='mb-5 text--black text--uppercase').text

print(companyName, companyAddress, companyProfile, sep='\n')
print(linkWA['href'])

 PT. Mahkota Mas Mandiri Mulia
Green Ville BG/29 Duri KepaJakarta Barat DKI Jakarta
  Profil Perusahaan  PT. Mahkota Mas Mandiri Mulia berdiri pada tahun 1999. 
Kami menjual mesin plastik dan mesin mesin supporting untuk mesin plastik seperti :1. Chiller : Water Cooled Screw Type, Water Cooled Type, Air Cooled Screw Type, Air Cooled Type                                                   2. Crusher                                                                              3. Auto Loader   4. MTC              5. Mixer : Vertical Mixer, Horizontal Mixer, High Speed Drying Mixer 6. Hopper Dryer   7. Rack mould and many more
                                 Selain mesin mesin supporting kami juga menyidiakan spare parts seperti :1. Screen Filter 2. Magnet Separator3. Piala 4. Tang heater5. Kontaktor, dll 

Kami melayani pengiriman ke seluruh daerah jabodetabek dan seluruh Indonesia. Hubungi kami untuk info lebih lanjut. 
https://api.whatsapp.com/send?phone=628111931898&text=Halo,%20Kami

In [None]:
# Write the records currently held in tempData to a fixed-name CSV file.
df = pd.DataFrame(data=tempData)
df.to_csv(path_or_buf='data_company25.csv', encoding="utf-8", index=False)