# DENT SCRAPER

This Jupyter notebook downloads, pre-processes and saves data about all dentists in the Czech Republic from the website of the Czech Chamber of Stomatology.

## Import of packages

In [35]:
import os
# Report the kernel's working directory; relative paths used later in the
# notebook (e.g. when saving the CSV) are resolved against it.
print("Current Working Directory  {}".format(os.getcwd()))

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


Current Working Directory  C:\Users\jhabetinek\Desktop\Škola\Python\Project_work


In [36]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm
import time

## Getting info about dentists

First, we create the class `Dentist_Downloader`, which will help us download the data efficiently.

In [37]:
class Dentist_Downloader:
    '''
    Download manager for scraping information about dentists from
    https://www.dent.cz/zubni-lekari/

    It contains methods for collecting links, downloading and storing data.

    Typical use: getSubPages(start_url), then getDentists(self.links['pages']),
    then getData(self.links['dentists']).
    '''

    # Site root that is prepended to the relative hrefs found on the pages.
    BASE_URL = 'https://www.dent.cz'

    # Seconds to wait for the server before a request is abandoned; without a
    # timeout a single stalled connection would block the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, allowLog=True):
        '''
        Create the downloader.

        allowLog -- True or False, whether status messages are printed

        Attributes created:
        links  -- dict with the collected 'pages' and 'dentists' URLs
        data   -- dict of lists holding the scraped values
        failed -- list of dentist URLs that could not be scraped
        '''
        self.allowLog = allowLog
        self.links = {
            'pages': [],
            'dentists': []
        }
        self.data = {
            'name': [],
            'wkp_name': [],
            'street': [],
            'pst_code': [],
            'town': [],
            'phone': []
        }
        self.failed = []

        if self.allowLog:
            print('Succesfully initialized Dentist Downloader')

    def _fetchSoup(self, url):
        '''
        Download a single page and return it parsed as a BeautifulSoup tree.

        The response is decoded as UTF-8 and parsed with the lxml parser.
        Raises requests exceptions (including Timeout) on network problems.
        '''
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.encoding = 'UTF-8'
        return bs(response.text, 'lxml')

    def getSubPages(self, link):
        '''
        Collect links to all sub-pages that together contain the full list of dentists.

        link -- starting URL

        The links to the sub-pages are stored in self.links['pages'].
        '''
        if self.allowLog:
            print('Searching for sub-pages on {} ...'.format(link))

        soup = self._fetchSoup(link)
        buttons = soup.find_all('a', attrs={'class': 'btn'})

        # The first 'btn' anchor is dropped, exactly as before; presumably it
        # is not a sub-page link -- verify against the live page if it changes.
        self.links['pages'] = [self.BASE_URL + a['href'] for a in buttons][1:]

        if self.allowLog:
            print('Found {} sub-pages'.format(len(self.links['pages'])))

    def getDentists(self, link):
        '''
        Collect links to all individual dentists.

        link -- list of links to sub-pages

        The links to individual dentists are stored in self.links['dentists'].
        '''
        if self.allowLog:
            print('Searching for dentists on {} ...'.format(link))

        self.links['dentists'] = self.getLinks(link)

        if self.allowLog:
            print('Found {} dentists'.format(len(self.links['dentists'])))

    def getData(self, link):
        '''
        Collect data about dentists.

        link -- list of links to individual dentists

        The data is stored in self.data as a dictionary of lists; URLs that
        could not be scraped are available in self.failed afterwards.
        '''
        if self.allowLog:
            print('Searching for data about dentists on provided pages ...')

        self.data = self.scrapeChar(link)

        if self.allowLog:
            print('Successfully downloaded data about {} dentists'.format(len(self.data['name'])))
            if self.failed:
                print('Skipped {} pages that could not be scraped (see self.failed)'.format(len(self.failed)))

    def getLinks(self, links):
        '''
        Collect links to individual dentists; used by getDentists.

        links -- iterable of sub-page URLs

        Every anchor inside a <ul class="list-unstyled text-col-3"> element is
        taken and its href is prefixed with BASE_URL.

        Returns a single flat list with the links to all dentists, in page order.
        '''
        HCP_links = []

        for page in tqdm(links):
            soup = self._fetchSoup(page)
            for ul in soup.find_all('ul', attrs={'class': 'list-unstyled text-col-3'}):
                HCP_links.extend(self.BASE_URL + a['href'] for a in ul.find_all('a'))

        return HCP_links

    def scrapeChar(self, link, pause=0.1):
        '''
        Collect all data about dentists and their workplaces in one pass.

        link  -- iterable of URLs of individual dentists
        pause -- seconds to sleep after each page, to throttle the requests

        A page that cannot be downloaded or parsed is skipped (best effort),
        but it is recorded in self.failed and reported instead of being
        dropped silently. KeyboardInterrupt is not swallowed, so a long run
        can still be stopped.

        Returns a single dictionary of equally long lists with the keys
        'name', 'wkp_name', 'street', 'pst_code', 'town' and 'phone'; every
        value except 'name' is itself a list with one entry per workplace.
        '''
        stomatologists = {
            'name': [],
            'wkp_name': [],
            'street': [],
            'pst_code': [],
            'town': [],
            'phone': []
        }
        self.failed = []

        for page in tqdm(link):
            try:
                soup = self._fetchSoup(page)

                h_1 = soup.find('h1')
                name = h_1.find('span', attrs={'itemprop': 'name'}).text

                subset = soup.find_all('address', attrs={'class': 'text--condensed-vertical'})
                record = {
                    'name': name,
                    'wkp_name': self.parseSpan('name', subset),
                    'street': self.parseSpan('streetAddress', subset),
                    'pst_code': self.parseSpan('postalCode', subset),
                    'town': self.parseSpan('addressLocality', subset),
                    'phone': self.parseSpan('telephone', subset)
                }
            except Exception as err:
                self.failed.append(page)
                if self.allowLog:
                    # tqdm.write keeps the progress bar intact
                    tqdm.write('Skipping {}: {!r}'.format(page, err))
            else:
                # Append only complete records so all lists stay aligned.
                for key, value in record.items():
                    stomatologists[key].append(value)
            finally:
                time.sleep(pause)

        return stomatologists

    def parseSpan(self, at, info):
        '''
        Extract one property from every workplace block; used by scrapeChar.

        at   -- value of the 'itemprop' attribute to look for
        info -- iterable of parsed elements (one per workplace)

        Not every piece of information is always available, so a missing
        <span> yields the placeholder string 'NA'.

        Returns a list with one value per element of info.
        '''
        chars = []
        for item in info:
            span = item.find('span', attrs={'itemprop': at})
            chars.append(span.text if span is not None else 'NA')

        return chars
       

Using our class, we will download the data

In [38]:

# Entry page of the dentist directory; the sub-pages are linked from it.
URL = 'https://www.dent.cz/zubni-lekari/'

dentists = Dentist_Downloader()

# Three stages, each feeding the next through dentists.links:
#   1. sub-pages  ->  2. links to individual dentists  ->  3. their data.
# Stage 3 requests every dentist page one by one, so it is the slow part.
dentists.getSubPages(link=URL)
dentists.getDentists(link=dentists.links['pages'])
dentists.getData(link=dentists.links['dentists'])


Succesfully initialized Dentist Downloader
Searching for sub-pages on https://www.dent.cz/zubni-lekari/ ...
Found 12 sub-pages
Searching for dentists on ['https://www.dent.cz/zubni-lekari/A-B/', 'https://www.dent.cz/zubni-lekari/C-D/', 'https://www.dent.cz/zubni-lekari/E-F/', 'https://www.dent.cz/zubni-lekari/G-H/', 'https://www.dent.cz/zubni-lekari/I-J/', 'https://www.dent.cz/zubni-lekari/K-L/', 'https://www.dent.cz/zubni-lekari/M-N/', 'https://www.dent.cz/zubni-lekari/O-P/', 'https://www.dent.cz/zubni-lekari/Q-R/', 'https://www.dent.cz/zubni-lekari/S-T/', 'https://www.dent.cz/zubni-lekari/U-V/', 'https://www.dent.cz/zubni-lekari/W-Z/'] ...


100%|██████████| 12/12 [00:21<00:00,  1.64s/it]


Found 10702 dentists
Searching for data about dentists on provided pages ...


100%|██████████| 10702/10702 [1:46:17<00:00,  1.92it/s]


Successfully downloaded data about 10699 dentists


## Pre-processing and saving data

From now on, the generally used methods and classes are sufficient - specifically, we use pandas to pre-process and save the data.

In [57]:
# One row per dentist; every workplace column still holds a list at this point.
data = pd.DataFrame(data=dentists.data)
print(data.head())

                       name  \
0           MUDr. Amer Abed   
1  DDS. Ahmad Amin Abosaleh   
2  MUDr. Fatemeh Aboutorabi   
3    MUDr. Marie Abrahamová   
4     MUDr. Ahmad Abu Baker   

                                            wkp_name  \
0  [MUDr. Amer Abed - Ortodoncie s.r.o., MUDr. Am...   
1  [ARIES,  spol. s r.o. , keramické zubní středi...   
2                                    [FAdent s.r.o.]   
3                           [MUDr. Abrahamová Marie]   
4                            [MUDr. Abu Baker Ahmad]   

                                street          pst_code  \
0  [Masarykova 1132/62, Krokova 22/12]  [312 00, 360 01]   
1                                   []          [552 03]   
2             [Stroupežnického 522/18]          [150 00]   
3                      [Francouzská 4]          [326 00]   
4                 [Boleslavova 1136/4]          [460 06]   

                    town                                  phone  
0  [Plzeň, Karlovy Vary]  [377421542  , 353228572

### Getting rid of lists in the dataframe

In [58]:
# examining how many elements there are in one list at maximum
workplace_counts = data['wkp_name'].map(len)
print(workplace_counts.max())

9


In [59]:
# checking that a row with nine list elements indeed exists
has_nine_workplaces = data['wkp_name'].map(len) == 9
print(data[has_nine_workplaces])

                 name                                           wkp_name  \
5301  MUDr. Jan Macko  [Stomatologie Macko s.r.o., MUDr. Macko Jan, S...   

                                                 street  \
5301  [Soukalova 3355/3, Pštrossova 198/23, Pěnčín 3...   

                                               pst_code  \
5301  [143 00, 110 00, 468 21, 155 00, 110 00, 468 4...   

                                                   town  \
5301  [Praha 4, Praha 1, Bratříkov, Praha 5, Praha 1...   

                                                  phone  
5301  [241047230, NA, 483397693 , NA, NA, NA, NA, NA...  


In [60]:
# splitting individual elements in lists into separate columns

# Source list column -> prefix of the numbered columns generated from it.
LIST_COLUMNS = {
    'wkp_name': 'wkp_name_',
    'street': 'wkp_street_',
    'pst_code': 'wkp_psc_',
    'town': 'wkp_town_',
    'phone': 'wkp_phone_',
}


def split_list_column(frame, column, prefix):
    """Expand a column of lists into numbered columns prefix1, prefix2, ...

    The number of generated columns equals the length of the longest list in
    the column, so nothing has to be hard-coded; shorter lists are padded
    with missing values. The index of ``frame`` is preserved.
    """
    wide = pd.DataFrame(frame[column].tolist(), index=frame.index)
    wide.columns = [prefix + str(i) for i in range(1, wide.shape[1] + 1)]
    return wide


wide_parts = [split_list_column(data, column, prefix)
              for column, prefix in LIST_COLUMNS.items()]

# Keep the remaining columns first, then the expanded groups in the order above.
data = pd.concat([data.drop(columns=list(LIST_COLUMNS))] + wide_parts, axis=1)

# The dict form always replaces 'NA' with a missing value. The scalar form
# replace('NA', None) is version dependent: before pandas 1.4 the explicit
# None was ignored and 'NA' cells were forward-filled from the row above.
data = data.replace({'NA': None})


### Saving the data as CSV file

In [61]:
# Persist the wide table; the relative path is resolved against the working directory.
data.to_csv(path_or_buf='dentists_wide.csv')