In [1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from IES_Downloader import IES_Downloader

# Lecture 6 - IES Web scraper

by Vítek Macháček

November 12th, 2019

* Putting it all together
* OOP + Pandas + Requests + BeautifulSoup

## Object-oriented Programming
* Brief reminder of how objects work

### Docstring + Objects + Inheriting + Constructors

In [3]:
class Parent:
    '''
    A very brief and helpful description of the Parent class
    '''
    def __init__(self,arg):
        '''
        Exact and brief description of parent's constructor
        '''
        self.attribute = 'x'
        self.id = arg
        
    def parentMethod(self):
        '''
        Good description of parentMethod
        '''
        pass
    
class Child(Parent):
    '''
    This is what Child is good for
    '''
    def __init__(self,arg,desc):
        '''
        Exact and brief description of Child's contructor
        '''
        super().__init__(arg)
        self.desc = desc
    
    def childMethod(self,arg):
        '''
        Good description of childMethod
        '''
        return len(arg)

    
par = Parent('A parent')
ch = Child(999,'A child')


In [4]:
ch.id

999

In [6]:
?ch

[0;31mType:[0m           Child
[0;31mString form:[0m    <__main__.Child object at 0x7fc4a1077710>
[0;31mDocstring:[0m      This is what Child is good for
[0;31mInit docstring:[0m Exact and brief description of Child's contructor


## Task:
* A parser of IES websites with following features:
    * All info about people from [Internal faculty](http://ies.fsv.cuni.cz/en/node/48), [External lecturers](http://ies.fsv.cuni.cz/en/node/49), [Ph.D. candidates](http://ies.fsv.cuni.cz/en/node/51) and [Administration](http://ies.fsv.cuni.cz/en/node/50)
    * All info about [all](http://ies.fsv.cuni.cz/en/node/109) theses between 1994 and 2019 won'be covered as we have problems with the website
    * Also all courses! But no list of courses available ...

### Robots.txt

* Is it OK to scrape?
* Guidance for search engines etc.


In [8]:
requests.get('http://ies.fsv.cuni.cz/robots.txt')

<Response [404]>

In [7]:
print(requests.get('http://sreality.cz/robots.txt').text)

User-agent: *
Disallow: /advertpdf/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=


User-agent: SeznamBot
Disallow: /advertpdf/
Disallow: /en/
Disallow: /ru/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=

Sitemap: https://www.sreality.cz/site

## Pages

### Find all persons?
[Current faculty](http://ies.fsv.cuni.cz/en/node/48)

In [9]:
def getSoup(link):
    r = requests.get(link)
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text,'lxml')

In [10]:
def getAllLinks(link):
    soup = getSoup(link)
    tds = soup.findAll('td', {'class':'peopleTableCellName'})
    return ['http://ies.fsv.cuni.cz' + td.find('a')['href'] for td in tds]

links = getAllLinks('http://ies.fsv.cuni.cz/en/node/48')
links

['http://ies.fsv.cuni.cz/en/staff/barunik',
 'http://ies.fsv.cuni.cz/en/staff/bauerm',
 'http://ies.fsv.cuni.cz/en/staff/baxajaromir',
 'http://ies.fsv.cuni.cz/en/staff/bertoli',
 'http://ies.fsv.cuni.cz/en/staff/antosova',
 'http://ies.fsv.cuni.cz/en/staff/cahlik',
 'http://ies.fsv.cuni.cz/en/staff/fcech',
 'http://ies.fsv.cuni.cz/en/staff/cervinka',
 'http://ies.fsv.cuni.cz/en/staff/chytilova',
 'http://ies.fsv.cuni.cz/en/staff/dedek',
 'http://ies.fsv.cuni.cz/en/staff/dolezalova',
 'http://ies.fsv.cuni.cz/en/staff/gersl',
 'http://ies.fsv.cuni.cz/en/staff/gregor',
 'http://ies.fsv.cuni.cz/en/staff/havranek',
 'http://ies.fsv.cuni.cz/en/staff/irsova',
 'http://ies.fsv.cuni.cz/en/staff/hlavacekm',
 'http://ies.fsv.cuni.cz/en/staff/hollmannov',
 'http://ies.fsv.cuni.cz/en/staff/holub',
 'http://ies.fsv.cuni.cz/en/staff/horvath',
 'http://ies.fsv.cuni.cz/en/staff/jakubik',
 'http://ies.fsv.cuni.cz/en/staff/janda',
 'http://ies.fsv.cuni.cz/en/staff/jansky',
 'http://ies.fsv.cuni.cz/en/st

## Person's characteristics?

In [11]:
def getName(link):
    soup = getSoup(link)
    return soup.find('h2').text
[getName(link) for link in links]

['doc. PhDr. Jozef Baruník Ph.D.',
 'doc. PhDr. Michal Bauer Ph.D.',
 'PhDr. Jaromír Baxa Ph.D.',
 ' Paola Bertoli M.A., MSc., Ph.D.',
 'PhDr. Lucie Bryndová ',
 'doc. Ing. Tomáš Cahlík CSc.',
 'PhDr. František Čech Ph.D.',
 'RNDr. Michal Červinka Ph.D.',
 'doc. PhDr. Julie Chytilová Ph.D.',
 'prof. Ing. Oldřich Dědek CSc.',
 'doc. PhDr. Ing. Antonie Doležalová Ph.D.',
 'doc. PhDr. Adam Geršl Ph.D.',
 'doc. PhDr. Martin Gregor Ph.D.',
 'doc. PhDr. Tomáš Havránek Ph.D.',
 'doc. PhDr. Zuzana Havránková Ph.D.',
 'PhDr. Michal Hlaváček Ph.D.',
 'Ing. Monika Hollmannová ',
 'doc. Mgr. Tomáš Holub Ph.D.',
 'prof. Roman Horváth Ph.D.',
 'doc. PhDr. Ing. Ing. Petr Jakubík Ph.D. Ph.D.',
 'prof. Ing. Karel Janda M.A., Dr., Ph.D.',
 'doc. Petr Janský Ph.D.',
 'Ing. Irena Kemény ',
 'prof. Ing. Evžen Kočenda M.A., Ph.D., DSc.',
 'prof. Ing. et Ing. Luboš Komárek Ph.D., MSc., MBA',
 'doc. PhDr. Ladislav Krištoufek Ph.D.',
 'PhDr. Jiří Kukačka Ph.D.',
 'Mgr. Jan Mareš ',
 'prof. Ing. Michal Mejstřík

In [12]:
def getID(link):
    #http://ies.fsv.cuni.cz/en/staff/barunik
    return link.split('/')[-1]
[getID(link) for link in links]

['barunik',
 'bauerm',
 'baxajaromir',
 'bertoli',
 'antosova',
 'cahlik',
 'fcech',
 'cervinka',
 'chytilova',
 'dedek',
 'dolezalova',
 'gersl',
 'gregor',
 'havranek',
 'irsova',
 'hlavacekm',
 'hollmannov',
 'holub',
 'horvath',
 'jakubik',
 'janda',
 'jansky',
 'kemenyova',
 'kocenda',
 'komarek',
 'kristoufek',
 'kukacka',
 'maresj',
 'mejstrik',
 'nevrla',
 'novakji',
 'opatrny',
 'malirova',
 'palansky',
 'paulus',
 'neprasova',
 'gebicka',
 'pinter',
 'scasny',
 'schneider',
 'schwarz',
 'sedivy',
 'semerak',
 'reckova',
 'gregorovalenka',
 'reichlova',
 'teply',
 'vacek',
 'vacha',
 'visek',
 'vosvrda',
 'prochazkova',
 'zacek']

In [13]:
def getNextSiblingOfStrong(link,characteristic):
    soup = getSoup(link)
    strong = soup.find('strong',text=characteristic)
    return strong.next_sibling

def getMoreCharacteristics(link, characteristics):
    return [getNextSiblingOfStrong(link,char) for char in characteristics]



[getMoreCharacteristics(link,['Phone:','Office:']) for link in links[:2]]

[[' +420(776)259273', ' 503'], [' 222 112 329', ' 402']]

In [None]:
def getAnotherCharacteristic(link,characteristic):
    pass

* Let's do an object!

In [14]:
class Person:
    def __init__(self,link):
        self.soup = getSoup(link)
        self.office = self.getNextSiblingOfStrong('Office:')
        self.phone = self.getNextSiblingOfStrong('Phone:')
        
    def getNextSiblingOfStrong(self,characteristic):
        strong = self.soup.find('strong',text=characteristic)
        return strong.next_sibling

people = [Person(link) for link in links[:2]]

In [17]:
[p.soup.find('h2') for p in people]

[<h2>doc. PhDr. Jozef Baruník Ph.D.</h2>,
 <h2>doc. PhDr. Michal Bauer Ph.D.</h2>]

## Thesis characteristics?

In [None]:
dl = IES_Downloader(allowLog=False)
dl.getThesesLinksForCategory('http://ies.fsv.cuni.cz/en/node/270/','Doctoral')
thesesLinks = dl.links['theses']['Doctoral']

In [None]:
class Thesis:
    def __init__(self,link):
        pass

theses = []
for t in tqdm(thesesLinks):
    theses.append(Thesis(t))

* We are lazy programmers!

In [None]:
class Parent:
    pass

class Thesis(Parent):
    pass

class Person(Parent):
    pass

See **IES_Pages.py**

### Last object missing

see **IES_Downloader.py**

# Object Structure