In [None]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from IPython.core.debugger import Tracer
from tqdm import tqdm

from IES_Downloader import IES_Downloader

# Lecture 6 - IES Web scraper

by Vítek Macháček

March 27th and 28th, 2019

* Putting it all together
* OOP + Pandas + Requests + BeautifulSoup

## Object-oriented Programming
* Brief reminder of how objects work

### Docstring + Objects + Inheriting + Constructors

In [50]:
class Parent:
    """
    A very brief and helpful description of the Parent class
    """

    def __init__(self, arg):
        """
        Exact and brief description of parent's constructor
        """
        # `arg` is stored as the instance id; `attribute` always holds the constant 'x'.
        self.id, self.attribute = arg, 'x'

    def parentMethod(self):
        """
        Good description of parentMethod
        """
        # Intentionally empty: the method exists only to be documented and inherited.
        return None
    
class Child(Parent):
    """
    This is what Child is good for
    """

    def __init__(self, arg, desc):
        """
        Exact and brief description of Child's constructor
        """
        # Keep the Child-specific description, then let Parent's constructor
        # set `attribute` and `id` from `arg`.
        self.desc = desc
        super().__init__(arg)

    def childMethod(self, arg):
        """
        Good description of childMethod
        """
        size = len(arg)
        return size

    
# Create one instance of each class: Parent takes a single argument,
# Child takes the argument forwarded to Parent plus a description.
par = Parent('A parent')
ch = Child(999,'A child')


In [52]:
?par

[0;31mType:[0m           Parent
[0;31mString form:[0m    <__main__.Parent object at 0x7fc586e1b278>
[0;31mDocstring:[0m      A very brief and helpful description of the Parent class
[0;31mInit docstring:[0m Exact and brief description of parent's constructor


## Task:
* A parser of the IES website with the following features:
    * All info about people from [Internal faculty](http://ies.fsv.cuni.cz/en/node/48), [External lecturers](http://ies.fsv.cuni.cz/en/node/49), [Ph.D. candidates](http://ies.fsv.cuni.cz/en/node/51) and [Administration](http://ies.fsv.cuni.cz/en/node/50)
    * All info about [all](http://ies.fsv.cuni.cz/en/node/109) theses between 1994 and 2019
    * Also all courses! But there is no list of courses available ...

### Robots.txt

* Is it OK to scrape?
* Guidance for search engines etc.


In [53]:
# Request the IES site's robots.txt; the response object is displayed as the cell result.
requests.get('http://ies.fsv.cuni.cz/robots.txt')

<Response [404]>

In [54]:
# Print the body of sreality.cz's robots.txt as an example of such a file.
print(requests.get('http://sreality.cz/robots.txt').text)

User-agent: *
Disallow: /advertpdf/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=


User-agent: SeznamBot
Disallow: /advertpdf/
Disallow: /en/
Disallow: /ru/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=

Sitemap: https://www.sreality.cz/site

## Pages

### Find all persons?
[Current faculty](http://ies.fsv.cuni.cz/en/node/48)

In [None]:
def getSoup(link, timeout=30):
    '''
    Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    link : str
        URL of the page to download.
    timeout : float or tuple, optional
        Passed to requests.get as its timeout. Without a timeout a stalled
        server would block the whole scrape indefinitely. Defaults to 30.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the 'lxml' parser.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx or 5xx status code.
    requests.exceptions.RequestException
        For connection problems and timeouts, as raised by requests.get.
    '''
    r = requests.get(link, timeout=timeout)
    # Stop on error responses instead of parsing an error page and failing
    # later with a confusing AttributeError in the extraction helpers.
    r.raise_for_status()
    # Decode the body as UTF-8 regardless of the encoding requests guessed.
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text,'lxml')

In [60]:
def getAllLinks(link):
    '''
    Collect the profile URLs listed on a people-overview page.

    Parameters
    ----------
    link : str
        URL of the overview page (e.g. http://ies.fsv.cuni.cz/en/node/48).

    Returns
    -------
    list of str
        One absolute URL per table cell with class 'peopleTableCellName'
        that contains a link, in page order.
    '''
    soup = getSoup(link)
    tds = soup.find_all('td', {'class':'peopleTableCellName'})
    anchors = (td.find('a') for td in tds)
    hrefs = (a.get('href') for a in anchors if a is not None)
    # Resolve each href against the page it came from rather than a
    # hard-coded host; cells without a usable link are skipped.
    return [urljoin(link, href) for href in hrefs if href]

# Profile links of everyone listed on the internal-faculty overview page (node/48).
links = getAllLinks('http://ies.fsv.cuni.cz/en/node/48')


## Person's characteristics?

In [63]:
def getName(link):
    '''
    Return the text of the first <h2> element on the page at `link`.
    '''
    heading = getSoup(link).find('h2')
    return heading.text
# Heading text for every collected profile link (one page download per link).
[getName(link) for link in links]

['doc. PhDr. Jozef Baruník Ph.D.',
 'doc. PhDr. Michal Bauer Ph.D.',
 'PhDr. Jaromír Baxa Ph.D.',
 'PhDr. Lucie Bryndová ',
 'doc. Ing. Tomáš Cahlík CSc.',
 'PhDr. František Čech ',
 'RNDr. Michal Červinka Ph.D.',
 'doc. PhDr. Julie Chytilová Ph.D.',
 'prof. Ing. Oldřich Dědek CSc.',
 'doc. PhDr. Ing. Antonie Doležalová Ph.D.',
 'doc. PhDr. Adam Geršl Ph.D.',
 'doc. PhDr. Martin Gregor Ph.D.',
 'doc. PhDr. Tomáš Havránek Ph.D.',
 'doc. PhDr. Zuzana Havránková Ph.D.',
 'PhDr. Michal Hlaváček Ph.D.',
 'Ing. Monika Hollmannová ',
 'doc. Mgr. Tomáš Holub Ph.D.',
 'prof. Roman Horváth Ph.D.',
 'doc. PhDr. Ing. Ing. Petr Jakubík Ph.D. Ph.D.',
 'prof. Ing. Karel Janda M.A., Dr., Ph.D.',
 'doc. Petr Janský Ph.D.',
 'Ing. Irena Kemény ',
 'prof. Ing. Evžen Kočenda M.A., Ph.D., DSc.',
 'prof. Ing. et Ing. Luboš Komárek Ph.D., MSc., MBA',
 'Mgr. Vědunka Kopečná ',
 'doc. PhDr. Ladislav Krištoufek Ph.D.',
 'PhDr. Jiří Kukačka Ph.D.',
 'prof. Ing. Michal Mejstřík CSc.',
 'Mgr. Ing. Matěj Nevrla ',


In [65]:
def getID(link):
    '''
    Return the last path segment of a profile URL.

    Parameters
    ----------
    link : str
        Profile URL such as http://ies.fsv.cuni.cz/en/staff/barunik.

    Returns
    -------
    str
        The text after the final '/', e.g. 'barunik'. Trailing slashes are
        ignored, so '.../staff/barunik/' also gives 'barunik' rather than
        an empty string.
    '''
    #http://ies.fsv.cuni.cz/en/staff/barunik
    return link.rstrip('/').rsplit('/', 1)[-1]
# Identifier (last URL segment) of every collected profile link.
[getID(link) for link in links]

['barunik',
 'bauerm',
 'baxajaromir',
 'antosova',
 'cahlik',
 'fcech',
 'cervinka',
 'chytilova',
 'dedek',
 'dolezalova',
 'gersl',
 'gregor',
 'havranek',
 'irsova',
 'hlavacekm',
 'hollmannov',
 'holub',
 'horvath',
 'jakubik',
 'janda',
 'jansky',
 'kemenyova',
 'kocenda',
 'komarek',
 'kopecna',
 'kristoufek',
 'kukacka',
 'mejstrik',
 'nevrla',
 'novakji',
 'malirova',
 'paulus',
 'neprasova',
 'gebicka',
 'pinter',
 'scasny',
 'schneider',
 'schwarz',
 'semerak',
 'reckova',
 'gregorovalenka',
 'reichlova',
 'teply',
 'turnovec',
 'vacek',
 'vacha',
 'visek',
 'vosvrda',
 'prochazkova',
 'zacek']

In [72]:
def getNextSiblingOfStrong(link,characteristic):
    '''
    Read the value that follows a <strong> label on the page at `link`.

    Parameters
    ----------
    link : str
        URL of the page to download.
    characteristic : str
        Exact text of the <strong> label, e.g. 'Phone:' or 'Office:'.

    Returns
    -------
    The node immediately after the matching <strong> element (its
    next_sibling), or None when the page has no such label.
    '''
    soup = getSoup(link)
    # `string=` is the current name of the argument formerly spelled `text=`.
    strong = soup.find('strong', string=characteristic)
    if strong is None:
        # Label not on this page: report "no value" instead of raising
        # AttributeError, so one incomplete page does not stop a whole scrape.
        return None
    return strong.next_sibling

def getMoreCharacteristics(link, characteristics):
    '''
    Look up several labelled values on the page at `link`.

    Parameters
    ----------
    link : str
        URL of the page to read.
    characteristics : iterable of str
        Label texts to look up, e.g. ['Phone:', 'Office:'].

    Returns
    -------
    list
        One result of getNextSiblingOfStrong per label, in the given order.
    '''
    # The lookup helper in this notebook is getNextSiblingOfStrong; the name
    # used previously (getOneCharacteristic) is not defined anywhere here.
    return [getNextSiblingOfStrong(link,char) for char in characteristics]



# Phone and office values for every collected profile link.
[getMoreCharacteristics(link,['Phone:','Office:']) for link in links]

[[' +420(776)259273', ' 503'],
 [' 222 112 329', ' 402'],
 [' 222 112 309', ' 311'],
 [' ', ' '],
 [' 222 112 318', ' 410'],
 [' +420 776 535 106', ' 503'],
 ['  +420 26605 2345 ', ' O408(IES), 225(UTIA)'],
 [' 222 112 318', ' 410'],
 [' 222 112 325', ' 510'],
 [' n.a.', ' 311'],
 [' n.a.', ' on leave at JVI'],
 [' +420 222 112 306', ' 107'],
 [' 2.2441 2318', ' 2P312 (CNB), 311 IES'],
 [' 222 112 309', ' 311'],
 [' 736 524 520', ' 408'],
 [' 222 112 324', ' 509'],
 [' +420/22441-2340', ' 311'],
 [' 222 112 317', ' 409'],
 [' +49 69 9511 19393', ' 311'],
 [' +420 222 112 316', ' 408'],
 [' ', ' 510'],
 [' 222 112 323', ' 509'],
 [' 222 112 321', ' 508'],
 [' 736524516', ' '],
 [' ', ' 602'],
 [' line 312 (IES), line 2243 (UTIA)', ' 406'],
 [' +420 602 767 305', ' 406 IES, 247 UTIA'],
 [' +420 222 112 326', ' 511'],
 [' ', ' 602'],
 [' +420 222 112 314', ' 402'],
 [' +420 602 396 703', ' IES 602'],
 [' ', ' 602'],
 [' 222 112 309', ' 311'],
 [' ', ' 408'],
 [' ', ' 402'],
 [' (+420) 220

In [None]:
def getAnotherCharacteristic(link,characteristic):
    '''
    Placeholder without an implementation; ignores its arguments and returns None.
    '''
    return None

* Let's do an object!

In [76]:
class Person:
    '''
    One person's profile page, downloaded once and reused for every lookup.

    Attributes
    ----------
    soup : BeautifulSoup
        Parsed profile page.
    office, phone :
        Node following the 'Office:' / 'Phone:' label on the page, or None
        when the page has no such label.
    '''
    def __init__(self,link):
        '''
        Download the profile page at `link` and extract office and phone.
        '''
        self.soup = getSoup(link)
        self.office = self.getNextSiblingOfStrong('Office:')
        self.phone = self.getNextSiblingOfStrong('Phone:')

    def getNextSiblingOfStrong(self,characteristic):
        '''
        Return the node right after the <strong> label whose text equals
        `characteristic`, or None when the label is not on the page.
        '''
        # `string=` is the current name of the argument formerly spelled `text=`.
        strong = self.soup.find('strong', string=characteristic)
        if strong is None:
            # A page without this label must not abort building the whole
            # list of people with an AttributeError.
            return None
        return strong.next_sibling

# Build one Person per profile link; each constructor call downloads that page.
people = [Person(link) for link in links]

In [78]:
# Office value stored on each Person by its constructor.
[p.office for p in people]

[' 503',
 ' 402',
 ' 311',
 ' ',
 ' 410',
 ' 503',
 ' O408(IES), 225(UTIA)',
 ' 410',
 ' 510',
 ' 311',
 ' on leave at JVI',
 ' 107',
 ' 2P312 (CNB), 311 IES',
 ' 311',
 ' 408',
 ' 509',
 ' 311',
 ' 409',
 ' 311',
 ' 408',
 ' 510',
 ' 509',
 ' 508',
 ' ',
 ' 602',
 ' 406',
 ' 406 IES, 247 UTIA',
 ' 511',
 ' 602',
 ' 402',
 ' IES 602',
 ' 602',
 ' 311',
 ' 408',
 ' 402',
 ' #408, EnvCntr #1617',
 ' Bethesda, Maryland',
 ' 311',
 ' 311',
 ' 602',
 ' 406',
 ' ',
 ' 511',
 ' 206a',
 ' 508',
 ' 503',
 ' 407',
 ' 402',
 ' 602',
 ' 602']

## Thesis characteristics?

In [None]:
# IES_Downloader is imported from the local IES_Downloader module, which is not shown here.
# NOTE(review): getThesesLinksForCategory presumably fills dl.links['theses']['Doctoral']
# with the thesis links found under the given node — confirm in IES_Downloader.py.
dl = IES_Downloader(allowLog=False)
dl.getThesesLinksForCategory('http://ies.fsv.cuni.cz/en/node/270/','Doctoral')
thesesLinks = dl.links['theses']['Doctoral']

In [None]:
class Thesis:
    '''
    Skeleton of a thesis object; its constructor has no implementation.
    '''
    def __init__(self,link):
        '''
        Accept a thesis link; nothing is stored or downloaded.
        '''
        return None

# Wrap every thesis link in a Thesis object; tqdm wraps the iterable to report progress.
theses = []
for t in tqdm(thesesLinks):
    theses.append(Thesis(t))

* We are lazy programmers!

In [None]:
# NOTE: running this cell rebinds the names Parent, Thesis and Person that
# earlier cells of this notebook defined with real bodies.
class Parent:
    '''Empty base class of this sketch.'''


class Thesis(Parent):
    '''Empty subclass of Parent.'''


class Person(Parent):
    '''Empty subclass of Parent.'''

See **IES_Pages.py**

### Last object missing

see **IES_Downloader.py**

# Object Structure