In [122]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from tqdm import tqdm
from IES_Downloader import IES_Downloader

In [71]:
!pip install tqdm



# Lecture 6 - IES Web scraper

by Jan Šíla, based heavily on work of Vítek Macháček

April 1st, 2020

* Putting it all together
* OOP + Pandas + Requests + BeautifulSoup

Midterm April 7th as planned -> mainly scraping. More info to come

## Object-oriented Programming
* Brief reminder of how objects work

### Docstring + Objects + Inheriting + Constructors

In [74]:
class NoConstructorClasss:
    """Minimal class without an explicit __init__ (the default constructor is used)."""

    def meth(self):
        """Print the greeting 'hi' to standard output."""
        greeting = 'hi'
        print(greeting)
        
nc = NoConstructorClasss()
nnc = NoConstructorClasss()
nnnc = NoConstructorClasss()
nc.meth()
nnnc.meth()

hi
hi


In [77]:
class Parent:
    """
    Base class of the inheritance demo.

    Holds the state shared with subclasses and declares ``main_method``,
    which subclasses are expected to override.
    """

    def __init__(self, arg):
        """
        Store ``arg`` as ``id`` and set ``attribute`` to the fixed value 'x'.
        """
        self.id = arg
        self.attribute = 'x'

    def main_method(self):
        """
        Placeholder for the subclass implementation; always raises
        NotImplementedError here.
        """
        raise NotImplementedError("override this in child class")

    def parentMethod(self):
        """
        Do nothing and return None.
        """
        return None
    
class Child(Parent):
    '''
    This is what Child is good for
    '''
    def __init__(self,arg,desc):
        '''
        Exact and brief description of Child's contructor
        '''
        # Let Parent's constructor run first: it sets `attribute` and stores
        # `arg` as a provisional `id` (999 in the example below).
        super().__init__(arg)
        # Deliberately replace the id stored by the parent with the constant 2.
        self.id = 2
        self.desc = desc

    def main_method(self):
        """ Concrete implementation of the method Parent leaves unimplemented """
        message = 'I am the main method!'
        print(message)

    def childMethod(self,arg):
        '''
        Return the length of ``arg``
        '''
        size = len(arg)
        return size

    
par = Parent('A parent')
ch = Child(999,'A child')


In [78]:
ch.id

2

In [79]:
?ch

[0;31mType:[0m           Child
[0;31mString form:[0m    <__main__.Child object at 0x1185715d0>
[0;31mDocstring:[0m      This is what Child is good for
[0;31mInit docstring:[0m Exact and brief description of Child's contructor


In [80]:
par.main_method() #this should fail

NotImplementedError: override this in child class

In [81]:
ch.main_method() #this should work ok

I am the main method!


## Task:
* A parser of IES websites with following features:
    * All info about people from [Internal faculty](http://ies.fsv.cuni.cz/en/node/48), [External lecturers](http://ies.fsv.cuni.cz/en/node/49), [Ph.D. candidates](http://ies.fsv.cuni.cz/en/node/51) and [Administration](http://ies.fsv.cuni.cz/en/node/50)
    * Info about [all](http://ies.fsv.cuni.cz/en/node/109) theses between 1994 and 2019 won't be covered, as we have problems with the website
    * Also all courses! But no list of courses available ...

### Robots.txt

* Is it OK to scrape?
* Guidance for search engines etc.


https://www.promptcloud.com/blog/how-to-read-and-respect-robots-file/

In [82]:
requests.get('http://ies.fsv.cuni.cz/robots.txt')

<Response [404]>

In [83]:
print(requests.get('http://sreality.cz/robots.txt').text)

User-agent: *
Disallow: /advertpdf/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=


User-agent: SeznamBot
Disallow: /advertpdf/
Disallow: /en/
Disallow: /ru/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=

Sitemap: https://www.sreality.cz/site

## Pages

### Find all persons?
[Current faculty](http://ies.fsv.cuni.cz/en/node/48)

1. understand structure of the website

In [92]:
def getSoup(link, delay=0.1, timeout=30):
    """
    Download a page and return it parsed as a BeautifulSoup tree.

    Parameters
    ----------
    link : str
        URL of the page to download.
    delay : float, optional
        Seconds to sleep before the request (default 0.1, as before).
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    BeautifulSoup
        The response body decoded as UTF-8 and parsed with the 'lxml' parser.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    sleep(delay) #to be kind to the website
    # Without a timeout a stalled server would hang the whole scrape.
    r = requests.get(link, timeout=timeout)
    # Fail loudly instead of silently parsing an error page as if it were data.
    r.raise_for_status()
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text,'lxml')

In [109]:
bf = getSoup('https://ies.fsv.cuni.cz/en/node/48')

In [110]:
tds = bf.findAll('td', {'class':'peopleTableCellName'})

'/en/staff/barunik'

In [120]:
def getAllLinks(link):
    """
    Collect the absolute profile URLs listed on a people-overview page.

    Parameters
    ----------
    link : str
        URL of the overview page.

    Returns
    -------
    list of str
        'https://ies.fsv.cuni.cz' + href for every <td class="peopleTableCellName">
        cell that contains a link, in page order. Cells without an <a href=...>
        are skipped.
    """
    soup = getSoup(link)
    # find_all is the documented method name; findAll is only a legacy alias.
    cells = soup.find_all('td', {'class':'peopleTableCellName'})
    urls = []
    for cell in cells:
        anchor = cell.find('a')
        # A name cell without a usable link would otherwise raise TypeError.
        if anchor is None or not anchor.get('href'):
            continue
        urls.append('https://ies.fsv.cuni.cz' + anchor['href'])
    return urls

links = getAllLinks('http://ies.fsv.cuni.cz/en/node/48')
links

['https://ies.fsv.cuni.cz/en/staff/barunik',
 'https://ies.fsv.cuni.cz/en/staff/bauerm',
 'https://ies.fsv.cuni.cz/en/staff/baxajaromir',
 'https://ies.fsv.cuni.cz/en/staff/bertoli',
 'https://ies.fsv.cuni.cz/en/staff/antosova',
 'https://ies.fsv.cuni.cz/en/staff/cahlik',
 'https://ies.fsv.cuni.cz/en/staff/fcech',
 'https://ies.fsv.cuni.cz/en/staff/cervinka',
 'https://ies.fsv.cuni.cz/en/staff/chytilova',
 'https://ies.fsv.cuni.cz/en/staff/dedek',
 'https://ies.fsv.cuni.cz/en/staff/dolezalova',
 'https://ies.fsv.cuni.cz/en/staff/gersl',
 'https://ies.fsv.cuni.cz/en/staff/gregor',
 'https://ies.fsv.cuni.cz/en/staff/havranek',
 'https://ies.fsv.cuni.cz/en/staff/irsova',
 'https://ies.fsv.cuni.cz/en/staff/hlavacekm',
 'https://ies.fsv.cuni.cz/en/staff/hollmannov',
 'https://ies.fsv.cuni.cz/en/staff/holub',
 'https://ies.fsv.cuni.cz/en/staff/horvath',
 'https://ies.fsv.cuni.cz/en/staff/jakubik',
 'https://ies.fsv.cuni.cz/en/staff/janda',
 'https://ies.fsv.cuni.cz/en/staff/jansky',
 'https:

## Person's characteristics?

In [121]:
def getName(link):
    """
    Download the page at ``link`` and return the text of its first <h2> element.
    """
    heading = getSoup(link).find('h2')
    return heading.text
names = [getName(link) for link in links]

In [123]:
names

['doc. PhDr. Jozef Baruník Ph.D.',
 'doc. PhDr. Michal Bauer Ph.D.',
 'PhDr. Jaromír Baxa Ph.D.',
 'doc. Paola Bertoli M.A., MSc., Ph.D.',
 'PhDr. Lucie Bryndová ',
 'doc. Ing. Tomáš Cahlík CSc.',
 'PhDr. František Čech Ph.D.',
 'RNDr. Michal Červinka Ph.D.',
 'doc. PhDr. Julie Chytilová Ph.D.',
 'prof. Ing. Oldřich Dědek CSc.',
 'doc. PhDr. Ing. Antonie Doležalová Ph.D.',
 'doc. PhDr. Adam Geršl Ph.D.',
 'doc. PhDr. Martin Gregor Ph.D.',
 'doc. PhDr. Tomáš Havránek Ph.D.',
 'doc. PhDr. Zuzana Havránková Ph.D.',
 'PhDr. Michal Hlaváček Ph.D.',
 'Ing. Monika Hollmannová ',
 'doc. Mgr. Tomáš Holub Ph.D.',
 'prof. Roman Horváth Ph.D.',
 'doc. PhDr. Ing. Ing. Petr Jakubík Ph.D. Ph.D.',
 'prof. Ing. Karel Janda M.A., Dr., Ph.D.',
 'doc. Petr Janský Ph.D.',
 'Ing. Irena Kemény ',
 'prof. Ing. Evžen Kočenda M.A., Ph.D., DSc.',
 'prof. Ing. et Ing. Luboš Komárek Ph.D., MSc., MBA',
 'doc. PhDr. Ladislav Krištoufek Ph.D.',
 'PhDr. Jiří Kukačka Ph.D.',
 'Mgr. Jan Mareš ',
 'prof. Ing. Michal Mejs

In [127]:
"http://ies.fsv.cuni.cz/en/staff/barunik".split('/')[-1]

'barunik'

In [43]:
def getID(link):
    """
    Return the last path segment of ``link``.

    Trailing slashes are ignored, so 'http://host/en/staff/barunik' and
    'http://host/en/staff/barunik/' both give 'barunik'.

    Parameters
    ----------
    link : str
        URL (or any '/'-separated path).

    Returns
    -------
    str
        The text after the last '/', once trailing slashes are removed.
    """
    #http://ies.fsv.cuni.cz/en/staff/barunik -> barunik
    # rstrip first: otherwise a trailing slash would give an empty id.
    return link.rstrip('/').rsplit('/', 1)[-1]
[getID(link) for link in links]

['barunik',
 'bauerm',
 'baxajaromir',
 'bertoli',
 'antosova',
 'cahlik',
 'fcech',
 'cervinka',
 'chytilova',
 'dedek',
 'dolezalova',
 'gersl',
 'gregor',
 'havranek',
 'irsova',
 'hlavacekm',
 'hollmannov',
 'holub',
 'horvath',
 'jakubik',
 'janda',
 'jansky',
 'kemenyova',
 'kocenda',
 'komarek',
 'kristoufek',
 'kukacka',
 'maresj',
 'mejstrik',
 'nevrla',
 'novakji',
 'opatrny',
 'malirova',
 'palansky',
 'paulus',
 'neprasova',
 'gebicka',
 'pinter',
 'scasny',
 'schneider',
 'schwarz',
 'sedivy',
 'semerak',
 'reckova',
 'gregorovalenka',
 'reichlova',
 'teply',
 'vacek',
 'vacha',
 'visek',
 'vosvrda',
 'prochazkova',
 'zacek']

In [128]:
links[:2]

['https://ies.fsv.cuni.cz/en/staff/barunik',
 'https://ies.fsv.cuni.cz/en/staff/bauerm']

In [145]:
# NOTE: getMoreCharacteristics is defined in a later cell (In [151]); on a fresh
# kernel that cell has to be run before this one.
values = getMoreCharacteristics('https://ies.fsv.cuni.cz/en/staff/barunik', ['Phone:','Office:'])

In [146]:
values = [el.strip() for el in values]

In [147]:
values

['+420(776)259273', '503']

In [141]:
# NOTE(review): `obj` is not defined in any cell visible here - presumably the
# <strong> tag located for 'Phone:'; confirm and define it explicitly.
obj.next_sibling

' +420(776)259273'

In [151]:
def _siblingTextOfStrong(soup, characteristic):
    """
    Return the stripped text that directly follows the <strong> element whose
    string equals ``characteristic``, or None if there is no such element or
    nothing follows it.
    """
    # `string=` is the current name of the argument formerly called `text=`.
    strong = soup.find('strong', string=characteristic)
    if strong is None or strong.next_sibling is None:
        return None
    sibling = strong.next_sibling
    # Plain text nodes are str subclasses; a tag sibling needs get_text().
    text = sibling if isinstance(sibling, str) else sibling.get_text()
    return text.strip()

def getNextSiblingOfStrong(link,characteristic):
    """
    Download ``link`` and return the value that follows the given label.

    Parameters
    ----------
    link : str
        URL of the page to download.
    characteristic : str
        Exact text of the <strong> label, e.g. 'Phone:'.

    Returns
    -------
    str or None
        The stripped text after the label, or None if the label is missing.
    """
    return _siblingTextOfStrong(getSoup(link), characteristic)

def getMoreCharacteristics(link, characteristics):
    """
    Return the values for several labels from one page.

    The page is downloaded once and reused for every label.

    Parameters
    ----------
    link : str
        URL of the page to download.
    characteristics : iterable of str
        Label texts to look up, e.g. ['Phone:', 'Office:'].

    Returns
    -------
    list
        One entry per label, in the same order; None where a label is missing.
    """
    labels = list(characteristics)
    if not labels:
        # Nothing to look up, so do not hit the website at all.
        return []
    soup = getSoup(link)
    return [_siblingTextOfStrong(soup, char) for char in labels]



[getMoreCharacteristics(link,['Phone:','Office:']) for link in links[:2]]

[['+420(776)259273', '503'], ['222 112 329', '402']]

In [25]:
def getAnotherCharacteristic(link,characteristic):
    """
    Placeholder that is not implemented yet: ignores both arguments and
    returns None.
    """
    return None

* Let's do an object!

In [152]:
class Person:
    """
    One person's profile page, downloaded and parsed once in the constructor.

    Attributes set by the constructor: ``soup`` (the parsed page), ``office``
    and ``phone`` (stripped text after the 'Office:' / 'Phone:' labels, or
    None when the label is not on the page).
    """
    def __init__(self,link):
        """
        Download the page at ``link`` and extract office and phone from it.
        """
        self.soup = getSoup(link)
        self.office = self.getNextSiblingOfStrong('Office:')
        self.phone = self.getNextSiblingOfStrong('Phone:')
        
    def getNextSiblingOfStrong(self,characteristic):
        """
        Return the stripped text that directly follows the <strong> element
        whose string equals ``characteristic`` on this person's page.

        Returns None if the label is missing or nothing follows it.
        """
        # `string=` is the current name of the argument formerly called `text=`.
        strong = self.soup.find('strong',string=characteristic)
        if strong is None or strong.next_sibling is None:
            return None
        sibling = strong.next_sibling
        # Plain text nodes are str subclasses; a tag sibling needs get_text().
        text = sibling if isinstance(sibling, str) else sibling.get_text()
        # Strip so values match the module-level helper (no leading space).
        return text.strip()

people = [Person(link) for link in links[:2]]

In [154]:
people

[<__main__.Person at 0x1189e6e50>, <__main__.Person at 0x118e08d90>]

In [155]:
#forgot to add an attribute to the constructor?
[p.soup.find('h2') for p in people]

[<h2>doc. PhDr. Jozef Baruník Ph.D.</h2>,
 <h2>doc. PhDr. Michal Bauer Ph.D.</h2>]

In [158]:
for p_instance in people:
    p_instance.full_name = p_instance.soup.find('h2')

In [162]:
people[1].full_name

<h2>doc. PhDr. Michal Bauer Ph.D.</h2>