In [4]:
import re
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from IES_Downloader import IES_Downloader

# Lecture 6 - IES Web scraper

by Vítek Macháček

November 3rd, 2020

* Putting it all together
* OOP + Pandas + Requests + Scraping


## Object-oriented Programming
* Brief reminder of how objects work

### Docstring + Objects + Inheriting + Constructors

![Simple Object Structure](./img/class_example.png)

In [2]:
x = Person()

ZeroDivisionError: division by zero

In [1]:
# Minimal exception-handling demo: dividing by zero raises ZeroDivisionError.
try:
    5/0
except ZeroDivisionError:
    # Catch only the error being demonstrated; a bare `except:` would also
    # hide unrelated failures such as KeyboardInterrupt or typos in the block.
    print('cannot divide with zero')

cannot divide with zero


In [7]:
class PersonException(Exception):
    '''Error raised by Person for an invalid name or e-mail, or when describe() is not overridden.'''


class TeacherException(Exception):
    '''Error raised by Teacher when its list of courses cannot be validated.'''


class StudentException(Exception):
    '''Error raised by Student when its list of courses cannot be validated.'''


class Person:
    '''
    A Person class is *Abstract* - intended not to be used directly, but rather to be inherited.

    Subclasses are expected to override ``describe``.
    '''

    # Simplified e-mail shape: local part, "@", one domain label, ".", rest of the domain.
    # Raw string so that "\." is a regex escape and not an (invalid) string escape.
    _EMAIL_PATTERN = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    # A valid course ident is six characters long and starts with one of these prefixes.
    _COURSE_PREFIXES = ('JEM', 'JEB')
    _COURSE_IDENT_LENGTH = 6

    def __init__(self,name,email):
        '''
        Validate the name and e-mail and store them as instance attributes.

        Raises:
            PersonException: if the name or the e-mail is not in a valid format.
        '''
        if not self._verify_name(name):
            raise PersonException('Name is invalid')
        self.name = name

        if not self._verify_email(email):
            raise PersonException('Email is invalid')
        self.email = email

    def _verify_name(self,name):
        '''
        Return True when the name is a non-empty string, False otherwise.
        '''
        # The type check comes first so that non-string input (e.g. an int or None)
        # yields False instead of a TypeError from len().
        return isinstance(name, str) and len(name) > 0

    def _verify_email(self,email):
        '''
        Return True when the e-mail is a string matching the simplified e-mail pattern.
        '''
        if not isinstance(email, str):
            return False
        # fullmatch requires the whole string to match, so a trailing newline is rejected.
        return re.fullmatch(self._EMAIL_PATTERN, email) is not None

    def _verify_courses(self,courses):
        '''
        Return True when courses is a list in which every item is a valid course ident.

        A valid ident is a string of length six starting with 'JEM' or 'JEB'.
        An empty list is considered valid.
        '''
        def _verify_course(course):
            return (
                isinstance(course, str)
                and len(course) == self._COURSE_IDENT_LENGTH
                and course.startswith(self._COURSE_PREFIXES)
            )

        if not isinstance(courses, list):
            return False
        return all(_verify_course(course) for course in courses)

    def describe(self):
        '''
        Placeholder to be overridden by subclasses; always raises PersonException here.
        '''
        raise PersonException("Cannot describe parent Person class. Override this in Teacher or Student class")

    def get_name(self):
        '''Return the stored name.'''
        return self.name

    def get_email(self):
        '''Return the stored e-mail.'''
        return self.email
someone = Person('Name','email@somewhere')

PersonException: Email is invalid

In [6]:
someone

<__main__.Person at 0x1eb66333208>

In [8]:
someone.describe()

PersonException: Cannot describe parent Person class. Override this in Teacher or Student class

In [9]:
 class Teacher(Person):
    '''
    A Person who teaches: keeps a name, an e-mail and the list of courses taught.
    '''
    def __init__(self,name,email,teaching_courses):
        '''
        Run the Person constructor first, then validate and store the taught courses.

        Raises TeacherException when ``_verify_courses`` rejects the list.
        '''
        super().__init__(name,email)

        # Guard clause: refuse an invalid course list before storing anything.
        if not self._verify_courses(teaching_courses):
            raise TeacherException('Cannot validate courses.')
        self.teaching_courses = teaching_courses

    def describe(self):
        """ Print an introduction with the name, e-mail and taught courses (overrides Person.describe). """
        intro = 'I am {}, my email is {} and I teach following courses: {}'.format(self.name,self.email,self.teaching_courses)
        print(intro)
    
        
class Student(Person):
    '''
    A Person who studies: keeps a name, an e-mail and the list of courses studied.
    '''
    def __init__(self,name,email,studying_courses):
        '''
        Run the Person constructor first, then validate and store the studied courses.

        Raises StudentException when ``_verify_courses`` rejects the list.
        '''
        super().__init__(name,email)

        # Guard clause: refuse an invalid course list before storing anything.
        if not self._verify_courses(studying_courses):
            raise StudentException('Cannot validate courses.')
        self.studying_courses = studying_courses

    def describe(self):
        """ Print an introduction listing the studied courses (overrides Person.describe). """
        intro = "I am Vítek's student. My name is not important, although I have one. It will be important during the final evaluation though. I study following courses: {}".format(self.studying_courses)
        print(intro)
        
vitek = Teacher('Vítek','vitezkytek@gmail.com',['JEM207'])

vitek.describe()

I am Vítek, my email is vitezkytek@gmail.com and I teach following courses: ['JEM207']


In [11]:
my_student = Student('Honza','honza@fsv.cuni.cz',['JEM20','JEB111'])
my_student.describe()

StudentException: Cannot validate courses.

In [None]:
my_student.get_name()

In [12]:
?Person

[1;31mInit signature:[0m [0mPerson[0m[1;33m([0m[0mname[0m[1;33m,[0m [0memail[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m      A Person class is *Abstract* - Intended not to be used directly, but rather to be inherited.
[1;31mInit docstring:[0m Person's constructor accepts name and e-mail and set it as class attributes, but first it checks whether they are in valid format
[1;31mType:[0m           type
[1;31mSubclasses:[0m     Teacher, Student


In [None]:
?Teacher

### Robots.txt

* Is it OK to scrape?
* Guidance for search engines etc.


https://www.promptcloud.com/blog/how-to-read-and-respect-robots-file/

In [13]:
requests.get('http://ies.fsv.cuni.cz/robots.txt')

<Response [404]>

In [14]:
print(requests.get('http://sreality.cz/robots.txt').text)

User-agent: *
Disallow: /

User-agent: Googlebot
Allow: /
Disallow: /advertpdf/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region=*,
Disallow: /rk-detail
Disallow: *bez-aukce=
Disallow: *without-auction=
Disallow: *pois_in_place=
Disallow: *pois_in_place_distance=

User-agent: SeznamBot
Allow: /
Disallow: /advertpdf/
Disallow: /en/
Disallow: /ru/
Disallow: /favourites-info
Disallow: *_buri=
Disallow: /adresar/*page=
Disallow: /adresar/*perPage=
Disallow: /adresar/*search=
Disallow: /adresar/*letter=
Disallow: /adresar/*id=
Disallow: /firma/*page=
Disallow: /firma/*perPage=
Disallow: /firma/*search=
Disallow: /firma/*letter=
Disallow: /firma/*id=
Disallow: /hledani/*,
Allow: /hledani/*region

## Task:
* A parser of IES websites with following features:
    * All info about people from [Internal faculty](http://ies.fsv.cuni.cz/en/node/48), [External lecturers](http://ies.fsv.cuni.cz/en/node/49), [Ph.D. candidates](http://ies.fsv.cuni.cz/en/node/51) and [Administration](http://ies.fsv.cuni.cz/en/node/50)
    * All info about [all](http://ies.fsv.cuni.cz/en/node/109) theses between 1994 and 2019 (this part won't be covered, as we have problems with the website)
    * Also all courses! But no list of courses available ...

## Pages

### Find all persons?
[Current faculty](http://ies.fsv.cuni.cz/en/node/48)

1. understand structure of the website

In [16]:
def getSoup(link, delay=0.1, timeout=30):
    """
    Download a page and return it parsed as a BeautifulSoup object.

    Parameters:
        link: URL of the page to download.
        delay: seconds to wait before the request, to be kind to the website.
        timeout: seconds to wait for the server before giving up; without it
            a stalled connection would block the notebook indefinitely.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so that
            an error page is not silently parsed as if it were real content.
    """
    sleep(delay) #to be kind to the website
    r = requests.get(link, timeout=timeout)
    r.raise_for_status()
    r.encoding = 'UTF-8'  # decode the body as UTF-8 regardless of the response headers
    return BeautifulSoup(r.text,'lxml')

In [17]:
soup = getSoup('https://ies.fsv.cuni.cz/en/node/48')

In [18]:
names = soup.findAll('td', {'class':'peopleTableCellName'})
names

[<td class="peopleTableCellName">
 <a href="/en/staff/barunik"><b>doc. PhDr. Jozef Baruník Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/bauerm"><b>doc. PhDr. Michal Bauer Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/baxajaromir"><b>PhDr. Jaromír Baxa Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/bertoli"><b>doc. Paola Bertoli M.A., MSc., Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/antosova"><b>PhDr. Lucie Bryndová </b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/cahlik"><b>doc. Ing. Tomáš Cahlík CSc.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/fcech"><b>PhDr. František Čech Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/cervinka"><b>RNDr. Michal Červinka Ph.D.</b></a>
 </td>,
 <td class="peopleTableCellName">
 <a href="/en/staff/chytilova"><b>doc. PhDr. Julie Chytilová Ph.D.</b></a>
 </td

In [19]:
def getAllLinks(link, base_url='https://ies.fsv.cuni.cz'):
    """
    Return absolute profile links found on a people-listing page.

    Parameters:
        link: URL of the listing page; it is downloaded with ``getSoup``.
        base_url: site root used to resolve the hrefs found in the table.

    Returns:
        A list of URLs, one per ``<td class="peopleTableCellName">`` cell that
        contains an anchor with an ``href``. Cells without such an anchor are skipped.
    """
    soup = getSoup(link)
    cells = soup.find_all('td', {'class':'peopleTableCellName'})
    anchors = (cell.find('a') for cell in cells)
    # urljoin resolves site-relative hrefs against base_url and leaves
    # already-absolute hrefs untouched, unlike plain string concatenation.
    return [urljoin(base_url, a['href']) for a in anchors if a is not None and a.get('href')]

links = getAllLinks('http://ies.fsv.cuni.cz/en/node/48')
links

['https://ies.fsv.cuni.cz/en/staff/barunik',
 'https://ies.fsv.cuni.cz/en/staff/bauerm',
 'https://ies.fsv.cuni.cz/en/staff/baxajaromir',
 'https://ies.fsv.cuni.cz/en/staff/bertoli',
 'https://ies.fsv.cuni.cz/en/staff/antosova',
 'https://ies.fsv.cuni.cz/en/staff/cahlik',
 'https://ies.fsv.cuni.cz/en/staff/fcech',
 'https://ies.fsv.cuni.cz/en/staff/cervinka',
 'https://ies.fsv.cuni.cz/en/staff/chytilova',
 'https://ies.fsv.cuni.cz/en/staff/dedek',
 'https://ies.fsv.cuni.cz/en/staff/dolezalova',
 'https://ies.fsv.cuni.cz/en/staff/gersl',
 'https://ies.fsv.cuni.cz/en/staff/gregor',
 'https://ies.fsv.cuni.cz/en/staff/havranek',
 'https://ies.fsv.cuni.cz/en/staff/irsova',
 'https://ies.fsv.cuni.cz/en/staff/hlavacekm',
 'https://ies.fsv.cuni.cz/en/staff/hollmannov',
 'https://ies.fsv.cuni.cz/en/staff/holub',
 'https://ies.fsv.cuni.cz/en/staff/horvath',
 'https://ies.fsv.cuni.cz/en/staff/jakubik',
 'https://ies.fsv.cuni.cz/en/staff/janda',
 'https://ies.fsv.cuni.cz/en/staff/jansky',
 'https:

## Person's characteristics?

[A TEACHER!](https://ies.fsv.cuni.cz/en/staff/barunik)

In [21]:
def getName(link):
    """Download the page at ``link`` and return the text of its first ``<h2>`` element."""
    heading = getSoup(link).find('h2')
    return heading.text
if True:
    names = [getName(link) for link in links]
    names

In [25]:
soup = getSoup(links[0])
node = soup.find('strong',text='Phone:')
node.next_sibling.strip()

'+420(776)259273'

In [26]:
def getNextSiblingOfStrong(soup,characteristic):
    """
    Return the stripped text that directly follows a ``<strong>`` label.

    Parameters:
        soup: parsed page (or any object with a compatible ``find`` method).
        characteristic: exact text of the ``<strong>`` label, e.g. ``'Phone:'``.

    Returns:
        The stripped text of the node following the label, or ``None`` when the
        label is not on the page or has nothing after it.
    """
    # `string=` is the current name of the keyword formerly called `text=`.
    strong = soup.find('strong',string=characteristic)
    if strong is None:
        return None

    sibling = strong.next_sibling
    if sibling is None:
        return None
    if isinstance(sibling, str):
        return sibling.strip()
    # The label is followed by a tag rather than bare text: use the tag's text.
    return sibling.get_text(strip=True)
getNextSiblingOfStrong(soup,'Phone:')

'+420(776)259273'

In [27]:
link

NameError: name 'link' is not defined

In [33]:
import pandas as pd
def getMoreCharacteristics(link, characteristics):
    """
    Download a profile page and collect the requested labelled values.

    Each label in ``characteristics`` (e.g. ``'Phone:'``) is looked up with
    ``getNextSiblingOfStrong``; the result is a pandas Series indexed by the
    label with its colons removed.
    """
    page = getSoup(link)
    values = {}
    for label in characteristics:
        values[label.replace(':','')] = getNextSiblingOfStrong(page, label)
    return pd.Series(values)

#[getMoreCharacteristics(link,['Phone:','Office:','Position:']) for link in links[:2]]
getMoreCharacteristics(links[0],['Phone:','Office:','Position:'])

Phone           +420(776)259273
Office                      503
Position    Associate Professor
dtype: object

* Let's do an object!

In [42]:
class Person:
    """
    A person scraped from a profile page.

    Attributes set by the constructor:
        link: URL the profile was downloaded from.
        soup: parsed profile page.
        name: text of the first ``<h2>`` on the page (``None`` if there is none).
        office: text following the ``Office:`` label (``None`` if missing).
        phone: text following the ``Phone:`` label (``None`` if missing).
    """
    def __init__(self,link):
        """Download the profile page at ``link`` with ``getSoup`` and extract the basic fields."""
        self.link = link
        self.soup = getSoup(link)
        self.name = self.getName()
        self.office = self.getNextSiblingOfStrong('Office:')
        self.phone = self.getNextSiblingOfStrong('Phone:')

    def getNextSiblingOfStrong(self, characteristic):
        """
        Return the stripped text that directly follows the ``<strong>`` label
        ``characteristic``, or ``None`` when the label or its value is missing.
        """
        # `string=` is the current name of the keyword formerly called `text=`.
        strong = self.soup.find('strong',string=characteristic)
        if strong is None:
            return None

        sibling = strong.next_sibling
        if sibling is None:
            return None
        if isinstance(sibling, str):
            return sibling.strip()
        # The label is followed by a tag rather than bare text: use the tag's text.
        return sibling.get_text(strip=True)

    def getName(self):
        """Return the text of the first ``<h2>`` on the page, or ``None`` if there is none."""
        heading = self.soup.find('h2')
        return heading.text if heading is not None else None

people = [Person(link) for link in links[:2]]

In [43]:
[p.name for p in people]

['doc. PhDr. Jozef Baruník Ph.D.', 'doc. PhDr. Michal Bauer Ph.D.']

In [39]:
#forgot to add an attribute to the constructor?
[p.soup.find('h2').text for p in people]

['doc. PhDr. Jozef Baruník Ph.D.', 'doc. PhDr. Michal Bauer Ph.D.']

In [44]:
for p_instance in people:
    p_instance.full_name = p_instance.soup.find('h2').text

In [45]:
people[0].full_name

'doc. PhDr. Jozef Baruník Ph.D.'

In [46]:
Person(links[5]).full_name

AttributeError: 'Person' object has no attribute 'full_name'