In [None]:
import re
from time import sleep
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from IES_Downloader import IES_Downloader

# Lecture 6 - IES Web scraper

by Vítek Macháček

March 23, 2020

* Putting it all together
* OOP + Pandas + Requests + Scraping


## Object-oriented Programming
* Brief reminder of how objects work

### Docstring + Objects + Inheriting + Constructors

![Simple Object Structure](./img/class_example.png)

In [None]:
class Person:
    '''
    A Person class is *Abstract* - Intended not to be used directly, but rather to be inherited.

    It stores a validated name and e-mail. Subclasses are expected to override describe().
    '''
    def __init__(self,name,email):
        '''
        Person's constructor accepts name and e-mail and sets them as attributes,
        but first it checks whether they are in a valid format.

        Raises PersonException when the name or the e-mail does not pass validation.
        '''
        if not self._verify_name(name):
            raise PersonException('Name is invalid')
        self.name = name

        if not self._verify_email(email):
            raise PersonException('Email is invalid')
        self.email = email

    def _verify_name(self,name):
        '''
        Method for ensuring that name is a non-empty string.

        Returns True/False; never raises, whatever the type of `name` is.
        '''
        # The type is checked first so that len() is never called on None, numbers etc.
        return isinstance(name, str) and len(name) > 0

    def _verify_email(self,email):
        '''
        Method validating that email is a string that looks like an e-mail address.

        Returns True/False; never raises, whatever the type of `email` is.
        '''
        if not isinstance(email, str):
            return False

        # Raw string, so that `\.` reaches the regex engine as an escaped dot.
        email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+'
        # fullmatch must consume the whole string, so a trailing newline is rejected as well.
        return re.fullmatch(email_pattern, email) is not None

    def _verify_courses(self,courses):
        '''
        Is list of courses a list containing valid course idents?

        A valid ident is a six-character string starting with 'JEM' or 'JEB'.
        An empty list is considered valid.
        '''
        def _verify_course(course):
            return (
                isinstance(course, str)
                and len(course) == 6
                and course.startswith(('JEM', 'JEB'))
            )

        return isinstance(courses, list) and all(_verify_course(course) for course in courses)

    def describe(self):
        '''
        Placeholder to be overridden in subclasses; always raises PersonException here.
        '''
        raise PersonException("Cannot describe parent Person class. Override this in Teacher or Student class")

    def get_name(self):
        '''
        Return the name. Raises PersonException when no (non-empty) name is set.
        '''
        name = getattr(self, 'name', None)
        if not name:
            raise PersonException('No name defined')
        return name

    def get_email(self):
        '''
        Return the e-mail. Raises PersonException when no (non-empty) e-mail is set.
        '''
        email = getattr(self, 'email', None)
        if not email:
            raise PersonException('No email defined')
        return email
    
    
class PersonException(Exception):
    '''Error type for Person related problems, e.g. an invalid name or e-mail.'''

class TeacherException(Exception):
    '''Error type for Teacher related problems, e.g. courses that cannot be validated.'''

class StudentException(Exception):
    '''Error type for Student related problems, e.g. courses that cannot be validated.'''

someone = Person('Name','email@somewhere.com')

In [None]:
someone

In [None]:
# Person is meant to be inherited from, so describe() raises on purpose.
# Catch the error and show it, so that the demonstration does not abort "Run All".
try:
    someone.describe()
except PersonException as err:
    print('{}: {}'.format(type(err).__name__, err))

In [None]:
 class Teacher(Person):
    '''
    A Person who teaches: keeps the name, the e-mail and the list of courses taught.
    '''
    def __init__(self,name,email,teaching_courses):
        '''
        Run the Person constructor first, then check and store the taught courses.

        Raises TeacherException when the courses do not pass _verify_courses.
        '''
        super().__init__(name,email)

        if not self._verify_courses(teaching_courses):
            raise TeacherException('Cannot validate courses.')
        self.teaching_courses = teaching_courses

    def describe(self):
        """ Print a one-line introduction of the teacher (overrides the parent method). """
        template = 'I am {}, my email is {} and I teach following courses: {}'
        print(template.format(self.name,self.email,self.teaching_courses))
    
        
class Student(Person):
    '''
    A Person who studies: keeps the name, the e-mail and the list of courses studied.
    '''
    def __init__(self,name,email,studying_courses):
        '''
        Run the Person constructor first, then check and store the studied courses.

        Raises StudentException when the courses do not pass _verify_courses.
        '''
        super().__init__(name,email)

        if not self._verify_courses(studying_courses):
            raise StudentException('Cannot validate courses.')
        self.studying_courses = studying_courses

    def describe(self):
        """ Print a one-line introduction of the student (overrides the parent method). """
        template = "I am Vítek's student. My name is not important, although I have one. It will be important during the final evaluation though. I study following courses: {}"
        print(template.format(self.studying_courses))
        
vitek = Teacher('Vítek','vitezkytek@gmail.com',['JEM207'])

vitek.describe()

In [None]:
# 'JEM20' has only five characters, so the course validation rejects it.
# Show the error without aborting the notebook ...
try:
    Student('Honza','honza@fsv.cuni.cz',['JEM20','JEB111'])
except StudentException as err:
    print('{}: {}'.format(type(err).__name__, err))

# ... and create a valid student, so that `my_student` exists for the following cells.
my_student = Student('Honza','honza@fsv.cuni.cz',['JEM207','JEB111'])
my_student.describe()

In [None]:
my_student.get_name()

In [None]:
?Person

In [None]:
?Teacher

### Robots.txt

* Is it OK to scrape?
* Guidance for search engines etc.


https://www.promptcloud.com/blog/how-to-read-and-respect-robots-file/

In [None]:
requests.get('http://ies.fsv.cuni.cz/robots.txt')

In [None]:
print(requests.get('http://sreality.cz/robots.txt').text)

## Task:
* A parser of the IES website with the following features:
    * All info about people from [Internal faculty](http://ies.fsv.cuni.cz/en/node/48), [External lecturers](http://ies.fsv.cuni.cz/en/node/49), [Ph.D. candidates](http://ies.fsv.cuni.cz/en/node/51) and [Administration](http://ies.fsv.cuni.cz/en/node/50)
    * All info about [all](http://ies.fsv.cuni.cz/en/node/109) theses between 1994 and 2019 – this won't be covered, as we have problems with the website
    * Also all courses! But no list of courses is available ...

## Pages

### Find all persons?
[Current faculty](http://ies.fsv.cuni.cz/en/node/48)

1. Understand the structure of the website

In [None]:
def getSoup(link, delay=0.1, timeout=30):
    '''
    Download a page and parse it with BeautifulSoup.

    link: URL of the page to download
    delay: number of seconds to sleep before the request (to be kind to the website)
    timeout: number of seconds passed to requests.get as `timeout`, so that an unresponsive
             server raises an exception instead of blocking the notebook indefinitely

    Returns the BeautifulSoup object built by the 'lxml' parser from the response text,
    which is decoded as UTF-8. The HTTP status code is not checked.
    '''
    sleep(delay) #to be kind to the website
    r = requests.get(link, timeout=timeout)
    r.encoding = 'UTF-8'
    return BeautifulSoup(r.text,'lxml')

In [None]:
soup = getSoup('https://ies.fsv.cuni.cz/en/node/48')

In [None]:
names = soup.findAll('td', {'class':'peopleTableCellName'})
names

In [None]:
def getAllLinks(link, base_url='https://ies.fsv.cuni.cz'):
    '''
    Collect the links to personal pages from a people-list page.

    link: URL of the page with the table of people
    base_url: address the hrefs found in the table are resolved against

    Returns a list of absolute URLs, one for every <td class="peopleTableCellName">
    that contains an <a> with an href. Cells without such an anchor are skipped.
    '''
    soup = getSoup(link)
    links = []
    for td in soup.find_all('td', {'class':'peopleTableCellName'}):
        anchor = td.find('a')
        if anchor is None or not anchor.get('href'):
            continue
        # urljoin copes with root-relative, relative and already absolute hrefs alike
        links.append(urljoin(base_url, anchor['href']))
    return links

links = getAllLinks('http://ies.fsv.cuni.cz/en/node/48')
links

## Person's characteristics?

[A TEACHER!](https://ies.fsv.cuni.cz/en/staff/barunik)

In [None]:
def getName(link):
    '''
    Download the page at `link` and return the text of its first <h2> element,
    stripped of surrounding whitespace.
    '''
    heading = getSoup(link).find('h2')
    return heading.text.strip()
if True:
    names = [getName(link) for link in links]
    names

In [None]:
names

In [None]:
soup = getSoup(links[0])
node = soup.find('strong',text='Phone:')
node.next_sibling.strip()

In [None]:
def getNextSiblingOfStrong(soup,characteristic):
    '''
    Return the text that directly follows <strong>characteristic</strong> in `soup`.

    soup: parsed page (BeautifulSoup object)
    characteristic: exact text of the <strong> label, e.g. 'Phone:'

    Returns the stripped text, or None when the label is not on the page or is not
    directly followed by a piece of text.
    '''
    strong = soup.find('strong',string=characteristic)
    if strong is None:
        return None

    sibling = strong.next_sibling
    # Text nodes are str subclasses; anything else (no sibling, another tag) has no text to strip
    if not isinstance(sibling, str):
        return None
    return sibling.strip()
getNextSiblingOfStrong(soup,'Phone:')

In [None]:
def getMoreCharacteristics(link, characteristics):
    '''
    Download the page at `link` and look up several labelled values on it.

    characteristics: labels as they appear in the <strong> tags, e.g. ['Phone:','Office:']

    Returns a pandas Series indexed by the labels with every ':' removed.
    '''
    soup = getSoup(link)
    values = {}
    for label in characteristics:
        values[label.replace(':','')] = getNextSiblingOfStrong(soup,label)
    return pd.Series(values)

#[getMoreCharacteristics(link,['Phone:','Office:','Position:']) for link in links[:2]]
getMoreCharacteristics(links[0],['Phone:','Office:','Position:'])

* Let's do an object!

In [None]:
class Person:
    '''
    One person scraped from a personal page: downloads the page once and keeps
    the parsed page (soup) together with the extracted name, office and phone.

    NOTE: this definition rebinds the name `Person`, so from here on it replaces the
    validating Person class defined earlier in the notebook.
    '''
    def __init__(self,link):
        '''
        Download and parse the page at `link`, then extract name, office and phone.
        Values that cannot be found on the page are stored as None.
        '''
        self.soup = getSoup(link)
        self.name = self.getName()
        self.office = self.getNextSiblingOfStrong('Office:')
        self.phone = self.getNextSiblingOfStrong('Phone:')

    def getNextSiblingOfStrong(self, characteristic):
        '''
        Return the stripped text directly following <strong>characteristic</strong>,
        or None when the label is missing or is not followed by a piece of text.
        '''
        strong = self.soup.find('strong',string=characteristic)
        if strong is None:
            return None

        sibling = strong.next_sibling
        # Text nodes are str subclasses; anything else (no sibling, another tag) has no text to strip
        if not isinstance(sibling, str):
            return None
        return sibling.strip()

    def getName(self):
        '''
        Return the stripped text of the first <h2> on the page, or None when there is none.
        The unprocessed heading stays available through self.soup.
        '''
        heading = self.soup.find('h2')
        if heading is None:
            return None
        return heading.text.strip()

    def getCharacteristics(self):
        '''
        Return name, office and phone as a pandas Series (one row of the final table).
        '''
        return pd.Series({
            'name':self.name,
            'office':self.office,
            'phone':self.phone
        })

people = [Person(link) for link in links[:2]]

In [None]:
[p.name for p in people]

In [None]:
#We still have raw data if needed!
[p.soup.find('h2').text for p in people]

In [None]:
pd.DataFrame([p.getCharacteristics() for p in people])