In [1]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from IPython.core.debugger import set_trace

In [6]:
class Researcher():
    """Scraped profile of a single researcher.

    Creating an instance downloads ``link`` and parses the page immediately.

    Attributes:
        link: URL the profile was downloaded from.
        request: the ``requests`` response obtained for ``link``.
        soup: ``BeautifulSoup`` tree built from the response text.
        name: stripped text of the first ``<h2>``, or ``None`` if the page
            has no ``<h2>``.
        characteristics: ``pd.Series`` of profile fields keyed by their page
            label; ``parseTheses`` adds the thesis counters to it.
        courses: ``pd.DataFrame`` with columns role, name, researcher, ident.
        theses: ``pd.DataFrame`` with columns name, link, type, researcher.
    """

    # Column layout of the frame returned by parseTheses (also when empty).
    _THESIS_COLUMNS = ['name', 'link', 'type', 'researcher']

    def __init__(self, link, timeout=30):
        """Download ``link`` and parse every section of the profile.

        Args:
            link: URL of the researcher's profile page.
            timeout: seconds to wait for the server before ``requests`` gives
                up, so one stalled page cannot hang a whole crawl.

        Raises:
            requests.RequestException: on a timeout, a connection failure or
                an HTTP error status.
        """
        self.link = link
        self.request = requests.get(link, timeout=timeout)
        # Fail loudly instead of parsing an error page as if it were a profile.
        self.request.raise_for_status()
        self.request.encoding = 'UTF-8'
        self.soup = bs(self.request.text, 'lxml')
        self.name = self.parseH2()
        # parseTheses writes into self.characteristics, so it must run after
        # parseCharacteristics.
        self.characteristics = self.parseCharacteristics()
        self.courses = self.parseCourses()
        self.theses = self.parseTheses()

    def _findByText(self, tag, label):
        """Return the first ``tag`` element whose text is ``label``, or None."""
        return self.soup.find(tag, text=label)

    @staticmethod
    def _nodeText(node):
        """Return the stripped text of a text node or tag; None stays None."""
        if node is None:
            return None
        if isinstance(node, str):
            # Text nodes are str subclasses, so a plain strip is enough.
            return node.strip()
        # A tag has no usable .strip(); take its text content instead.
        return node.get_text().strip()

    def parseCharacteristics(self):
        """Collect the labelled profile fields into one ``pd.Series``.

        Labels that are not present on the page are simply left out.
        """
        # Fields whose value directly follows the <strong> label.
        nextSiblings = ['Akademická funkce:', 'Odborné zaměření:', 'Členství:',
                        'Kancelář:', 'Email:', 'Telefon:', 'Konzultační hodiny:']
        ser_nextSiblings = self.parseNextSiblings(nextSiblings)

        # Fields whose value follows the parent of the <strong> label.
        parentsNextSiblings = ['Člen organizací', 'Vzdělání', 'Odborná praxe',
                               'Veřejné aktivity']
        ser_parentsNextSiblings = self.parseParentsNextSibling(parentsNextSiblings)

        return pd.concat([ser_nextSiblings, ser_parentsNextSiblings])

    def parseH2(self):
        """Return the stripped text of the first ``<h2>``, or None if absent."""
        heading = self.soup.find('h2')
        if heading is None:
            return None
        return heading.text.strip()

    def parseNextSiblings(self, attrs):
        """Read the value that directly follows each ``<strong>`` label.

        Args:
            attrs: iterable of label texts to look up.

        Returns:
            ``pd.Series`` keyed by the labels that were found. The value is
            the stripped text of the label's next sibling, or None when the
            label has no next sibling.
        """
        d = {}
        for attr in attrs:
            el = self._findByText('strong', attr)
            if not el:
                continue
            d[attr] = self._nodeText(el.next_sibling)
        # Explicit dtype keeps an empty result consistent with a filled one.
        return pd.Series(d, dtype=object)

    def parseParentsNextSibling(self, attrs):
        """Fetch the node that follows the parent of each ``<strong>`` label.

        Args:
            attrs: iterable of label texts to look up.

        Returns:
            ``pd.Series`` keyed by the labels that were found; each value is
            the parsed node itself (``el.parent.next_sibling``), not its text.
        """
        d = {}
        for key in attrs:
            el = self._findByText('strong', key)
            if el:
                d[key] = el.parent.next_sibling

        return pd.Series(d, dtype=object)

    def parseCourses(self):
        """Parse the course list found under the 'Předměty' heading.

        Returns:
            ``pd.DataFrame`` with columns role, name, researcher, ident; it
            has no rows when the heading is missing.
        """
        heading = self._findByText("h3", 'Předměty')
        if not heading:
            return pd.DataFrame({'role': [], 'name': [], 'researcher': [], 'ident': []})

        # Gather the markup between this heading and the next <h3>.
        parts = []
        for tag in heading.next_siblings:
            if tag.name == "h3":
                break
            parts.append(str(tag))
        courses_soup = bs(''.join(parts), 'lxml')

        roles, names, idents = [], [], []
        for course in courses_soup.select('a[href*=syllab]'):
            # The role is the nearest <h4> preceding the link's parent.
            role_tag = course.parent.find_previous_sibling('h4')
            roles.append(role_tag.text if role_tag is not None else None)

            # Link text reads "IDENT - Name". Split on the first separator
            # only, so a name that itself contains " - " is kept whole.
            ident, sep, title = course.text.partition(' - ')
            if not sep:
                # No separator at all: keep the whole text as the name.
                ident, title = '', ident
            idents.append(ident.strip())
            names.append(title.strip())

        researchers = [self.name] * len(names)
        return pd.DataFrame({'role': roles, 'name': names,
                             'researcher': researchers, 'ident': idents})

    def parseTheses(self):
        """Parse supervised bachelor and diploma theses.

        For every thesis heading found, the two numbers of the "a/b" counter
        are stored in ``self.characteristics`` under ``<type>_vše`` and
        ``<type>_oceněné``. Thesis links are collected only when the second
        number is positive.

        Returns:
            ``pd.DataFrame`` with columns name, link, type, researcher; it
            has no rows when nothing was collected.

        Raises:
            AttributeError, IndexError, ValueError: when a heading is present
                but the markup after it does not have the expected layout.
        """
        d = {'bakalářské': 'Vedoucí bakalářských prací',
             'diplomové': 'Vedoucí diplomových prací'}
        frames = []
        for key, heading in d.items():
            el = self._findByText('h3', heading)
            if not el:
                continue

            # The thesis block is the second node after the heading.
            block = el.next_sibling.next_sibling
            counts = block.find('strong').next_sibling.strip()
            parts = counts.split('/')
            total = int(parts[0])
            marked = int(parts[1])
            self.characteristics.loc[key + '_vše'] = total
            self.characteristics.loc[key + '_oceněné'] = marked

            if marked > 0:
                works = block.select('a[href*=work]')
                frames.append(pd.DataFrame({
                    'name': [work.text for work in works],
                    'link': [work['href'] for work in works],
                    'type': [key] * len(works),
                    'researcher': [self.name] * len(works),
                }))

        if not frames:
            # Same columns as a filled result, mirroring parseCourses.
            return pd.DataFrame(columns=self._THESIS_COLUMNS)
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(frames)

In [18]:
# Crawl the staff listing and build one Researcher per linked profile page.
BASE_URL = 'http://ies.fsv.cuni.cz/'
STAFF_LIST_URL = 'http://ies.fsv.cuni.cz/cs/node/48'
REQUEST_TIMEOUT = 30  # seconds to wait for the listing page
CRAWL_DELAY = 0.5     # pause between profile requests, in seconds

r = requests.get(STAFF_LIST_URL, timeout=REQUEST_TIMEOUT)
# Stop here on an HTTP error rather than scraping an error page.
r.raise_for_status()
r.encoding = 'UTF-8'
soup = bs(r.text,'lxml')
tds = soup.find_all('td',{'class':'peopleTableCellName'})
# urljoin resolves relative, root-relative and absolute hrefs alike; cells
# without a usable link are skipped.
links = [urljoin(BASE_URL, td.a['href']) for td in tds
         if td.a is not None and td.a.get('href')]

if not links:
    # pd.concat below would otherwise fail with an unhelpful message.
    raise ValueError('No researcher links found on ' + STAFF_LIST_URL)

researchers = []
lcourses = []
ltheses = []
for i, link in enumerate(links):
    res = Researcher(link)
    researchers.append(res)
    lcourses.append(res.courses)
    ltheses.append(res.theses)
    print('Success: {},({})'.format(res.name,i))
    # Be polite to the server between profile requests.
    time.sleep(CRAWL_DELAY)

# One combined table per kind, with rows from every researcher.
courses = pd.concat(lcourses)
theses = pd.concat(ltheses)
    

Success: doc. PhDr. Jozef Baruník Ph.D.,(0)
Success: doc. PhDr. Michal Bauer Ph.D.,(1)
Success: PhDr. Jaromír Baxa Ph.D.,(2)
Success: PhDr. Lucie Bryndová ,(3)
Success:  Martin Burda M.A., Ph.D.,(4)
Success: doc. Ing. Tomáš Cahlík CSc.,(5)
Success: RNDr. Michal Červinka Ph.D.,(6)
Success: doc. PhDr. Julie Chytilová Ph.D.,(7)
Success: prof. Ing. Oldřich Dědek CSc.,(8)
Success: doc. PhDr. Ing. Antonie Doležalová Ph.D.,(9)
Success: doc. PhDr. Adam Geršl Ph.D.,(10)
Success: doc. PhDr. Martin Gregor Ph.D.,(11)
Success: doc. PhDr. Tomáš Havránek Ph.D.,(12)
Success: PhDr. Zuzana Havránková Ph.D.,(13)
Success: PhDr. Michal Hlaváček Ph.D.,(14)
Success: Ing. Monika Hollmannová ,(15)
Success: doc. Mgr. Tomáš Holub Ph.D.,(16)
Success: prof. Roman Horváth Ph.D.,(17)
Success: doc. PhDr. Ing. Ing. Petr Jakubík Ph.D. Ph.D.,(18)
Success: prof. Ing. Karel Janda M.A., Dr., Ph.D.,(19)
Success:  Petr Janský Ph.D.,(20)
Success: PhDr. Jiří Kameníček CSc.,(21)
Success: Ing. Irena Kemény ,(22)
Success: prof. I

In [19]:
# Display the combined course table assembled with pd.concat(lcourses) above.
courses

Unnamed: 0,ident,name,researcher,role
0,JEM005,Advanced Econometrics,doc. PhDr. Jozef Baruník Ph.D.,Garant
1,JED414,Kvantitativní metody I,doc. PhDr. Jozef Baruník Ph.D.,Garant
2,JED415,Kvantitativní metody II,doc. PhDr. Jozef Baruník Ph.D.,Garant
3,"JED412,413",Nonlinear Dynamic Economic Systems: Theory and...,doc. PhDr. Jozef Baruník Ph.D.,Garant
4,JEM059,Quantitative Finance I,doc. PhDr. Jozef Baruník Ph.D.,Garant
5,JEM061,Quantitative Finance II,doc. PhDr. Jozef Baruník Ph.D.,Garant
6,JEM005,Advanced Econometrics,doc. PhDr. Jozef Baruník Ph.D.,Vyučující
7,JEM116,Applied Econometrics,doc. PhDr. Jozef Baruník Ph.D.,Vyučující
8,JED414,Kvantitativní metody I,doc. PhDr. Jozef Baruník Ph.D.,Vyučující
9,JED415,Kvantitativní metody II,doc. PhDr. Jozef Baruník Ph.D.,Vyučující
