# In [None]:
#-------------------------------------------------------------------------------------------------------------------#
#ALGORITHMS TO DO THE FOLLOWING TASKS. It took 5 or 6 hours to get all the necessary data from these web pages.     #
#                                                                                                                   #
#  1. Browse the web https://dspace.cvut.cz/?locale-attribute=en and find out how to download data on               #
#     Bachelor and Master theses.                                                                                   #
#  2. Download or scrape the data such that for each thesis you know the following: Faculty name, department name,  #
#     thesis title, thesis type (bachelor/master), supervisor name, reviewer name, year (or date) of the defence,   #
#     study programme and discipline, link to a webpage with details.                                               #
#  3. Store these data in one _csv_ file (should be handed in along with this notebook).                            #
#-------------------------------------------------------------------------------------------------------------------#

import re
from io import StringIO
from urllib.request import urlopen

import numpy as np
import pandas as pd
import requests
import seaborn as sns
import sklearn as skit
from bs4 import BeautifulSoup

#--------------------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------------#
#--------FUNCTIONS--------FUNCTIONS--------FUNCTIONS--------FUNCTIONS--------FUNCTIONS--------FUNCTIONS--------#
#--------------------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------------#


def getTagFromPage(url, set_number):
    """Download *url* and return one of its artifact lists.

    The page is searched for ``<ul>`` elements whose class is
    ``ds-artifact-list list-unstyled``; the match at position *set_number*
    is returned.

    Args:
        url: Address of the page to download.
        set_number: Index of the wanted list among the matching ``<ul>``
            elements (0 is the first one found on the page).

    Returns:
        The selected ``bs4.element.Tag``, or ``None`` when the page has no
        matching list at that position.
    """
    # Close the HTTP response as soon as the document has been parsed
    # instead of leaving the connection open until garbage collection.
    with urlopen(url) as html_page:
        # Name the parser explicitly so every machine builds the same tree
        # and bs4 does not warn about a guessed parser on each call.
        soup = BeautifulSoup(html_page, "html.parser")

    # type is bs4.element.ResultSet
    result_set = soup.find_all('ul', attrs={'class': 'ds-artifact-list list-unstyled'})

    try:
        # type is bs4.element.Tag
        return result_set[set_number]
    except IndexError:
        # Fewer matching lists than requested (including none at all):
        # report "not found" instead of crashing the caller.
        return None
    
def takeValueOfColumnFromDataFrame(data_frame_name, column_name):
    """Return the first value stored in one column of a data frame.

    Args:
        data_frame_name: The pandas ``DataFrame`` to read from (despite the
            parameter name this is the frame itself, not its name).
        column_name: Label of the column to look up.

    Returns:
        The value in the first row of the column, or the string ``"N/A"``
        when the column does not exist or holds no rows.
    """
    try:
        return data_frame_name[column_name].values[0]
    except (KeyError, IndexError):
        # KeyError: the column is missing.
        # IndexError: the column exists but the frame has no rows.
        return "N/A"

def putDataIntoFrame(th, data_frame):
    """Download the detail page of one thesis and append it as a new row.

    The first HTML table of the page at ``th["link"]`` is read; its column 0
    is used as the field name (e.g. ``dc.contributor.advisor``) and the next
    column as the field's value. Fields that are not present are stored as
    ``"N/A"``.

    Args:
        th: Dict describing the thesis with the keys ``"name"``, ``"link"``,
            ``"type"``, ``"fac_name"`` and ``"dep_name"``.
        data_frame: Frame collected so far; its columns are expected to be
            the module-level ``listOfColumns``, which must be defined before
            this function is first called.

    Returns:
        A new ``DataFrame`` holding the rows of *data_frame* followed by the
        row for this thesis. The frame passed in is not modified.

    Raises:
        requests.RequestException: If the page cannot be downloaded within
            the timeout or the server answers with an HTTP error status.
    """
    # getting values
    name = th.get("name")
    link = th.get("link")
    th_type = th.get("type")
    fac = th.get("fac_name")
    dep = th.get("dep_name")

    # Without a timeout a single stalled connection would block the whole
    # multi-hour crawl forever.
    r = requests.get(link, timeout=60)
    # Fail loudly on 4xx/5xx instead of parsing an error page as thesis data.
    r.raise_for_status()
    # Decode the body as UTF-8 regardless of what the response declares.
    r.encoding = 'utf-8'

    # read_html treats a plain string as literal HTML only in older pandas
    # versions (deprecated); a file-like object works in all of them.
    ldf = pd.read_html(StringIO(r.text), flavor='html5lib')
    df = ldf[0].set_index([0])

    # After transposing, every field name is a column. Only the first row
    # (the column right after the field names) is used; any further columns
    # of the page's table are ignored. Slicing keeps this safe even when
    # the table has no further columns.
    dfT = df.transpose().iloc[:1]

    # prepare data to insert
    supervisor = takeValueOfColumnFromDataFrame(dfT, "dc.contributor.advisor")  # supervisor
    reviewer = takeValueOfColumnFromDataFrame(dfT, "dc.contributor.referee")  # reviewer
    date_of_def = takeValueOfColumnFromDataFrame(dfT, "dc.date.issued")  # date of defence
    s_prog = takeValueOfColumnFromDataFrame(dfT, "theses.degree.programme")  # study programme
    s_disc = takeValueOfColumnFromDataFrame(dfT, "theses.degree.discipline")  # study discipline

    # inserting
    df_temp = pd.DataFrame(
        [[fac, dep, name, th_type, supervisor, reviewer, date_of_def, s_prog, s_disc, link]],
        columns=listOfColumns,
    )
    return pd.concat([data_frame, df_temp], ignore_index=True)

#--------------------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------------#
#-----END OF FUNCTIONS-----END OF FUNCTIONS-----END OF FUNCTIONS-----END OF FUNCTIONS-----END OF FUNCTIONS-----#
#--------------------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------------#

### GETTING DATA ABOUT FACULTY NAME, DEPARTMENT NAME AND TYPE OF THESIS ###

# Page from which the crawl starts.
url = 'https://dspace.cvut.cz/handle/10467/3641'

# Site prefix; the hrefs found on the pages are appended to it.
mainPartOfAddress = "https://dspace.cvut.cz"

# First artifact list of the start page; its links are taken as faculties.
tag_with_faculties = getTagFromPage(url, 0)

# Without this list nothing can be crawled. Stop with a clear message
# instead of failing later with an AttributeError on None.
if tag_with_faculties is None:
    raise RuntimeError("No list of faculties found on page {}".format(url))

print()
print()
print("Start of algorithm. Time to get data about faculties' names!!!")
print()
print()

# One dict per faculty: its displayed name and the absolute link to its page.
# href=True skips anchors that carry no target address.
faculties = [
    {
        "name": link.string,
        "link": mainPartOfAddress + link.get('href'),
    }
    for link in tag_with_faculties.find_all('a', href=True)
]

print()
print()
print("Time to get data about departments' names!!!")
print()
print()

# Every department found, each remembering the faculty it was listed under.
departments = []
for faculty in faculties:
    fac_name, fac_link = faculty.get("name"), faculty.get("link")

    # The first artifact list of a faculty page holds its departments.
    tag_with_departments = getTagFromPage(fac_link, 0)

    # A faculty page without such a list contributes no departments.
    if tag_with_departments is not None:
        departments.extend(
            {
                "name": anchor.string,
                "link": mainPartOfAddress + anchor.get('href'),
                "fac_name": fac_name,
            }
            for anchor in tag_with_departments.findAll('a')
        )

print()
print()
print("Time to get data about types of theses!!!")
print()
print()

# One dict per bachelor/master collection of every department.
thesis_types = []
for i, department in enumerate(departments):

    # Progress indicator: show every 10th department index.
    if i % 10 == 0:
        print(i)

    dep_name = department.get("name")
    dep_link = department.get("link")
    fac_name = department.get("fac_name")

    tag_with_thesis_types = getTagFromPage(dep_link, 0)

    # Departments whose page has no artifact list are skipped.
    if tag_with_thesis_types is None:
        continue

    # href=True skips anchors that carry no target address.
    for link in tag_with_thesis_types.find_all('a', href=True):
        # link.string is None when the anchor contains nested markup;
        # treat that as an empty title so the prefix tests cannot fail.
        temp_str = link.string or ""

        # Only bachelor and master theses are stored; the type is recognised
        # by the beginning of the collection title (presumably Czech titles).
        if temp_str.startswith("Baka"):
            th_type = "Bachelor"
        elif temp_str.startswith("Diplomo"):
            th_type = "Master"
        else:
            continue

        thesis_types.append({
            "type": th_type,
            "link": mainPartOfAddress + link.get('href'),
            "fac_name": fac_name,
            "dep_name": dep_name,
        })
            
#-------------------------------------------------------------------------------------------

### GETTING REST OF THE DATA ABOUT THESIS FROM THE OWN PAGE OF THESIS WITH THEIR DETAILS ###

print()
print("------------------------------------------------")
print("Now it is time to get final data about theses!!!")
print("------------------------------------------------")
print()

# Basic facts (title, link, type, faculty, department) of every thesis visited.
theses = []
# Column layout of the result; putDataIntoFrame reads this module-level name.
listOfColumns = ['FACULTY_NAME','DEPARTMENT_NAME', 'THESIS_TITLE', 'THESIS_TYPE','SUPERVISOR_NAME','REVIEWER_NAME', 'DATE_OF_DEFENCE','STUDY_PROGRAMME','STUDY_DISCIPLINE','LINK_TO_WEBPAGE']
data_frame = pd.DataFrame(columns=listOfColumns)

# File name pattern; "{}" becomes a record count (checkpoints) or "all_data".
file_to_save = "pdd_1st_homework_wz_{}.csv"

j = 0  # number of theses already stored in data_frame

for i, thesis_type in enumerate(thesis_types, start=1):

    # Progress indicator: 1-based number of the collection being processed.
    print(i)

    th_type = thesis_type.get("type")
    th_type_link = thesis_type.get("link")
    fac_name = thesis_type.get("fac_name")
    dep_name = thesis_type.get("dep_name")

    offset = 0

    # One page lists only 20 theses, so walk through all pages.
    while True:

        # Build the page address with format arguments only, so braces in a
        # scraped link can never be mistaken for format placeholders.
        temp_th_type_link = "{}?offset={}".format(th_type_link, offset)

        # The theses are taken from the second artifact list of the page.
        tag_with_theses = getTagFromPage(temp_th_type_link, 1)

        if tag_with_theses is None:
            break

        # href=True skips anchors that carry no target address.
        thesis_links = tag_with_theses.find_all('a', href=True)

        # A page without any thesis means we are past the last page;
        # stopping here prevents an infinite loop.
        if not thesis_links:
            break

        for link in thesis_links:

            # Address of the thesis page that shows all details in a table.
            temp_addr = mainPartOfAddress + link.get('href') + "?show=full"

            thesis = {
                "name" : link.string,
                "link" : temp_addr,
                "type" : th_type,
                "fac_name" : fac_name,
                "dep_name" : dep_name
            }
            data_frame = putDataIntoFrame(thesis, data_frame)
            theses.append(thesis)

            # Count the record only after it is stored, so the numbers
            # printed and used in file names match the rows collected.
            j += 1

            # Progress indicator after every 20 stored theses.
            if j % 20 == 0:
                print("   {} link : {}".format(j, temp_th_type_link))

            # Checkpoint every 1000 records in case the run fails later;
            # the file named with j contains exactly j records.
            if j % 1000 == 0:
                temp_filename = file_to_save.format(j)
                data_frame.to_csv(temp_filename, sep='|', encoding='utf-8', index=False)
                print(temp_filename)

        offset += 20  # there are 20 records on one page

temp_filename = file_to_save.format("all_data")

# Same layout as the checkpoint files: no extra unnamed index column.
data_frame.to_csv(temp_filename, sep='|', encoding='utf-8', index=False)

print('successfully saved ',temp_filename)

print()
print("--------------------")
print("END OF ALGORITHM !!!")
print("--------------------")
print()