## Getting data from Hinton's website

In [1]:
import requests
from bs4 import BeautifulSoup

URL = "http://www.cs.toronto.edu/~hinton/papers.html"
URL_PAPERS = "http://www.cs.toronto.edu/~hinton/"

# Fetch the publication list. The timeout keeps the cell from hanging forever
# on a stalled connection, and a non-200 answer stops the cell right here
# instead of surfacing further down as a NameError on a never-assigned `soup`.
response = requests.get(URL, timeout=30)
if response.status_code != 200:
    raise RuntimeError(
        "Could not load {} (HTTP status {})".format(URL, response.status_code)
    )
print("Correctly loaded website\nExtracting source code...")
src = response.text
# NOTE(review): no parser is named, so BeautifulSoup uses whichever parser is
# installed. The positional access `row.contents[2]` below depends on the exact
# tree that parser builds - pin a parser once it is confirmed which one this
# notebook was developed against.
soup = BeautifulSoup(src)
print("Finished!")
print("Extracting papers...\n")

# Each entry of `papers` is [year, authors, title, paper_url], all strings.
papers = []
table = soup.find('table')
if table is None:
    raise RuntimeError("No <table> element found on " + URL)
raw_papers = table.find_all('tr')

skipped_rows = 0
for row in raw_papers:
    cells = row.select('td')
    if len(cells) < 2:
        # A row with fewer than two <td> cells has no year/description pair to
        # read; count it and move on instead of failing with an IndexError.
        skipped_rows += 1
        continue
    year = cells[0].text

    # The title is the bold part of the second cell, when there is one.
    title_tag = cells[1].b
    if title_tag is not None:
        title = " ".join(title_tag.text.split())
    else:
        title = "title_missing"

    # presumably the row's third child is the description cell and its first
    # child is the author text - verify against the page markup.
    authors = row.contents[2].contents[0]
    authors = " ".join(authors.split())

    # First link in the row that carries an href, or the "missing" marker.
    link = row.find('a', href=True)
    paper_url = link.attrs['href'] if link is not None else "missing"

    papers.append([str(year), str(authors), str(title), str(paper_url)])
if skipped_rows:
    print("Skipped rows without paper data:", skipped_rows)
print("Finished preprocessing articles!")
print("Removing whitespaces from the name and title fields...")

# Collapse whitespace in the year, then make authors and titles usable as file
# names: drop line breaks, commas and dots, and turn spaces into underscores.
for paper in papers:
    paper[0] = " ".join(paper[0].split())
    for old, new in (("\n", ""), ("\r", ""), (" ", "_"), (",", ""), (".", "")):
        paper[1] = paper[1].replace(old, new)
        paper[2] = paper[2].replace(old, new)
print('\nAn example of the paper from our list:\n1.', papers[0], "\n2.", papers[1])

Correctly loaded website
Extracting source code...
Finished!
Extracting papers...

Finished preprocessing articles!
Removing whitespaces from the name and title fields...

An example of the paper from our list:
1. ['2018', 'Hinton_G_E_Sabour_S_and_Frosst_N', 'Matrix_Capsules_with_EM_Routing', 'absps/EMcapsules.pdf'] 
2. ['2018', 'Kiros_J_R_Chan_W_and_Hinton_G_E', 'Illustrative_Language_Understanding:_Large-Scale_Visual_Grounding_with_Image_Search', 'absps/picturebook.pdf']


## Folder management...
End-goal Structure:
- hinton/
    - 2018/title_name.pdf
    - 2017/title_name.pdf
    - 2016/title_name.pdf
    - ...
    - 1986/title_name.pdf
   
So our application has to build each file's save path from the 'year' field of its entry in our *papers* list, so that every paper is written into the folder for its year.

## Creating empty folders

In [2]:
import os
root_path = 'hinton/'

# Every distinct publication year gets its own folder below the root; the set
# comprehension collapses the repeated years into one entry each.
years = {entry[0] for entry in papers}

for year in years:
    # exist_ok=True makes re-running the cell harmless once the folders exist.
    os.makedirs(root_path + year, exist_ok=True)

## Now, having all folders created for us, we can proceed to downloading the pdfs

In [143]:
# Download every paper that has a usable link.
#   - "missing" links, and absolute ("http...") links without "pdf" in them,
#     are reported and skipped;
#   - absolute links containing "pdf" are fetched as they are;
#   - every other link is relative and is resolved against URL_PAPERS.
# Files are stored as <root_path><year>/<authors>-<title><extension>.

# Extensions kept as-is when the link ends with one of them; ".ps.gz" comes
# before ".ps" so the longer suffix wins.
KNOWN_EXTENSIONS = (".ps.gz", ".pdf", ".ps", ".html", ".htm")


def _extension_for(link):
    """Return the extension to save `link` under.

    The last path segment of the link (query string and fragment removed) is
    matched case-insensitively against KNOWN_EXTENSIONS; anything else falls
    back to ".pdf", the extension used for every file before.
    """
    name = link.split("?", 1)[0].split("#", 1)[0].rsplit("/", 1)[-1].lower()
    for extension in KNOWN_EXTENSIONS:
        if name.endswith(extension):
            return extension
    return ".pdf"


for paper in papers:
    link = paper[3]
    if ("http" in link and "pdf" not in link) or link == 'missing':
        print("Can't download - ", paper, "\n")
        continue

    # Absolute PDF links used to be assigned but never fetched, because the
    # download code sat inside the relative-link branch only.
    download_url = link if "http" in link else URL_PAPERS + link

    # Authors and title are scraped text: neutralise path separators so the
    # file always lands directly inside its year folder.
    file_name = (paper[1] + "-" + paper[2]).replace("/", "_").replace("\\", "_")
    save_path = root_path + paper[0] + "/" + file_name + _extension_for(link)

    try:
        file = requests.get(download_url, timeout=60)
    except requests.exceptions.RequestException as error:
        # One unreachable host must not abort the remaining downloads.
        print("Can't download - ", paper, "\n", error)
        continue

    if file.status_code == 200:
        with open(save_path, "wb") as output:
            output.write(file.content)
        print("Downloaded: ", paper)
    else:
        print("File does not exist!")
print("Finished downloading!")

Downloaded:  ['2018', 'Hinton_G_E_Sabour_S_and_Frosst_N', 'Matrix_Capsules_with_EM_Routing', 'absps/EMcapsules.pdf']
Downloaded:  ['2018', 'Kiros_J_R_Chan_W_and_Hinton_G_E', 'Illustrative_Language_Understanding:_Large-Scale_Visual_Grounding_with_Image_Search', 'absps/picturebook.pdf']
Downloaded:  ['2018', 'Anil_R_Pereyra_G_Passos_A_Ormandi_R_Dahl_G_and_Hinton_G_E', 'Large_scale_distributed_neural_network_training_through_online_distillation', 'absps/OnlineDistillation.pdf']
Downloaded:  ['2018', 'Guan_M_Y_Gulshan_V_Dai_A_M_and_Hinton_G_E', 'Who_Said_What:_Modeling_Individual_Labelers_Improves_Classification', 'absps/WhoSaidWhat.pdf']
Downloaded:  ['2017', 'Sabour_S_Frosst_N_and_Hinton_G_E', 'Dynamic_Routing_between_Capsules', 'absps/DynamicRouting.pdf']
Downloaded:  ['2017', 'Shazeer_N_Mirhoseini_A_Maziarz_K_Davis_A_Le_Q_Hinton_G_&_Dean_J', 'Outrageously_large_neural_networks:_The_sparsely-gated_mixture-of-experts_layer', 'absps/Outrageously.pdf']
Downloaded:  ['2017', 'Frosst_N_and_H

Downloaded:  ['2010', 'Hinton_G_E', 'Learning_to_represent_visual_input', 'absps/Philtrans2010.pdf']
Downloaded:  ['2010', 'Mnih_V_and_Hinton_G_E', 'Learning_to_detect_roads_in_high-resolution_aerial_images', 'absps/road_detection.pdf']
Can't download -  ['2010', 'Sutskever_I_and_Hinton_G_E', 'Temporal_Kernel_Recurrent_Neural_Networks', 'http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6T08-4XMD5HV-2&_user=994540&_coverDate=03%2F31%2F2010&_rdoc=1&_fmt=high&_orig=search&_sort=d&_docanchor=&view=c&_acct=C000050024&_version=1&_urlVersion=0&_userid=994540&md5=9b78a6369ae8a8bdbdca443c8277de76'] 

Downloaded:  ['2010', 'Ranzato_M_and_Hinton_G_E', 'Modeling_pixel_means_and_covariances_using_factored_third-order_Boltzmann_machines', 'absps/ranzato_cvpr2010.pdf']
Downloaded:  ['2010', 'Taylor_G_Sigal_L_Fleet_D_and_Hinton_G_E', 'Dynamic_binary_latent_variable_models_for_3D_human_pose_tracking', 'absps/gwtaylor_cvpr2010.pdf']
Downloaded:  ['2010', 'Ranzato_M_Krizhevsky_A_and_Hinton_G_E',

Downloaded:  ['2003', 'Hinton_G_E_and_Roweis_S', 'Stochastic_Neighbor_Embedding', 'absps/sne.ps.gz']
Downloaded:  ['2003', 'Welling_M_Zemel_R_S_and_Hinton_G_E', 'Efficient_parametric_projection_pursuit_density_estimation', 'absps/ppp.ps.gz']
Downloaded:  ['2003', 'Welling_M_Zemel_R_and_Hinton_G_E', 'Self-Supervised_Boosting', 'absps/UboostNips.pdf']
Downloaded:  ['2003', 'Welling_M_Hinton_G_E_and_Osindero_S', 'Learning_Sparse_Topographic_Representations_with_Products_of_Student-t_Distributions', 'absps/PoT.ps.gz']
Downloaded:  ['2003', 'Hinton_G_E', 'The_ups_and_downs_of_Hebb_synapses', 'absps/hebbdot.pdf']
Downloaded:  ['2002', 'Hinton_G_E_(2002)', 'Training_Products_of_Experts_by_Minimizing_Contrastive_Divergence', 'absps/nccd.pdf']
Downloaded:  ['2002', 'Friston_KJ_Penny_W_Phillips_C_Kiebel_S_Hinton_G_E_and_Ashburner_J', 'Classical_and_Bayesian_Inference_in_Neuroimaging:_Theory', 'absps/neuroimaging.pdf']
Downloaded:  ['2002', 'Brown_A_D_and_Hinton_G_E', "Relative_Density_Nets:_A_Ne

Downloaded:  ['1995', 'Williams_C_K_I_Hinton_G_E_and_Revow_M', 'Using_a_neural_net_to_instantiate_a_deformable_model', 'absps/nips-ckiw.html']
Downloaded:  ['1995', 'Xu_L_Jordan_M_I_and_Hinton_G~E', 'An_alternative_model_for_mixtures_of_experts', 'absps/xu.pdf']
Downloaded:  ['1995', 'Fels_S_S_and_Hinton_G~E', 'GloveTalkII:_Mapping_hand_gestures_to_speech_using_neural_networks', 'absps/glove2nips.pdf']
Downloaded:  ['1995', 'Zemel_R_S_and_Hinton_G_E', 'Learning_population_codes_by_minimizing_description_length', 'absps/mdlpop.html']
Downloaded:  ['1995', 'Hinton_G_E_Dayan_P_Frey_B_J_and_Neal_R', 'The_wake-sleep_algorithm_for_unsupervised_Neural_Networks', 'absps/ws.htm']
Downloaded:  ['1995', 'Dayan_P_Hinton_G_E_Neal_R_and_Zemel_R_S', 'The_Helmholtz_machine', 'absps/helmholtz.htm']
Downloaded:  ['1995', 'Hinton_G_E_Dayan_P_To_A_and_Neal_R_M', 'The_Helmholtz_machine_through_time', 'absps/hmtt.htm']
Downloaded:  ['1995', 'Hinton_G_E_and_Frey_B_J', 'Using_neural_networks_to_monitor_for_ra

Downloaded:  ['1986', 'Hinton_G_E_and_Sejnowski_T_J', 'Learning_and_relearning_in_Boltzmann_machines', 'absps/pdp7.pdf']
Downloaded:  ['1986', 'Rumelhart_D_E_Hinton_G_E_and_Williams_R_J', 'Learning_internal_representations_by_error_propagation', 'absps/pdp8.pdf']
Downloaded:  ['1986', 'Rumelhart_D_E_Smolensky_P_McClelland_J_L_and_Hinton_G_E', 'Parallel_distributed_models_of_schemata_and_sequential_thought_processes', 'absps/pdp14.pdf']
Downloaded:  ['1986', 'Pearlmutter_B_A_and_Hinton_G_E', 'G-maximization:_An_unsupervised_learning_procedure_for_discovering_regularities', 'absps/gmax.pdf']
Downloaded:  ['1985', 'Ackley_D_H_Hinton_G_E_and_Sejnowski_T_J', 'A_learning_algorithm_for_Boltzmann_machines', 'absps/cogscibm.pdf']
Downloaded:  ['1985', 'Touretzky_D_S_and_Hinton_G_E', 'Symbols_among_the_neurons:_Details_of_a_connectionist_inference_architecture', 'absps/symbolsIJCAI.pdf']
Downloaded:  ['1985', 'Hinton_GE_and_Lang_KJ', 'Shape_recognition_and_illusory_conjunctions', 'absps/illusory