In [1]:
import json
import os
import pickle
import random
from collections import defaultdict

import requests
from bs4 import BeautifulSoup
# pip install lxml

# debug
import sys
from pympler import asizeof

## Graphe des directeurs de thèse → thésards

Construit avec les données du site theses.fr, via son API :
http://documentation.abes.fr/aidethesesfr/accueil/index.html#RecupererDonneesPagePersonne

Informations sur les parseurs XML de BeautifulSoup : https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser


- Les informations sur les thèses sont récupérées par des requêtes sur une personne :
    - liste des thèses liées (la sienne, puis celles où elle est directeur, rapporteur ou membre du jury) ;
- puis les mêmes requêtes sont répétées sur les personnes connexes (co-directeurs et thésards).



In [2]:
def parse_foaf(tag):
    """Extract a ``(name, person_id)`` pair from a FOAF-style XML tag.

    Parameters
    ----------
    tag : bs4 Tag or None
        Element whose text is the name and whose first child element
        carries the identifier in its ``rdf:about`` attribute.

    Returns
    -------
    tuple
        ``(name, person_id)``. ``person_id`` is ``None`` when the tag has no
        child element or when that child has no ``rdf:about`` attribute.
        A missing tag (``None``) yields ``('', None)``.
    """
    if tag is None:
        # The caller's find() matched nothing: return an "unknown" entry
        # instead of raising AttributeError on `None.text`.
        return ('', None)

    name = tag.text
    child = tag.findChild()
    if child is None:
        # No nested element to read the id from (the original code raised an
        # uncaught AttributeError here because only KeyError was handled).
        return (name, None)

    # A child without 'rdf:about' gives None, as the KeyError branch did before.
    return (name, child.attrs.get('rdf:about'))


def short_id(auth, year):
    """Build a compact identifier for a person's thesis.

    Parameters
    ----------
    auth : tuple
        ``(name, person_id)``, e.g.
        ``('Name Name', 'http://www.theses.fr/0123456789/id')``.
        ``person_id`` may be ``None`` or empty when the source record
        carried no identifier.
    year : int
        Year of the thesis (``0`` when unknown).

    Returns
    -------
    str
        ``'<initials>_<year>_<idx>'`` where ``idx`` is the next-to-last
        path component of ``person_id`` (``'0123456789'`` in the example).
        When no usable identifier is available, ``idx`` falls back to the
        name with its words joined by ``'-'`` so the function does not crash.
    """
    name, person_id = auth[0], auth[1]
    initials = ''.join(word[0] for word in name.split())

    parts = person_id.split('/') if person_id else []
    if len(parts) >= 2:
        idx = parts[-2]
    else:
        # No id (None / '' / no '/' separator): derive a stand-in from the name
        # rather than raising AttributeError or IndexError.
        idx = '-'.join(name.split())

    return initials + '_' + str(year) + '_' + idx


def _tag_text(parent, tag_name, default=''):
    """Return the text of the first `tag_name` element under `parent`, or `default`."""
    found = parent.find(tag_name)
    return found.text if found is not None else default


def _foaf_or_default(parent, tag_name, default):
    """Run parse_foaf on the first `tag_name` element under `parent`, or return `default`."""
    found = parent.find(tag_name)
    return parse_foaf(found) if found is not None else default


def query_someone(person_id, timeout=30):
    """Request the XML page of a person and extract the related theses.

    Parameters
    ----------
    person_id : str
        Person identifier URL, e.g. ``'http://www.theses.fr/068670648/id'``.
        The trailing ``'/id'`` is swapped for ``'.xml'`` to build the URL.
    timeout : float, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled
        connection cannot hang the crawl forever.

    Returns
    -------
    tuple
        ``(person, linked_thesis)`` where ``person`` is ``(name, person_id)``
        as read from the ``foaf:agent`` element, and ``linked_thesis`` is a
        list of dicts with the keys ``title``, ``author``, ``id``, ``url``,
        ``year``, ``directors``, ``doctoral_school``, ``univ``, ``short_id``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the response contains no ``foaf:agent`` element.
    """
    suffix = '/id'
    if person_id.endswith(suffix):
        # Only rewrite the trailing marker, never a '/id' inside the path.
        url = person_id[:-len(suffix)] + '.xml'
    else:
        url = person_id.replace(suffix, '.xml')

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, 'lxml')  # Set the parser

    # Person info:
    agent = soup.find('foaf:agent')
    if agent is None:
        raise ValueError('no foaf:agent element found at %s' % url)
    # Fall back on the requested id if the element carries no 'rdf:about'.
    person = (_tag_text(agent, 'foaf:name'),
              agent.attrs.get('rdf:about', person_id))

    # Thesis list:
    unknown = ('', None)
    linked_thesis = []
    for th in soup.find_all('bibo:thesis'):
        info = dict()
        info['title'] = _tag_text(th, 'dc:title')
        info['author'] = _foaf_or_default(th, 'marcrel:aut', unknown)
        info['id'] = th.attrs.get('rdf:about')
        info['url'] = _tag_text(th, 'dc:identifier')
        try:
            # A missing date gives '' which, like any non-numeric text, maps to 0.
            info['year'] = int(_tag_text(th, 'dc:date'))
        except ValueError:
            info['year'] = 0
        info['directors'] = [parse_foaf(ths) for ths in th.find_all('marcrel:ths')]
        info['doctoral_school'] = _foaf_or_default(th, 'dcterms:contributor', ())
        info['univ'] = _foaf_or_default(th, 'marcrel:dgg', unknown)
        info['short_id'] = short_id(info['author'], info['year'])

        linked_thesis.append(info)

    return person, linked_thesis

In [3]:
def is_valid_id(foaf_id):
    """Tell whether `foaf_id` looks like a usable person identifier.

    A usable identifier is a string ending in ``'/id'`` but not in
    ``'//id'`` (the latter has an empty component before the marker).
    Anything that is not a string, including the ``None`` that
    ``parse_foaf`` returns for a missing identifier, is reported as invalid
    instead of raising AttributeError.
    """
    if not isinstance(foaf_id, str):
        return False
    return foaf_id.endswith('/id') and not foaf_id.endswith('//id')

In [4]:
class Blob():
    """Store the explored data and the queue of requests still to be made.

    Attributes
    ----------
    already_asked : set
        Person ids that have already been queried.
    to_search : set
        Person ids waiting to be queried.
    thesis : dict
        Collected thesis records, keyed by their ``short_id``.
    seed : str
        Person id the exploration started from.
    chains : list
        One list per person who supervised at least one thesis:
        ``[own short_id, supervised short_id, ...]``, with the supervised
        theses sorted by year.
    nbr_gen : int
        Number of completed ``grow()`` calls.
    name : str
        Next-to-last path component of the seed id.
    """

    def __init__(self, seed_id):
        self.already_asked = set()
        self.to_search = set([seed_id, ])
        self.thesis = dict()
        self.seed = seed_id
        self.chains = []
        self.nbr_gen = 0
        self.name = seed_id.split('/')[-2]

    def grow(self):
        """Query one pending person and merge the result into the blob.

        Raises KeyError (from ``set.pop``) when ``to_search`` is empty.
        """
        person_id = self.to_search.pop()

        full_id, results = query_someone(person_id)
        print('\r', full_id[0], '(%i theses)'%len(results), end=' '*20)
        self.already_asked.add(person_id)
        self.nbr_gen += 1

        # Placeholder used when the person's own thesis is not in the results
        own_thesis = {'author':full_id,
                      'year':0,
                      'id':None,
                      'directors':[],
                      'short_id':short_id(full_id, 0)}

        # Sort the thesis (own, as a director, other)
        a_dirige = []
        related = []
        for th in results:
            if full_id == th['author']:
                own_thesis = th
            elif full_id in th['directors']:
                a_dirige.append(th)
            else:
                related.append(th)

        a_dirige.sort(key=lambda x:x['year'])

        # Update new persons to search:
        co_directors = {d for th in a_dirige+[own_thesis, ]
                          for d in th['directors'] if d!=full_id}
        students = {th['author'] for th in a_dirige}

        # `p[1]` is None when the record carried no identifier: skip those
        # before calling is_valid_id, which expects a string.
        new_person = [p[1] for p in co_directors | students
                      if p[1] and is_valid_id(p[1])
                      and p[1] not in self.already_asked]

        self.to_search.update(new_person)

        # Save thesis:
        self.thesis[own_thesis['short_id']] = own_thesis
        self.thesis.update({th['short_id']:th for th in a_dirige})

        # Build 'chain' (for subway graph)
        if len(a_dirige) > 0:
            chain = [own_thesis['short_id'], ] + [th['short_id'] for th in a_dirige]
            self.chains.append(chain)

    def save(self, name='blob.p'):
        """Pickle the whole blob to the file `name`."""
        # Context manager so the file is flushed and closed even on error.
        with open(name, "wb") as blob_file:
            pickle.dump(self, blob_file)
        print("saved")

    def print_info(self):
        """Print the number of theses, pending ids and completed generations."""
        # Report on this instance, not on whatever global happens to be named `blob`.
        print('nbr thesis:', len(self.thesis))
        print('nbr to search:', len(self.to_search))
        print(self.seed, '+', self.nbr_gen, 'generations')

In [5]:
# Get a random seed
# The seed list is a pickled list of person ids; a one-element default list is
# created on first use.
# NOTE(review): pickle.load can execute arbitrary code from the file - only
# load seed lists from a trusted source.
seedlist_file = "data/seedlist.pick"
try:
    with open(seedlist_file, "rb") as seed_fh:
        seedlist = pickle.load(seed_fh)
except FileNotFoundError:
    seedlist = ['http://www.theses.fr/068670648/id', ]
    # The data directory may not exist yet on a fresh checkout; without this
    # the fallback itself fails with another FileNotFoundError.
    os.makedirs(os.path.dirname(seedlist_file), exist_ok=True)
    with open(seedlist_file, "wb") as seed_fh:
        pickle.dump(seedlist, seed_fh)
print('seedlist loaded:', len(seedlist), "seeds")

seed_id = random.choice(seedlist)
print(seed_id)

# New graph
blob = Blob(seed_id)

seedlist loaded: 256 seeds
http://www.theses.fr/028295730/id


In [6]:
# Single expansion step: query one pending person and record the related theses.
blob.grow()

 Jean-Claude Gatina (21 theses)                    

In [7]:
# Grow: run up to N_GROW_STEPS expansion steps (one person queried per step)
N_GROW_STEPS = 30
for _ in range(N_GROW_STEPS):
    if not blob.to_search:
        # Nothing left to query: blob.grow() pops from `to_search` and would
        # raise KeyError on an empty set.
        break
    blob.grow()

print('\n')
blob.print_info()

 Étienne Labyt (2 theses)                                              

nbr thesis: 129
nbr to search: 124
http://www.theses.fr/028295730/id + 31 generations


In [8]:
# Export the graph (for the layout)
# All output files of this run share the prefix data/<seed name>_<number of theses>.
filename_prefix = 'data/{}_{}'.format(blob.name, len(blob.thesis))
print(filename_prefix)

# Chains, longest first
chs = list(blob.chains)
chs.sort(key=len, reverse=True)
print('{} chains'.format(len(chs)))

chains_path = filename_prefix + '_chains.json'
with open(chains_path, 'w') as outfile:
    json.dump(chs, outfile)

data/028295730_129
13 chains


In [9]:
# Summarise each chain under the short id of its first thesis, and record the
# chain length on that thesis record as well.
chain_info = dict()
for chain in blob.chains:
    head_id = chain[0]
    head_thesis = blob.thesis[head_id]
    chain_length = len(chain)

    summary = dict()
    summary['length'] = chain_length
    summary['color'] = 'red'
    summary['name'] = head_thesis['author'][0]
    summary['year'] = head_thesis['year']
    chain_info[head_id] = summary

    head_thesis['nbr_thesis'] = chain_length
    #blob.thesis[chain[0]]['color'] = red

# Export the chain info first, then the thesis info
exports = (('_chaininfo.json', chain_info),
           ('_thesis.json', blob.thesis))
for suffix, payload in exports:
    with open(filename_prefix + suffix, 'w') as outfile:
        json.dump(payload, outfile)

print(len(blob.thesis), 'thesis')

129 thesis


In [10]:
# Save
#blobname = 'data/grosblob6.p'
#blob.save(blobname)