In [11]:
import requests
import pandas as pd
from lxml import etree
import re

def get_body_text(root):
    '''Extraxts the text from the "body" tag of a etree xml TEI'''
    
    for child in root.getchildren():
        if 'text' in child.tag:
            for subchild in child.getchildren():
                if 'body' in subchild.tag:
                    return ' '.join(subchild.itertext())

def pdf_to_xml(url):
    '''Takes an url of a pdf as an input and returns its parsed xml TEI'''
    
    r = requests.get(url)                  # fetching the pdf
    r = requests.post('http://cloud.science-miner.com/grobid/api/processFulltextDocument', 
                      files={'input': r.content})
    return r.text

def extract_entities(xml):
    '''Takes an xml string and returns a json with the entities'''
    
    pattern = re.compile(r'<\?xml.*\?>')   # we need to get rid of the xml declaration
    xml = pattern.sub('', xml)
    root = etree.fromstring(xml)
    fulltext = get_body_text(root)
    alphanumeric = re.compile("([^\w\s']|_|\n|\t)+")
    fulltext = alphanumeric.sub(' ', fulltext)
    query = '{"text": "' + fulltext + '", "language": {"lang": "fr"} }'
    r = requests.post('http://localhost:8090/service/disambiguate', 
                      files={'query': query})
    return r.text

In [9]:
from Levenshtein import distance, hamming, jaro, ratio, setratio 

def compare(voc_one, voc_two):
    storage = {}
    result = {}
    for string1 in voc_one:
        for string2 in voc_two:
            storage['levenshtein'].append(distance(string1, string2))
            storage['hamming'].append(hamming(string1, string2))
            storage['jaro'].append(jaro(string1, string2))
    result = {'levenshtein-max': max(storage['levenshtein']),
              'levenshtein-mean': sum(storage['levenshtein'])/len(storage['levenshtein']),
              'levenshtein-min': min(storage['levenshtein']),
              'hamming-max': max(result['hamming']),
              'hamming-mean': sum(result['hamming'])/len(result['hamming']),
              'hamming-min': min(result['levenshtein']),
              'jaro-max': max(result['jaro']),
              'jaro-mean': sum(result['jaro'])/len(result['jaro']),
              'jaro-min': min(result['jaro']),
              'set-ratio': setratio(a, b)}
    return result

In [14]:
import pandas as pd
import json
import numpy as np

file = pd.read_excel('2020_export_Projet_Indexation_Automatique_Notice_accesTI_public_depuis2010_20200204.xlsx')
file = file.loc[file.LANGUE_DOC=="fre"]

results = pd.DataFrame()

for i, row in file.iterrows():
    pdf = row['ACCES_TEXTE_INTEGRAL']
    if type(row["DESCRIPTEURS"]) == str:
        result = {}
        descriptors = list(map(lambda x: x.strip(), row["DESCRIPTEURS"].split(";")))
        agris = list(map(lambda x: x.strip(), row["AGRIS"].split(";")))
        xml = pdf_to_xml(pdf)
        text_json = extract_entities(xml)
        text = json.loads(text_json)
        entities = [part["rawName"].strip() for part in text["entities"]]
        categories = [part["category"].strip() for part in text["global_categories"]]
        for i, a, b in enumerate([(descriptors, entities),
                                  (descriptors, categories),
                                  (agris, entities),
                                  (agris, categories)]):
            result = compare(a, b)
            result['compairison'] = i
            result['pdf'] = pdf
            results.append(result, ignore_index=True)
    else:
        print('not doing')

    if i == 3:
        break

results


ValueError: not enough values to unpack (expected 3, got 2)

In [21]:
print(row["DESCRIPTEURS"])

nan
