# Extraire la legende depuis la description

In [251]:
import pickle
import regex   # permet overlapping matching
import copy

In [252]:
def print_legend( legend ):
    sortfun = lambda x: ( x['number'] , len(x['position']), x['label'][::-1] )
    legendsorted = sorted( legend, key=sortfun  )

    current_number = legendsorted[0]['number']
    for match in legendsorted:
        if current_number != match['number']:
            print('     --')
            current_number = match['number']

        match['label'] = match['label'].replace('\n', 'NL')
        match['context'] = match['context'].replace('\n', 'NL')
        
        n_merged = len(match['position'])
        n_merged_str = '  ' if n_merged==1 else '(%i)'%n_merged
        print( '{label:>42} {number:>4}  {n}  {context:>35}'.format( **match, n=n_merged_str) )

In [253]:
# test: load pre-processed data
data = pickle.load( open( "../web/data/patent_infos.pickle", "rb" ) )
len(data)

379

In [254]:
# test: select a patent

k = 45  #70 #44 #41
patent = sorted( data.values(), key=lambda x:x['patent_number'] )[k]

patent = data['US20040200073']
description = patent['description']

print( patent['patent_number'] )
print( 'https://www.google.com/patents/%s'%patent['patent_number'] )

print('nombre de mots :', len(description.split(' ')))
print('nombre de caractères :', len(description))

US20040200073
https://www.google.com/patents/US20040200073
nombre de mots : 5378
nombre de caractères : 23679


In [255]:
#description = 'ins as u.s. pat. no. 5,775,340 on jul. 7, 1998 and u.s. pat. no. 5,926,956 was issued to kirk langmen et al. on jul. 27, 1999. while these nail clippers may be suitable for the particular purpose to which they address, they would not be as suitable for the purposes of the present invention as heretofore described. while these toenail clippers may be suitable for'
#print( description )
#print( len(description) )

In [256]:
def findandmask_date( description ):
    # date :  'Dec. 18, 1956'
    months_abbrv = [ 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'jul', 'july', 'Aug', 'Sept', 'Oct', 'Nov' , 'Dec']
    pattern = regex.compile( r"((%s)\. [0-9]{0,2}, [0-9]{4})" % '|'.join(months_abbrv), regex.I  )

    replacementFunction = lambda matchobj : 'A'*len(matchobj.group(1))
    newdescription = pattern.sub(  replacementFunction, description  )
    
    return newdescription

In [257]:
def findandmask_patentnum( description ):
    # Patent number : '1,702,137'  '454,074'
    pattern = regex.compile( r"(([0-9],)?[0-9]{3},[0-9]{3})" )

    replacementFunction = lambda matchobj : 'N'*len(matchobj.group(1))
    newdescription = pattern.sub(  replacementFunction, description  )
    
    return newdescription                   

#print( newdescription )
#print( len(newdescription) )

In [258]:
def fullTextTrimming( description ):
    """ Travail sur le texte entier
        en gardant le nombre de caractère constant
    """
    #description = description.replace('\n', ' ')
    description = description.lower() 
    
    # description = regex.sub('\s+',' ', description) # remove multiple space

    # patent number XX
    # patent_number_pattern = r'(\d{1, 4}[,-])?\d{3, 8}[,-]\d{3, 8}'
    # description = regex.sub(patent_number_pattern, '', description  )

    # remove non-AASII caracters
    description = regex.sub(r'[^\x00-\x7F]', ' ', description)
    
    return description

In [259]:
# test
description = fullTextTrimming( description )
print('nombre de caractères :', len(description))

nombre de caractères : 23679


In [260]:
def findAllCandidates( description ):
    """ Recherche  les chiffres isolés pouvant correspondre à un numéro de légende

        par exemple   " 4, 5 and 6 ", ou " 6 and 7 ", " 6, 7 "

        retourne une legend : liste avec les infos pour chaque 'match'
    """ 
    
    pattern =  r' ([0-9]{1,3}(( *, *| and | or )[0-9]{1,3})*)[^a-z0-9\-]'

    # Explication:
    # la forme générale est E(SE) avec Element  Separator
    # et le séparateur est une virgule ou and ou or 
   
    allmatches = list( regex.finditer(pattern, description) )
    splitpattern = regex.compile(r'[^\d]+')
    
    legend = []
    for m in allmatches:
        capture = m.group(1)
        numbers = splitpattern.split( capture )
        start = m.start()

        label = description[ max(0, m.start()-40 ): m.start() ]  # to prevent overlapping
        label = label.strip(' ')  # cas avec plusieurs espace consécutifs
        
        context = description[ max(0, m.start()-16 ): min( len(description), m.end()+10 ) ]  # to debug
        
        for n in numbers:
            item = {'number':int(n), 'label':label, 'position':[m.span(1)], 'context':context}
            legend.append( item )

    return legend

In [261]:
# test
legend = findAllCandidates( description )
print(len(legend))

256


In [262]:
print_legend( legend )

  with the fixed blade member  2 . in fig.    1            ber  2 . in fig. 1, the nail 
 e.  NL         [0041]    as shown in fig.    1            as shown in fig. 1, the other
   nail clipper body  1 . as shown in fig.    1            as shown in fig. 1, a nail ch
 rawings.  NL         [0031]    [0031]fig.    1            1]    [0031]fig. 1 shows the 
 e drawings  NL       [0018]    [0018]fig.    1            8]    [0018]fig. 1 is a secti
    nd of the invention  NL         [0001]    1                   [0001]    1. field of 
    shown in fig. 1, a nail chip reservoir    1             chip reservoir  1   a  may b
   l chips kept in the nail chip reservoir    1             chip reservoir  1   a  can b
   tachably mounted on a nail clipper body    1            il clipper body  1  which is 
   erring to fig. 2, the nail clipper body    1            il clipper body  1  includes 
   be defined inside the nail clipper body    1            il clipper body  1  by this b
   p and down inside 

In [263]:
a = list( range(5) )
b = list( a )
a[4] = 10
b

[0, 1, 2, 3, 4]

In [264]:
def disqualify( legend, pattern, n=-1 ):
    for i, row in enumerate( legend ):
        if pattern.search( row['label'] ):
            legend[ i ]['number'] = n

KEYWORDS = ['january', 'february', 'march', 'april', 'may', 'june',\
            'july', 'august', 'september', 'october', 'november', 'december',\
            'fig', 'figure', 'figures', 'claim', 'claims', 'at', 'and',\
            'numeral', 'embodiment', 'invention', 'part']

KEYWORDS.extend( ['the', 'a', 'an', 'these', 'their', 'when', 'with', 'by', 'this', \
                   'have', 'having', 'has', 'is', 'are', 'over', 'its', 'of said', 'and', 'as',\
                   'of', 'in', 'to', 'but', 'another', 'through', 'on'] )
            
def firstScreening( legend ):
    """ Premier filtrage en regardant comment se termine le label,
        Critères éliminants:
        
            - ne se termine pas par une lettre (Typiquement Fig. 5)
            - se termine par un des mots clés ('claim') ou un mot commun
            
        change le numéro pour 998 ou 999..
    """
    newlegend = copy.deepcopy(legend) # copy (pour ne pas ecrasser Legend)
    
    pattern = regex.compile( r'[^a-z]$', regex.I )
    disqualify( newlegend, pattern )
   

    pattern = regex.compile( r'\W(%s)$' % '|'.join(KEYWORDS), regex.I )
    disqualify( newlegend, pattern )

    return newlegend

In [265]:
legend = firstScreening(legend) 
print_legend( legend )

   f the aforesaid guide grooves  12   a ,   -1           ooves  12   a ,  12   a  of th
    a . behind the guide grooves  12   a ,   -1           ooves  12   a ,  12   a , ther
   rted into these guide grooves  12   a ,   -1           ooves  12   a ,  12   a . behi
   y provided with guide grooves  12   a ,   -1           ooves  12   a ,  12   a  at fo
   mber  3  while the guide ribs  12   b ,   -1            ribs  12   b ,  12   b  are e
   ted to receive the guide ribs  12   b ,   -1            ribs  12   b ,  12   b  of th
    provided vertical guide ribs  12   b ,   -1            ribs  12   b ,  12   b  for g
  the nail insertion groove  23  (see fig.   -1           e  23  (see fig. 11). as shown
  contacted with the curved face (see fig.   -1           d face (see fig. 12), a nail i
  dapted to cut a nail portion n (see fig.   -1           tion n (see fig. 12) which is 
  with the fixed blade member  2 . in fig.   -1            ber  2 . in fig. 1, the nail 
 e.  NL         [0041

## Coupe sur certain mot clé (the, a, an, ...)

In [266]:
def cut_legend_using_pattern( legend, pattern ):
    for i, row in enumerate( legend ):
        label = row['label']

        match = pattern.search(label)

        if match:
            newlabel = match.group(2)
            legend[i]['label'] = newlabel
            
SMALL_WORDS = ['the', 'a', 'an', 'these', 'their', 'when', 'with', 'by', 'this', 'that', \
               'have', 'having', 'has', 'is', 'are', 'should', 'over', 'its', 'of said', 'and', 'as',\
               'of', 'in', 'to', 'at',  'but', 'another', 'through', 'on', 'same', 'from',\
               'include', 'includes', 'beyond', 'between']

In [267]:
def cutLabels( legend ):
    """ coupe les labels sur certain mots commun ('the', 'a'... etc)
        et sur certains caractères spéciaux (. , ;)
    """
    newlegend = copy.deepcopy(legend) # copy (pour ne pas ecrasser Legend)
    
    pattern_small_words = regex.compile( r'^.* (%s) (.+)$' % '|'.join(SMALL_WORDS), regex.I )
    cut_legend_using_pattern( newlegend, pattern_small_words )
    
    pattern = regex.compile( '^.*([\.,;] )(.+)$' )
    cut_legend_using_pattern( newlegend, pattern )
    
    return newlegend

### Merge : regroupe les candidats ayant un label identique

Rq: utile pour le dev. mais pas pour l'identification

In [268]:
def get_positions( label, candidats ):
    positions = []
    for item in candidats:
        if item['label']==label:
            positions.extend( item['position'] )
    return positions

def get_context( label, candidats ):
    items = [ item for item in candidats if item['label']==label ]
    
    return items[0]['context']

def merge( legend ):
    # merge identical label
    
    numbers_unique = { item['number'] for item in legend  } # set

    new_legend = []
    for n in numbers_unique:

        candidats = [ line for line in legend if line['number']==n ]

        labels_unique = { item['label'] for item in candidats  }

        new_candidats = [ { 'number':n, 'label':label_u,\
                           'position':get_positions(label_u, candidats), 'context':get_context(label_u, candidats) }
                          for label_u in labels_unique ]
        
        new_legend.extend( new_candidats )

    return new_legend

In [269]:
print_legend( merge( legend ) )

   f the aforesaid guide grooves  12   a ,   -1           ooves  12   a ,  12   a  of th
    a . behind the guide grooves  12   a ,   -1           ooves  12   a ,  12   a , ther
   rted into these guide grooves  12   a ,   -1           ooves  12   a ,  12   a . behi
   y provided with guide grooves  12   a ,   -1           ooves  12   a ,  12   a  at fo
   mber  3  while the guide ribs  12   b ,   -1            ribs  12   b ,  12   b  are e
   ted to receive the guide ribs  12   b ,   -1            ribs  12   b ,  12   b  of th
    provided vertical guide ribs  12   b ,   -1            ribs  12   b ,  12   b  for g
  the nail insertion groove  23  (see fig.   -1           e  23  (see fig. 11). as shown
  contacted with the curved face (see fig.   -1           d face (see fig. 12), a nail i
  dapted to cut a nail portion n (see fig.   -1           tion n (see fig. 12) which is 
  with the fixed blade member  2 . in fig.   -1            ber  2 . in fig. 1, the nail 
 e.  NL         [0041

## Identification du meilleur candidats

In [270]:
def candidates_for_onenumber( legend, i ):
    candidates = [ c for c in legend if c['number'] == i ]

    for c in candidates:
        c['reversed'] = c['label'].split(' ')[::-1]
        c['weight'] = len( c['position'] )
        
    return candidates

In [271]:
print( candidates_for_onenumber(legend, 34) )

[{'weight': 1, 'context': ' a cutting edge  34  formed at', 'label': 'ed blade member  2 , and a cutting edge', 'reversed': ['edge', 'cutting', 'a', 'and', ',', '2', '', 'member', 'blade', 'ed'], 'number': 34, 'position': [(10767, 10769)]}, {'weight': 1, 'context': 'he cutting edge  34  having a ', 'label': 'ixed blade member  2 . the cutting edge', 'reversed': ['edge', 'cutting', 'the', '.', '2', '', 'member', 'blade', 'ixed'], 'number': 34, 'position': [(15315, 15317)]}, {'weight': 1, 'context': 'he cutting edge  34  of the mo', 'label': '.  NL         [0040]    the cutting edge', 'reversed': ['edge', 'cutting', 'the', '', '', '', '[0040]', '', '', '', '', '', '', '', '', 'NL', '', '.'], 'number': 34, 'position': [(16353, 16355)]}, {'weight': 1, 'context': 'he cutting edge  34  and the c', 'label': 'o cut the nail between the cutting edge', 'reversed': ['edge', 'cutting', 'the', 'between', 'nail', 'the', 'cut', 'o'], 'number': 34, 'position': [(16829, 16831)]}, {'weight': 1, 'context

In [272]:
def increment_dico( dico, key, value ):
    if key in dico:
        dico[key] += value
    else:
        dico[key] = value
        
def append_dico( dico, key, value ):
    if key in dico:
        dico[key].append( value )
    else:
        dico[key] = [value]

def who_are_the_winners( scoreboard ):
    scoreMax = max( scoreboard.values() )
    winners = [ sb[0] for sb in scoreboard.items() if sb[1] == scoreMax ]
    return winners

In [273]:
def choose_best_label( remainingcandidates ):
    """ Choisi un candidat gagnant unique
    
        choisi le label mot par mot, en partant de la fin :
            - si un mot apparait plus souvent que tout les autres, alors il est choisi
            - si plusieurs mots apparaissent aussi souvent, alors le label est coupé ici
    """
    N = len( remainingcandidates )
    
    if N == 1:
        return remainingcandidates[0]['label']
    
    previouschoice=[]
    for i in range(20):
        current_indice = len(previouschoice)

        scoreboard = {}
        nextword2candidates ={}
        for c in remainingcandidates:
            if current_indice < len( c['reversed'] ):
                next_word = c['reversed'][ current_indice ]
            else:
                next_word = ''

            append_dico( nextword2candidates, next_word, c )
            increment_dico( scoreboard, next_word, c['weight'] )

            #print(next_word, c['weight'])

        #print( 'scoreboard: ', scoreboard )
        winners = who_are_the_winners( scoreboard )
        #print( 'winners:', winners )
        if len(winners) == 1 and winners[0]:
            choice = winners[0]
            remainingcandidates = nextword2candidates[choice]
            previouschoice.append( choice )
            #print(' ')

        else: # exaequo ou fin -> stop
            selectedlabel = ' '.join( previouschoice[::-1] )
            #print( selectedlabel )
            break

    return selectedlabel

In [274]:
# test
candidates = candidates_for_onenumber(legend, 50)
choose_best_label( candidates )

ValueError: max() arg is an empty sequence

In [None]:
allnumbers = sorted( { c['number'] for c in legend if c['number']>0 } ) # unique et triés
print( allnumbers )

In [None]:
def getUniqueLabel(legend):
    allnumbers = sorted( { c['number'] for c in legend if c['number']>0 } ) # unique et triés

    final_legend = []
    for numero in allnumbers:
        candidates = candidates_for_onenumber(legend, numero)
        thelabel = choose_best_label( candidates )

        if not thelabel: continue
        line = {'numero':numero , 'label':thelabel  }
        final_legend.append( line )
        
    return final_legend

In [None]:
final_legend = getUniqueLabel(legend)
for line in final_legend:
    print( '{numero:>5}  {label}'.format( **line ) )

In [None]:
def findAndFilter( description ):
    description = fullTextTrimming( description )
    legend = findAllCandidates( description )
    legend = firstScreening(legend) 
    legend = cutLabels(legend) 

    return legend

def extract_finale_legend( description ):
    legend = findAndFilter( description )
    final_legend = getUniqueLabel(legend)
    
    return final_legend

In [None]:
# test
description = patent['description']
extract_finale_legend( description )

# Remarques
### Comment selectionner le meilleurs candidats ?

 * si unique:  ok
       et si unique et len()==40... ? c.a.d. pas coupé
 * si max unique: ok
 * sinon : selectionne les qualifiés, puis...
 * exaequo...  construit un arbre de comptage ?




### To look
* patents number: 3,803,713  
* comment ne pas modifier le texte, pour pouvoir l'annoter ?
* paranthese: between the elongated plates ( 1 , 2 ).
* line 5 - 5
*      march   15                 march 15,   
en debut de la description ...
* generally... adverbes ?
* " respectivement A et B " ... respective
* letter sur les vieux brevets
* merge keywords and small_words ??
* pluriel

* sort of recursive... identifier un permet d'en déduire un autre...

## other things
* Find values like "110 volts"... 

In [None]:
# Find values
units = ['inch', 'volt', 'm', 'mm']
find_sci_values =  r" (\d{1,4}([\.,]\d{1,4})? (%s)\W)" % '|'.join(units)

print( regex.findall(find_sci_values, description) )
description = regex.sub(find_sci_values, '<value>', description  )

In [None]:
# Find values
units = ['inch', 'volt', 'm', 'mm']
find_sci_values =  r" (\d{1,4}([\.,]\d{1,4})? (%s)\W)" % '|'.join(units)

print( regex.findall(find_sci_values, description) )
description = regex.sub(find_sci_values, '<value>', description  )

In [None]:
# Find floats
find_sci_floats =  r" (\d{1,4}([\.,]\d{1,4})\W)" 

print( regex.findall(find_sci_floats, description) )
description = regex.sub(find_sci_floats, '<value>', description  )

In [None]:
# " between 0.4 to 2.5 " 
betweenpattern = r"(between [\d\.,]{1,4} (to|and) [\d\.,]{1,4}\W)"

print( regex.findall(betweenpattern, description) )
description = regex.sub(betweenpattern, '<between>', description  )