# Extraire la légende depuis la description

In [441]:
import pickle
import regex   # permet overlapping matching
import copy

In [442]:
def print_legend( legend ):
    """Print the legend candidates as an aligned table, grouped by number.

    Rows are sorted by number, then by the count of merged positions, then
    by the reversed label (so labels with the same ending sit together).
    A '     --' line is printed between two different numbers.

    legend: iterable of dicts with the keys 'number', 'label',
            'position' (a list) and 'context'.
            An empty legend prints nothing.
    """
    def sort_key( row ):
        return ( row['number'], len(row['position']), row['label'][::-1] )

    previous_number = None
    for index, row in enumerate( sorted( legend, key=sort_key ) ):
        # separator between two groups, never before the very first row
        if index > 0 and row['number'] != previous_number:
            print('     --')
        previous_number = row['number']

        n_merged = len(row['position'])
        # the count is only displayed when several occurrences were merged
        n_merged_str = '  ' if n_merged==1 else '(%i)'%n_merged
        print( '{label:>42} {number:>4}  {n}  {context:>35}'.format( **row, n=n_merged_str) )

In [443]:
# test: load pre-processed data
# NOTE: pickle can execute arbitrary code while loading; only open files
#       that were produced by this project.
with open( "../web/data/patent_infos.pickle", "rb" ) as pickle_file:
    data = pickle.load( pickle_file )   # the file is closed when the block ends
len(data)

379

In [444]:
# test: pick the k-th patent once they are ordered by patent number

k = 44  #70 #44 #41
patents_by_number = sorted( data.values(), key=lambda entry: entry['patent_number'] )
patent = patents_by_number[k]
description = patent['description']
patent_number = patent['patent_number']

print( patent_number )
print( 'https://www.google.com/patents/%s' % patent_number )

# size of the raw description (words are split on single spaces)
words = description.split(' ')
print('nombre de mots :', len(words))
print('nombre de caractères :', len(description))

US20030094183
https://www.google.com/patents/US20030094183
nombre de mots : 7150
nombre de caractères : 33448


In [445]:
def fullTextTrimming( description ):
    """Normalise the whole text while keeping its length unchanged.

    Each character is replaced by exactly one character, so an offset
    computed on the result designates the same place in the input:
        - newlines become spaces,
        - non-ASCII characters become spaces,
        - ASCII letters are lower-cased.

    description: str, the raw text.
    Returns the normalised str; len(result) == len(description).
    """
    description = description.replace('\n', ' ')

    # Blank out non-ASCII characters *before* lower-casing: str.lower() can
    # expand some non-ASCII characters into several code points, which
    # would change the length of the text.
    description = regex.sub(r'[^\x00-\x7F]', ' ', description)

    # Runs of spaces are deliberately not collapsed: that would also
    # change the number of characters.
    return description.lower()

In [446]:
# test: trim the description and print its length again, to compare with
# the length printed before trimming
description = fullTextTrimming( description )
print('nombre de caractères :', len(description))

nombre de caractères : 33448


In [447]:
def findAllCandidates( description ):
    """Find the isolated numbers that may be legend numbers.

    A candidate is a number of 1 to 3 digits preceded by a space, possibly
    followed by other numbers joined by a comma, ' and ' or ' or ', for
    example " 4, 5 and 6 ", " 6 and 7 " or " 6, 7 ".  The group has to be
    followed by a character that is not a lower-case letter, a digit or a
    hyphen, or by the end of the text.

    description: str to search (the pattern only knows lower-case letters).

    Returns a list with one dict per number found:
        'number'   : the number, as an int
        'label'    : up to 40 characters preceding the match, spaces stripped
        'position' : one-element list holding the (start, end) span of the
                     whole number group in description
        'context'  : text around the match, for debugging
    Numbers of a same group share label, position and context.
    """

    # General shape is E(SE)* with E an element (number) and S a separator
    # (comma, 'and' or 'or').  The final alternative '|$' lets a group that
    # ends the text be found too.
    pattern =  r' (\d{1,3}(( ?, ?| and | or )\d{1,3})*)(?:[^a-z0-9\-]|$)'
    splitpattern = regex.compile(r'[^\d]+')
    text_length = len(description)

    legend = []
    for m in regex.finditer(pattern, description):
        match_start = m.start()
        numbers = splitpattern.split( m.group(1) )

        # the text right before the numbers; consecutive spaces are possible
        label = description[ max(0, match_start-40 ): match_start ].strip(' ')

        # only used for debugging
        context = description[ max(0, match_start-16 ): min( text_length, m.end()+10 ) ]

        for n in numbers:
            item = {'number':int(n), 'label':label, 'position':[m.span(1)], 'context':context}
            legend.append( item )

    return legend

In [448]:
# test: number of raw candidates found in the trimmed description
legend = findAllCandidates( description )
print(len(legend))

228


In [449]:
# display every raw candidate, grouped by number
print_legend( legend )

  of the present invention taken from fig.    1             taken from fig. 1 as indicat
  nts in figs.  1 - 6 , and in use in fig.    1            d in use in fig. 1. as shown 
  , and in use in fig. 1. as shown in fig.    1            as shown in fig. 1, the devic
  l views.            [0060]    [0060]fig.    1            0]    [0060]fig. 1 is a persp
   nts throughout the several views, figs.    1            al views, figs.  1 - 6  illus
               [0096]    as shown in figs.    1             shown in figs.  1 - 4  and  
   is depicted to various extents in figs.    1            xtents in figs.  1 - 6 , and 
     nd of the invention            [0001]    1                   [0001]    1. field of 
     --
   in use.            [0061]    [0061]fig.    2            1]    [0061]fig. 2 is a close
               [0091]    as shown in figs.    2             shown in figs.  2 - 3  and  
   nd a second end  34 , as shown in figs.    2             shown in figs.  2 - 3 . to t
     eir own 

In [450]:
# scratch check: list(a) builds a new list, so changing a afterwards
# leaves b untouched
a = list( range(5) )
b = list( a )
a[4] = 10
b

[0, 1, 2, 3, 4]

In [451]:
def disqualify( legend, pattern, n=-1 ):
    """Overwrite the number of every row whose label matches `pattern`.

    legend  : list of row dicts ('label', 'number'), modified in place.
    pattern : compiled regular expression, used with .search() on the label.
    n       : value stored in 'number' for the matching rows (default -1).
    """
    matching_rows = ( entry for entry in legend if pattern.search( entry['label'] ) )
    for entry in matching_rows:
        entry['number'] = n

# Words used by firstScreening: a label ending with one of them is rejected.
KEYWORDS = [
    # months
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
    # references and document vocabulary
    'fig', 'figure', 'figures', 'claim', 'claims', 'at', 'and',
    'numeral', 'embodiment', 'invention', 'part',
    # common words
    'the', 'a', 'an', 'these', 'their', 'when', 'with', 'by', 'this',
    'have', 'having', 'has', 'is', 'are', 'over', 'its', 'of said', 'and', 'as',
    'of', 'in', 'to', 'but', 'another', 'through', 'on',
]
            
def firstScreening( legend ):
    """First filter, based on how each label ends.

    A candidate is disqualified when its label
        - does not end with a letter (typically "fig. 5"), or is empty,
        - ends with one of the KEYWORDS ('claim', ...) or a common word.

    Disqualified candidates stay in the list, but their number is replaced
    by -1 (the default value used by disqualify).

    legend: list of candidate dicts; it is not modified.
    Returns a screened deep copy of legend.
    """
    newlegend = copy.deepcopy(legend)  # copy, so that the caller's legend is not overwritten

    # '^' covers the empty label, which has no last character to test
    not_a_letter = regex.compile( r'(?:^|[^a-z])$', regex.I )
    disqualify( newlegend, not_a_letter )

    # (?:^|\W) rather than \W: a label made of the keyword alone is caught too
    keyword_ending = regex.compile( r'(?:^|\W)(%s)$' % '|'.join(KEYWORDS), regex.I )
    disqualify( newlegend, keyword_ending )

    return newlegend

In [452]:
# test: after the screening, rejected candidates carry the number -1
legend = firstScreening(legend) 
print_legend( legend )

  throughout the several views, figs.  1 -   -1            iews, figs.  1 - 6  illustrat
          [0096]    as shown in figs.  1 -   -1            wn in figs.  1 - 4  and  6 , 
  epicted to various extents in figs.  1 -   -1            ts in figs.  1 - 6 , and in u
          [0091]    as shown in figs.  2 -   -1            wn in figs.  2 - 3  and  6 , 
   second end  34 , as shown in figs.  2 -   -1            wn in figs.  2 - 3 . to the f
  e k. c. lee et al.          issued: dec.   -1                issued: dec. 4, 1979     
  sued to george k. c. lee, et al. on dec.   -1            , et al. on dec. 4, 1979 is i
   jaws  24 , 26  as discussed above. fig.   -1            ssed above. fig. 3 depicts th
  gger assembly and handle taken from fig.   -1             taken from fig. 3 as indicat
  of the present invention taken from fig.   -1             taken from fig. 1 as indicat
  of the present invention taken from fig.   -1             taken from fig. 3 as indicat
  and the clipper ass

## Coupe sur certains mots-clés (the, a, an, ...)

In [453]:
def cut_legend_using_pattern( legend, pattern ):
    """Shorten, in place, every label that matches `pattern`.

    pattern: compiled regular expression with at least two groups; a
             matching label is replaced by the text captured by group 2.
    Labels that do not match are left unchanged.
    """
    for entry in legend:
        found = pattern.search( entry['label'] )
        if found is None:
            continue
        entry['label'] = found.group(2)
            
# Common words used by cutLabels: a label is cut right after them.
SMALL_WORDS = [
    'the', 'a', 'an', 'these', 'their', 'when', 'with', 'by', 'this', 'that',
    'have', 'having', 'has', 'is', 'are', 'should', 'over', 'its',
    'of said', 'and', 'as',
    'of', 'in', 'to', 'at', 'but', 'another', 'through', 'on', 'same', 'from',
    'include', 'includes', 'beyond', 'between',
]

In [454]:
def cutLabels( legend ):
    """Shorten the labels by cutting them on common words and punctuation.

    Two cuts are applied in turn; each keeps only the end of the label:
        1. what follows the last common word of SMALL_WORDS ('the', 'a', ...),
        2. what follows the last '. ', ', ' or '; '.

    legend: list of candidate dicts; it is not modified.
    Returns a deep copy of legend with the shortened labels.
    """
    newlegend = copy.deepcopy(legend)  # copy, so that the caller's legend is not overwritten

    # The optional prefix '(?:.* )?' lets the common word be the very first
    # word of the label; group 2 stays the part that is kept.
    pattern_small_words = regex.compile( r'^(?:.* )?(%s) (.+)$' % '|'.join(SMALL_WORDS), regex.I )
    cut_legend_using_pattern( newlegend, pattern_small_words )

    # raw string: '\.' is not a valid escape in a normal string literal
    pattern = regex.compile( r'^.*([\.,;] )(.+)$' )
    cut_legend_using_pattern( newlegend, pattern )

    return newlegend

In [455]:
# test: shorten the labels of the screened candidates and display them
legend = cutLabels(legend) 
print_legend( legend )

                                       1 -   -1            iews, figs.  1 - 6  illustrat
                                       1 -   -1            ts in figs.  1 - 6 , and in u
                                       1 -   -1            wn in figs.  1 - 4  and  6 , 
                                       2 -   -1            wn in figs.  2 - 3  and  6 , 
                                       2 -   -1            wn in figs.  2 - 3 . to the f
                                      dec.   -1            , et al. on dec. 4, 1979 is i
                              issued: dec.   -1                issued: dec. 4, 1979     
                                      fig.   -1             taken from fig. 1 as indicat
                                      fig.   -1             taken from fig. 3 as indicat
                                      fig.   -1             taken from fig. 3 as indicat
                                      fig.   -1             taken from fig. 3 as indicat
                     

### Merge : regroupe les candidats ayant un label identique

Rq: utile pour le dev. mais pas pour l'identification

In [456]:
def get_positions( label, candidats ):
    """Gather the positions of every candidate whose label equals `label`.

    Returns a new flat list, in the order of `candidats`.
    """
    return [ position
             for item in candidats if item['label'] == label
             for position in item['position'] ]

def get_context( label, candidats ):
    """Return the context of the first candidate whose label equals `label`.

    Raises IndexError when no candidate has this label.
    """
    same_label = list( filter( lambda item: item['label'] == label, candidats ) )
    first = same_label[0]
    return first['context']

def merge( legend ):
    """Merge the candidates that share both their number and their label.

    legend: list of candidate dicts ('number', 'label', 'position',
            'context'); it is not modified.

    Returns a new list with one dict per (number, label) pair:
        'position' : the positions of all merged candidates, concatenated
        'context'  : the context of the first merged candidate
    Rows of a same number are contiguous; numbers, and labels within a
    number, come in the order they first appear in legend.
    """
    # number -> { label -> merged row }; a single pass over the legend,
    # dicts keep the insertion order
    grouped = {}
    for item in legend:
        rows_for_number = grouped.setdefault( item['number'], {} )
        row = rows_for_number.get( item['label'] )
        if row is None:
            rows_for_number[ item['label'] ] = {
                'number': item['number'],
                'label': item['label'],
                'position': list( item['position'] ),  # copy: the input stays untouched
                'context': item['context'],
            }
        else:
            row['position'].extend( item['position'] )

    return [ row for rows in grouped.values() for row in rows.values() ]

In [457]:
# display the candidates once identical labels are merged
# (legend itself is not reassigned here)
print_legend( merge( legend ) )

                                      dec.   -1            , et al. on dec. 4, 1979 is i
                              issued: dec.   -1                issued: dec. 4, 1979     
                         particularly fig.   -1            articularly fig. 4, a special
                      [0060]    [0060]fig.   -1            0]    [0060]fig. 1 is a persp
                      [0061]    [0061]fig.   -1            1]    [0061]fig. 2 is a close
                      [0062]    [0062]fig.   -1            2]    [0062]fig. 3 is a side 
                      [0063]    [0063]fig.   -1            3]    [0063]fig. 4 is a front
                      [0064]    [0064]fig.   -1            4]    [0064]fig. 5 is a close
                      [0065]    [0065]fig.   -1            5]    [0065]fig. 6 is a close
  entor: david rains          issued: jul.   -1                issued: jul. 7, 1998     
                                      jan.   -1           4 issued on jan. 14, 1986 nath
              mackel 

## Identification du meilleur candidat

In [458]:
def candidates_for_onenumber( legend, i ):
    """Select the rows of `legend` whose number is `i` and annotate them.

    Each selected row receives two extra keys, written on the row itself
    (so the rows inside `legend` are modified):
        'reversed' : the words of the label (split on single spaces), last word first
        'weight'   : the number of positions of the row

    Returns the list of selected rows.
    """
    selected = list( filter( lambda row: row['number'] == i, legend ) )

    for row in selected:
        words = row['label'].split(' ')
        row['reversed'] = list( reversed( words ) )
        row['weight'] = len( row['position'] )

    return selected

In [459]:
# test: candidates for the number 34
# (the call also adds 'reversed' and 'weight' to the matching rows of legend)
print( candidates_for_onenumber(legend, 34) )

[{'reversed': ['end', 'second'], 'weight': 1, 'label': 'second end', 'number': 34, 'context': 'nd a second end  34 , as shown', 'position': [(27902, 27904)]}, {'reversed': ['end', 'second', 'shaft', 'extension'], 'weight': 1, 'label': 'extension shaft second end', 'number': 34, 'context': 'haft second end  34 . the exte', 'position': [(28101, 28103)]}, {'reversed': ['end', 'second', 'shaft', 'extension'], 'weight': 1, 'label': 'extension shaft second end', 'number': 34, 'context': 'haft second end  34  using a s', 'position': [(30697, 30699)]}]


In [460]:
def increment_dico( dico, key, value ):
    """Add `value` to dico[key]; a missing key starts at `value`."""
    if key not in dico:
        dico[key] = value
        return
    dico[key] += value
        
def append_dico( dico, key, value ):
    """Append `value` to the list stored at dico[key], creating it if needed."""
    dico.setdefault( key, [] ).append( value )

def who_are_the_winners( scoreboard ):
    """Return the keys of `scoreboard` holding the highest score.

    Several keys are returned on a tie, in the iteration order of the dict.
    """
    best_score = max( scoreboard.values() )
    return [ name for name, score in scoreboard.items() if score == best_score ]

In [461]:
def choose_best_label( remainingcandidates ):
    """Pick a single winning label among the candidates of one number.

    The label is built word by word, starting from the last word:
        - when one word outweighs all the others it is kept, and only the
          candidates having this word stay in the race;
        - on a tie, or when the winning candidates have no word left, the
          label is cut there.

    remainingcandidates: list of dicts with the keys 'label', 'reversed'
        (the words of the label, last word first) and 'weight'.

    Returns the chosen label (str).  It is empty when there is no candidate
    or when the last words are already tied; a lone candidate gets its
    label back unchanged.  At most 20 words are chosen.
    """
    if not remainingcandidates:
        return ''

    if len( remainingcandidates ) == 1:
        return remainingcandidates[0]['label']

    max_words = 20
    chosen_words = []   # from the last word of the label to the first one
    for depth in range( max_words ):
        scoreboard = {}        # word -> summed weight of the candidates having it
        word2candidates = {}   # word -> those candidates
        for c in remainingcandidates:
            words = c['reversed']
            # '' stands for "this candidate has no word left"
            next_word = words[depth] if depth < len(words) else ''

            word2candidates.setdefault( next_word, [] ).append( c )
            scoreboard[next_word] = scoreboard.get( next_word, 0 ) + c['weight']

        best_score = max( scoreboard.values() )
        winners = [ word for word, score in scoreboard.items() if score == best_score ]

        # tie, or the winners are out of words -> stop
        if len(winners) != 1 or not winners[0]:
            break

        chosen_words.append( winners[0] )
        remainingcandidates = word2candidates[ winners[0] ]

    # built after the loop so that a result exists even if no break happened
    return ' '.join( reversed( chosen_words ) )

In [462]:
# test: best label for the number 50
candidates = candidates_for_onenumber(legend, 50)
choose_best_label( candidates )

'mirror'

In [463]:
# only strictly positive numbers are kept, which leaves out the -1 of the
# disqualified candidates
allnumbers = sorted( { c['number'] for c in legend if c['number']>0 } ) # unique and sorted
print( allnumbers )

[10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 50, 52, 54, 56, 90, 360]


In [464]:
def getUniqueLabel(legend):
    """Build the final legend: one label per strictly positive number.

    For each number, in increasing order, the best label is chosen among
    its candidates; numbers whose chosen label is empty are left out.

    Returns a list of dicts {'numero': ..., 'label': ...}.
    """
    positive_numbers = { row['number'] for row in legend if row['number'] > 0 }

    final_legend = []
    for numero in sorted( positive_numbers ):
        thelabel = choose_best_label( candidates_for_onenumber(legend, numero) )
        if thelabel:
            final_legend.append( {'numero':numero , 'label':thelabel } )

    return final_legend

In [465]:
# build the final legend and print it, one "number  label" line per entry
final_legend = getUniqueLabel(legend)
for line in final_legend:
    print( '{numero:>5}  {label}'.format( **line ) )

   10  device
   12  user
   14  hand
   16  toe
   18  nail clipper assembly
   24  cutting jaws
   26  cutting jaws
   28  lever
   30  extension shaft
   32  first end
   34  extension shaft second end
   36  pistol grip handle
   38  compartment
   40  trigger mechanism
   42  first roller
   44  second roller
   46  cord
   50  mirror
   52  mirror extension arm
   54  second double ball joints
   56  second double ball joints


In [466]:
def findAndFilter( description ):
    """Run the candidate pipeline on a description.

    Steps, in order: fullTextTrimming, findAllCandidates, firstScreening
    and cutLabels.  Returns the resulting list of candidates.
    """
    trimmed = fullTextTrimming( description )
    candidates = findAllCandidates( trimmed )
    screened = firstScreening( candidates )
    return cutLabels( screened )

def extract_finale_legend( description ):
    """Return the final legend of a description.

    The candidates given by findAndFilter are reduced to one label per
    number by getUniqueLabel.
    """
    return getUniqueLabel( findAndFilter( description ) )

In [467]:
# test: whole pipeline, starting again from the untrimmed description
description = patent['description']
extract_finale_legend( description )

[{'label': 'device', 'numero': 10},
 {'label': 'user', 'numero': 12},
 {'label': 'hand', 'numero': 14},
 {'label': 'toe', 'numero': 16},
 {'label': 'nail clipper assembly', 'numero': 18},
 {'label': 'cutting jaws', 'numero': 24},
 {'label': 'cutting jaws', 'numero': 26},
 {'label': 'lever', 'numero': 28},
 {'label': 'extension shaft', 'numero': 30},
 {'label': 'first end', 'numero': 32},
 {'label': 'extension shaft second end', 'numero': 34},
 {'label': 'pistol grip handle', 'numero': 36},
 {'label': 'compartment', 'numero': 38},
 {'label': 'trigger mechanism', 'numero': 40},
 {'label': 'first roller', 'numero': 42},
 {'label': 'second roller', 'numero': 44},
 {'label': 'cord', 'numero': 46},
 {'label': 'mirror', 'numero': 50},
 {'label': 'mirror extension arm', 'numero': 52},
 {'label': 'second double ball joints', 'numero': 54},
 {'label': 'second double ball joints', 'numero': 56}]

# Remarques
### Comment sélectionner le meilleur candidat ?

 * si unique:  ok
       et si unique et len()==40... ? c.a.d. pas coupé
 * si max unique: ok
 * sinon : selectionne les qualifiés, puis...
 * exaequo...  construit un arbre de comptage ?




### To look
* patents number: 3,803,713  
* comment ne pas modifier le texte, pour pouvoir l'annoter ?
* parenthèses : between the elongated plates ( 1 , 2 ).
* line 5 - 5
*      march   15                 march 15,   
en debut de la description ...
* generally... adverbes ?
* " respectivement A et B " ... respective
* lettres sur les vieux brevets
* merge keywords and small_words ??
* pluriel

* sort of recursive... identifier un permet d'en déduire un autre...

## other things
* Find values like "110 volts"... 

In [468]:
# Find values
# A value is: a space, 1 to 4 digits, an optional '.' or ',' followed by
# 1 to 4 digits, a space, one of the units, then one non-word character.
units = ['inch', 'volt', 'm', 'mm']
find_sci_values =  r" (\d{1,4}([\.,]\d{1,4})? (%s)\W)" % '|'.join(units)

# print the matches, then replace each of them by the '<value>' placeholder
print( regex.findall(find_sci_values, description) )
description = regex.sub(find_sci_values, '<value>', description  )

[]


In [469]:
# Find values
# (same code as the previous cell, applied to the already substituted text)
units = ['inch', 'volt', 'm', 'mm']
find_sci_values =  r" (\d{1,4}([\.,]\d{1,4})? (%s)\W)" % '|'.join(units)

# print the matches, then replace each of them by the '<value>' placeholder
print( regex.findall(find_sci_values, description) )
description = regex.sub(find_sci_values, '<value>', description  )

[]


In [470]:
# Find floats
# A float is: a space, 1 to 4 digits, '.' or ',', 1 to 4 digits, then one
# non-word character.
# NOTE(review): in the output below every match looks like the beginning of
# a comma-grouped number such as '4,176,...' (presumably patent numbers),
# not like a decimal value - to be confirmed.
find_sci_floats =  r" (\d{1,4}([\.,]\d{1,4})\W)" 

# print the matches, then replace each of them by the '<value>' placeholder
print( regex.findall(find_sci_floats, description) )
description = regex.sub(find_sci_floats, '<value>', description  )

[('4,176,', ',176'), ('4,564,', ',564'), ('4,846,', ',846'), ('4,956,', ',956'), ('5,357,', ',357'), ('5,775,', ',775'), ('5,926,', ',926'), ('4,176,', ',176'), ('4,564,', ',564'), ('4,847,', ',847'), ('4,956,', ',956'), ('5,775,', ',775'), ('5,357,', ',357'), ('5,926,', ',926'), ('5,357,', ',357')]


In [471]:
# " between 0.4 to 2.5 " 
# 'between', then two runs of 1 to 4 digits/dots/commas joined by ' to ' or
# ' and ', then one non-word character.
betweenpattern = r"(between [\d\.,]{1,4} (to|and) [\d\.,]{1,4}\W)"

# print the matches, then replace each of them by the '<between>' placeholder
print( regex.findall(betweenpattern, description) )
description = regex.sub(betweenpattern, '<between>', description  )

[]
