# Extraire la légende depuis la description

In [249]:
import pickle
import regex   # permet overlapping matching
import copy

In [250]:
def print_legend( legend ):
    """Print the legend candidates as an aligned table, grouped by number.

    Each element of `legend` is a dict with at least the keys 'number',
    'label', 'position' (a list of spans) and 'context'.

    Rows are sorted by number, then by how many positions the row holds,
    then by the label read backwards (so labels sharing the same ending
    end up next to each other).  A '     --' line is printed each time
    the number changes.  Nothing is printed for an empty legend.
    """
    if not legend:
        # Guard: the grouping below needs a first row to start from.
        return

    def sort_key( row ):
        return ( row['number'], len(row['position']), row['label'][::-1] )

    legendsorted = sorted( legend, key=sort_key )

    current_number = legendsorted[0]['number']
    for match in legendsorted:
        if current_number != match['number']:
            print('     --')
            current_number = match['number']

        # Show the number of positions only when the row holds more than one.
        n_merged = len(match['position'])
        n_merged_str = '  ' if n_merged==1 else '(%i)'%n_merged
        print( '{label:>42} {number:>4}  {n}  {context:>35}'.format( **match, n=n_merged_str) )

In [251]:
# load pre-processed data
# NOTE: unpickling can execute arbitrary code; only load files you trust.
# The `with` block closes the file handle once the data is loaded.
with open( "../web/data/patent_infos.pickle", "rb" ) as pickle_file:
    data = pickle.load( pickle_file )
len(data)

379

In [252]:
# select a patent

# Index of the patent to inspect; the trailing values are other indices noted by the author.
k = 44 #70 #44 #41
# `data` is indexed by position through its values; the k-th value is used.
patent = list(data.values())[k]
description = patent['description']

print( patent['patent_number'] )
print( 'https://www.google.com/patents/%s'%patent['patent_number'] )

# NOTE(review): split(' ') also counts the empty strings between consecutive
# spaces and does not split on newlines, so this is only a rough word count.
print('nombre de mots :', len(description.split(' ')))
print('nombre de caractères :', len(description))

US2664624
https://www.google.com/patents/US2664624
nombre de mots : 3555
nombre de caractères : 19112


In [253]:
def fullTextTrimming( description ):
    """Normalise the whole description while keeping its length unchanged.

    Every step maps one character to exactly one character, so character
    offsets computed on the result are also valid in the input text:
        - newlines become spaces,
        - non-ASCII characters become spaces,
        - the text is lower-cased.

    Returns the normalised string.
    """
    description = description.replace('\n', ' ')

    # Blank out non-ASCII characters *before* lower-casing: str.lower() can
    # expand some non-ASCII characters into several characters, which would
    # change the length of the text.
    description = regex.sub(r'[^\x00-\x7F]', ' ', description)

    # Runs of spaces are not collapsed and patent numbers are not removed
    # here; both operations would change the number of characters.
    return description.lower()

In [254]:
# test: overwrite `description` with its trimmed version and print its length,
# to be compared with the character count printed before trimming.
description = fullTextTrimming( description )
print('nombre de caractères :', len(description))

nombre de caractères : 19112


In [255]:
def findAllCandidates( description ):
    """Find the isolated numbers that may be legend (reference) numbers.

    A candidate group is a space followed by 1-3 digit numbers separated by
    commas, ' and ' or ' or ', for example " 4, 5 and 6 ", " 6 and 7 " or
    " 6, 7 ".  The group must not be directly followed by a lower-case
    letter, a digit or a hyphen.

    Returns a legend: a list with one dict per number found, holding
        'number'   : the number as an int,
        'label'    : up to 40 characters preceding the group, with
                     surrounding spaces stripped,
        'position' : a one-element list with the (start, end) span of the
                     whole group (shared by every number of the group),
        'context'  : a short excerpt around the group, for debugging.
    """
    # General form: E(SE)* with E an element (number) and S a separator.
    # The closing delimiter is only looked at (lookahead), not consumed, so
    # that it can also serve as the leading space of the next candidate;
    # '$' lets a number at the very end of the text be found as well.
    pattern = r' (\d{1,3}(?:(?: ?, ?| and | or )\d{1,3})*)(?=[^a-z0-9\-]|$)'

    legend = []
    for m in regex.finditer(pattern, description):
        start = m.start()
        numbers = regex.findall(r'\d+', m.group(1))

        # The label stops right before the leading space of the match.
        label = description[ max(0, start-40) : start ].strip(' ')

        # 16 characters before the match and 11 after it (the delimiter
        # plus 10 more); slicing clamps at the end of the text.
        context = description[ max(0, start-16) : m.end()+11 ]

        for n in numbers:
            item = {'number':int(n), 'label':label, 'position':[m.span(1)], 'context':context}
            legend.append( item )

    return legend

In [256]:
# test: build the raw candidate legend and print how many candidates were found
legend = findAllCandidates( description )
print(len(legend))

137


In [257]:
# Display the raw candidates as a table.
print_legend( legend )

  ng. through the opening in the members i    0            in the members i 0 and i] and
  s necessary to subject the jaw members i    0            he jaw members i 0 and ii to 
     --
  ogether for the clipping or trimming op-    1             or trimming op- 1 eration. a
   invention.    in this drawing:     fig.    1            rawing:     fig. 1 is a plan 
  ansverse section on the line 3-3 of fig.    1            line 3-3 of fig. 1;     fig. 
  e elevation of the clipper shown in fig.    1            er shown in fig. 1;     fig. 
   1;     fig. 4 is a view similar to fig.    1             similar to fig. 1 but showin
  ber when in the folded position of fi s.    1      osition of fi s. 1 and 2 to provide
  ver is raised from the position of figs.    1      osition of figs. 1 and 2 about its 
  r turned over from the position of figs.    1      osition of figs. 1 and 2, so that t
   closed or inoperative position of figs.    1      osition of figs. 1 and 2 and preven
  e top of th

In [258]:
# Scratch check: list(a) builds a new list, so changing `a` afterwards
# leaves `b` unchanged.
a = list( range(5) )
b = list( a )
a[4] = 10
b

[0, 1, 2, 3, 4]

In [259]:
def disqualify( legend, pattern, n=-1 ):
    """Set 'number' to `n`, in place, on every row whose label matches `pattern`.

    `pattern` is a compiled regular expression; it is applied with search().
    """
    for row in legend:
        if pattern.search( row['label'] ):
            row['number'] = n

# A label ending with one of these words is not a part name.
KEYWORDS = [
    # months (dates in the cited references)
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
    # patent vocabulary
    'fig', 'figure', 'figures', 'claim', 'claims', 'at', 'and',
    'numeral', 'embodiment', 'invention', 'part',
    # common words
    'the', 'a', 'an', 'these', 'their', 'when', 'with', 'by', 'this',
    'have', 'having', 'has', 'is', 'are', 'over', 'its', 'of said', 'as',
    'of', 'in', 'to', 'but', 'another', 'through', 'on',
]

def firstScreening( legend ):
    """First filter, based on how each label ends.

    A candidate is disqualified when its label
        - is empty or does not end with a letter (typically "Fig. 5"),
        - ends with one of KEYWORDS ('claim', ...) or a common word;
          this includes a label made of the keyword alone.

    Disqualified candidates get their 'number' set to -1 (the default of
    disqualify()).  Returns a deep copy; the input legend is not modified.
    """
    newlegend = copy.deepcopy(legend)

    # '^' before '$' makes the empty label match as well.
    not_a_letter = regex.compile( r'(?:^|[^a-z])$', regex.I )
    disqualify( newlegend, not_a_letter )

    # '\b' (rather than a mandatory preceding character) also catches a
    # label that consists of the keyword only.
    alternatives = '|'.join( regex.escape(word) for word in KEYWORDS )
    ends_with_keyword = regex.compile( r'\b(?:%s)$' % alternatives, regex.I )
    disqualify( newlegend, ends_with_keyword )

    return newlegend

In [260]:
# Replace the legend by its screened copy and display it.
legend = firstScreening(legend) 
print_legend( legend )

  s l3, m are spaced apart. these jaws l3,   -1           . these jaws l3, 14 are normal
  ogether for the clipping or trimming op-   -1             or trimming op- 1 eration. a
  tos may 26, 1925 1,702,137 schnefel feb.   -1           37 schnefel feb. 12, 1929 2,47
      rmal operating jaw-closing pressure.   -1            ng pressure.     2. the combi
   invention.    in this drawing:     fig.   -1            rawing:     fig. 1 is a plan 
  tion on the line 3-3 of fig. 1;     fig.   -1            fig. 1;     fig. 4 is a view 
  of the clipper shown in fig. 1;     fig.   -1            fig. 1;     fig. 3 is a trans
   the features of the invention;     fig.   -1            ention;     fig. 5 is a side 
  ne embodiment of the invention;     fig.   -1            ention;     fig. 2 is a side 
  and the nail file extended, and     fig.   -1            ed, and     fig. 6 is a parti
  39; downward as &#39;sh own at 31c, fig.   -1            own at 31c, fig. 5. this brin
  tter in the clippin

## Coupe sur certains mots-clés (the, a, an, ...)

In [261]:
def cut_legend_using_pattern( legend, pattern ):
    """Shorten labels in place using a compiled regular expression.

    For every row whose label matches `pattern` (via search()), the label
    is replaced by the text of capture group 2.  Other rows are untouched.
    """
    for row in legend:
        found = pattern.search( row['label'] )
        if found is None:
            continue
        row['label'] = found.group(2)
            
# Common words on which a label is cut (see cutLabels).
SMALL_WORDS = [
    'the', 'a', 'an', 'these', 'their', 'when', 'with', 'by', 'this', 'that',
    'have', 'having', 'has', 'is', 'are', 'should', 'over', 'its', 'of said',
    'and', 'as',
    'of', 'in', 'to', 'at', 'but', 'another', 'through', 'on', 'same', 'from',
    'include', 'includes', 'beyond', 'between',
]

In [262]:
def cutLabels( legend ):
    """Return a copy of `legend` whose labels have been shortened.

    Two cuts are applied one after the other; each keeps only the text
    that follows the last occurrence of a delimiter in the label:
        1. a common word from SMALL_WORDS between two spaces ('the', 'a', ...),
        2. a '.', ',' or ';' followed by a space.

    The input legend is not modified (a deep copy is edited).
    """
    newlegend = copy.deepcopy(legend)

    pattern_small_words = regex.compile( r'^.* (%s) (.+)$' % '|'.join(SMALL_WORDS), regex.I )
    cut_legend_using_pattern( newlegend, pattern_small_words )

    # Raw string: in a plain literal '\.' is an invalid escape sequence,
    # which recent Python versions warn about.
    pattern_punctuation = regex.compile( r'^.*([\.,;] )(.+)$' )
    cut_legend_using_pattern( newlegend, pattern_punctuation )

    return newlegend

In [263]:
# test: replace the legend by its copy with shortened labels and display it
legend = cutLabels(legend) 
print_legend( legend )

                                  jaws l3,   -1           . these jaws l3, 14 are normal
                  clipping or trimming op-   -1             or trimming op- 1 eration. a
              1925 1,702,137 schnefel feb.   -1           37 schnefel feb. 12, 1929 2,47
      rmal operating jaw-closing pressure.   -1            ng pressure.     2. the combi
                                      fig.   -1            er shown in fig. 1;     fig. 
                                      fig.   -1            line 3-3 of fig. 1;     fig. 
                                      fig.   -1             similar to fig. 1 but showin
                                      fig.   -1            shown at in fig. 2, to&#39; r
                                      fig.   -1            position of fig. 4 and then i
                                      fig.   -1            position of fig. 5 over the u
                                      fig.   -1            as shown in fig. 6, the bar 2
                     

### Merge : regroupe les candidats ayant un label identique

Rq: utile pour le dev. mais pas pour l'identification

In [264]:
def get_positions( label, candidats ):
    """Gather, in order, the positions of every candidate whose label equals `label`.

    Returns a new flat list built from the 'position' lists of the
    matching candidates.
    """
    return [ span
             for item in candidats
             if item['label'] == label
             for span in item['position'] ]

def get_context( label, candidats ):
    """Return the 'context' of the first candidate whose label equals `label`.

    Raises IndexError when no candidate carries that label.
    """
    same_label = list( filter( lambda item: item['label'] == label, candidats ) )
    first = same_label[0]
    return first['context']

def merge( legend ):
    """Merge the rows that share both the same number and the same label.

    Returns a new list with one row per distinct (number, label) pair, in
    order of first appearance in `legend`.  Each merged row has
        'number', 'label' : the shared values,
        'position'        : the positions of all merged rows, in order,
        'context'         : the context of the first merged row.

    The input rows are not modified.
    """
    rows_by_key = {}
    new_legend = []

    # Single pass; the list keeps a reproducible first-seen order, which
    # iterating over sets of numbers and labels does not.
    for item in legend:
        key = ( item['number'], item['label'] )
        row = rows_by_key.get( key )
        if row is None:
            row = { 'number':item['number'], 'label':item['label'],
                    'position':[], 'context':item['context'] }
            rows_by_key[key] = row
            new_legend.append( row )
        row['position'].extend( item['position'] )

    return new_legend

In [265]:
# Display the merged view only; `legend` itself is not reassigned here.
print_legend( merge( legend ) )

                                  jaws l3,   -1           . these jaws l3, 14 are normal
                  clipping or trimming op-   -1             or trimming op- 1 eration. a
              1925 1,702,137 schnefel feb.   -1           37 schnefel feb. 12, 1929 2,47
      rmal operating jaw-closing pressure.   -1            ng pressure.     2. the combi
                         drawing:     fig.   -1            rawing:     fig. 1 is a plan 
                                  see fig.   -1            ration, see fig. 5. in moving
                         dotted lines fig.   -1            otted lines fig. 4, to a more
                                nail (fig.   -1            t the nail (fig. 6), so that 
                               upper (fig.   -1             the upper (fig. 6) side, i. 
               1929 2,477,782 bassett aug.   -1                 782 bassett aug. 2, 1949
                                      jan.   -1                        jan. 5, i954 w. e
  50 zhwemor m @m (it

## Identification du meilleur candidat

In [266]:
def candidates_for_onenumber( legend, i ):
    """Return the rows of `legend` whose number is `i`, ready for label selection.

    Two keys are added to each returned row (the rows are the same dict
    objects as in `legend`, so the legend is updated in place):
        'reversed' : the words of the label, last word first,
        'weight'   : the number of positions held by the row.
    """
    candidates = [ c for c in legend if c['number'] == i ]

    for c in candidates:
        # split() without argument ignores runs of spaces; split(' ') would
        # produce empty "words", and the empty string is what
        # choose_best_label uses to mean "no more words in this label".
        c['reversed'] = c['label'].split()[::-1]
        c['weight'] = len( c['position'] )

    return candidates

In [267]:
# Inspect the candidates for number 34 (the call also adds the 'reversed'
# and 'weight' keys to the matching rows of `legend`).
print( candidates_for_onenumber(legend, 34) )

[{'number': 34, 'position': [(12066, 12068)], 'weight': 1, 'context': 'securing flanges 34. this eyel', 'reversed': ['flanges', 'securing', 'form'], 'label': 'form securing flanges'}]


In [268]:
def increment_dico( dico, key, value ):
    """Add `value` to dico[key] in place, starting from `value` when the key is new."""
    if key not in dico:
        dico[key] = value
        return
    dico[key] += value
        
def append_dico( dico, key, value ):
    """Append `value` to the list stored at dico[key], creating the list when the key is new."""
    bucket = dico.setdefault( key, [] )
    bucket.append( value )

def who_are_the_winners( scoreboard ):
    """Return the list of keys of `scoreboard` that hold the highest score.

    Several keys are returned in case of a tie.  Raises ValueError (from
    max()) when the scoreboard is empty.
    """
    best = max( scoreboard.values() )
    return [ name for name, score in scoreboard.items() if score == best ]

In [297]:
def choose_best_label( remainingcandidates ):
    """Choose a single winning label among the candidates of one number.

    Each candidate is a dict with the keys 'label', 'reversed' (the words
    of the label, last word first) and 'weight'.

    With a single candidate its label is returned as is.  Otherwise the
    label is built word by word, starting from the end:
        - the weights of the candidates are summed per next word;
        - if one word scores strictly higher than all the others it is
          kept, and only the candidates carrying it stay in the race;
        - on a tie, or when the best "word" is the end of the labels,
          the label is cut there.

    Returns the chosen label; '' when there is no candidate or when the
    candidates already disagree on the last word.
    """
    if not remainingcandidates:
        return ''

    if len( remainingcandidates ) == 1:
        return remainingcandidates[0]['label']

    chosen_words = []
    # The loop ends at the latest once every remaining label is exhausted:
    # all candidates then vote for '' and the selection stops.
    while True:
        depth = len( chosen_words )

        scoreboard = {}           # next word -> summed weight
        candidates_by_word = {}   # next word -> candidates carrying it
        for c in remainingcandidates:
            words = c['reversed']
            # '' stands for "this label has no more words".
            next_word = words[depth] if depth < len(words) else ''

            candidates_by_word.setdefault( next_word, [] ).append( c )
            scoreboard[next_word] = scoreboard.get( next_word, 0 ) + c['weight']

        best_score = max( scoreboard.values() )
        winners = [ word for word, score in scoreboard.items() if score == best_score ]

        if len(winners) != 1 or not winners[0]:
            break  # tie, or end of the labels

        chosen_words.append( winners[0] )
        remainingcandidates = candidates_by_word[ winners[0] ]

    return ' '.join( reversed( chosen_words ) )

In [283]:
# test: pick the label for number 33 (the cell displays the returned string)
candidates = candidates_for_onenumber(legend, 33)
choose_best_label( candidates )

''

In [284]:
# Unique, sorted numbers; the `>0` test leaves out the disqualified rows
# (number -1) as well as the number 0.
allnumbers = sorted( { c['number'] for c in legend if c['number']>0 } ) # unique and sorted
print( allnumbers )

[1, 2, 3, 4, 15, 18, 20, 21, 22, 23, 24, 25, 26, 28, 30, 31, 32, 33, 34, 35, 36, 39, 538, 624, 664, 995]


In [287]:
# Build the final legend: one {'numero', 'label'} entry per number, with the
# label selected among that number's candidates.
final_legend = []
for numero in allnumbers:
    candidates = candidates_for_onenumber(legend, numero)
    thelabel = choose_best_label( candidates )

    line = {'numero':numero , 'label':thelabel  }
    final_legend.append( line )

In [298]:
# Print the final legend, one "number  label" line per entry.
for line in final_legend:
    print( '{numero:>5}  {label}'.format( **line ) )

    1  
    2  
    3  file
    4  i
   15  curved cutting or knife edges
   18  aligned openings
   20  slot
   21  
   22  bar
   23  lever
   24  opening
   25  
   26  lug
   28  free end
   30  nail file
   31  portion
   32  free end
   33  
   34  form securing flanges
   35  upwardly extending lugs
   36  
   39  nail file
  538  t united states patents number name date
  624  bassett
  664  bassett
  995  t united states patents number name date


# Remarques
### Comment sélectionner le meilleur candidat ?

 * si unique:  ok
       et si unique et len()==40... ? c.a.d. pas coupé
 * si max unique: ok
 * sinon : selectionne les qualifiés, puis...
 * exaequo...  construit un arbre de comptage ?




### To look
* comment ne pas modifier le texte, pour pouvoir l'annoter ?
* parenthèses : between the elongated plates ( 1 , 2 ).
* line 5 - 5
*      march   15                 march 15,   
en debut de la description ...
* generally... adverbes ?
* " respectivement A et B " ... respective
* letter sur les vieux brevets
* merge keywords and small_words ??
* pluriel

* sort of recursive... identifier un permet d'en déduire un autre...

## other things
* Find values like "110 volts"... 

In [272]:
# Find values
# Looks for a number (optionally with a '.' or ',' decimal part) followed by
# a space and one of the units below.
units = ['inch', 'volt', 'm', 'mm']
# The unit must be directly followed by a non-word character (\W).
find_sci_values =  r" (\d{1,4}([\.,]\d{1,4})? (%s)\W)" % '|'.join(units)

print( regex.findall(find_sci_values, description) )
# NOTE(review): the whole match is replaced, including the leading space and
# the character after the unit, and `description` is overwritten, so its
# length (and any character offset computed earlier) changes.
description = regex.sub(find_sci_values, '<value>', description  )

[]


In [273]:
# Find values
# NOTE(review): same statements as the previous "Find values" cell; it is run
# on the `description` already rewritten by that cell.
units = ['inch', 'volt', 'm', 'mm']
find_sci_values =  r" (\d{1,4}([\.,]\d{1,4})? (%s)\W)" % '|'.join(units)

print( regex.findall(find_sci_values, description) )
description = regex.sub(find_sci_values, '<value>', description  )

[]


In [274]:
# Find floats
# A space, 1-4 digits, '.' or ',', 1-4 digits, then a non-word character.
# NOTE(review): ',' is accepted as a decimal mark, so the start of a
# comma-grouped number such as "2,664," matches as well.
find_sci_floats =  r" (\d{1,4}([\.,]\d{1,4})\W)" 

print( regex.findall(find_sci_floats, description) )
# The whole match (leading space included) is replaced and `description` is overwritten.
description = regex.sub(find_sci_floats, '<value>', description  )

[('2,664,', ',664'), ('172,077 ', ',077'), ('2,477,', ',477'), ('1,538,', ',538'), ('1,702,', ',702'), ('2,477,', ',477')]


In [275]:
# " between 0.4 to 2.5 " 
# Matches "between X to Y" or "between X and Y", where X and Y are 1 to 4
# characters taken from digits, '.' and ',', followed by a non-word character.
betweenpattern = r"(between [\d\.,]{1,4} (to|and) [\d\.,]{1,4}\W)"

print( regex.findall(betweenpattern, description) )
# The matched text is replaced by '<between>' and `description` is overwritten.
description = regex.sub(betweenpattern, '<between>', description  )

[]
