# Extraire la legende depuis la description

In [355]:
import pickle
import regex   # permet overlapping matching
import copy

In [356]:
def print_legend( legend ):
    sortfun = lambda x: ( x['number'] , len(x['position']), x['label'][::-1] )
    legendsorted = sorted( legend, key=sortfun  )

    current_number = legendsorted[0]['number']
    for match in legendsorted:
        if current_number != match['number']:
            print('     --')
            current_number = match['number']

        n_merged = len(match['position'])
        n_merged_str = '  ' if n_merged==1 else '(%i)'%n_merged
        print( '{label:>42} {number:>4}  {n}  {context:>35}'.format( **match, n=n_merged_str) )

In [357]:
# load pre-processed data
data = pickle.load( open( "../web/data/patent_infos.pickle", "rb" ) )
len(data)

379

In [379]:
# test: select a patent

k = 44  #70 #44 #41
patent = sorted( data.values(), key=lambda x:x['patent_number'] )[k]
description = patent['description']

print( patent['patent_number'] )
print( 'https://www.google.com/patents/%s'%patent['patent_number'] )

print('nombre de mots :', len(description.split(' ')))
print('nombre de caractères :', len(description))

US20030094183
https://www.google.com/patents/US20030094183
nombre de mots : 7150
nombre de caractères : 33448


In [359]:
def fullTextTrimming( description ):
    """ Travail sur le texte entier
        en gardant le nombre de caractère constant
    """
    description = description.replace('\n', ' ')
    description = description.lower() 
    
    # description = regex.sub('\s+',' ', description) # remove multiple space
    description = description.lower()

    # patent number XX
    # patent_number_pattern = r'(\d{1, 4}[,-])?\d{3, 8}[,-]\d{3, 8}'
    # description = regex.sub(patent_number_pattern, '', description  )

    # remove non-AASII caracters
    description = regex.sub(r'[^\x00-\x7F]', ' ', description)
    
    return description

In [360]:
# test
description = fullTextTrimming( description )
print('nombre de caractères :', len(description))

nombre de caractères : 10785


In [361]:
def findAllCandidates( description ):
    """ Recherche  les chiffres isolés pouvant correspondre à un numéro de légende

        par exemple   " 4, 5 and 6 ", ou " 6 and 7 ", " 6, 7 "

        retourne une legend : liste avec les infos pour chaque 'match'
    """ 
    
    pattern =  r' (\d{1,3}(( ?, ?| and | or )\d{1,3})*)[^a-z0-9\-]'

    # Explication:
    # la forme générale est E(SE) avec Element  Separator
    # et le séparateur est une virgule ou and ou or 
   
    allmatches = list( regex.finditer(pattern, description) )
    splitpattern = regex.compile(r'[^\d]+')
    
    legend = []
    for m in allmatches:
        capture = m.group(1)
        numbers = splitpattern.split( capture )
        start = m.start()

        label = description[ max(0, m.start()-40 ): m.start() ]  # to prevent overlapping
        label = label.strip(' ')  # cas avec plusieurs espace consécutifs
        
        context = description[ max(0, m.start()-16 ): min( len(description), m.end()+10 ) ]  # to debug

        for n in numbers:
            item = {'number':int(n), 'label':label, 'position':[m.span(1)], 'context':context}
            legend.append( item )

    return legend

In [362]:
# test
legend = findAllCandidates( description )
print(len(legend))

53


In [363]:
print_legend( legend )

   3  of 1.5-4 mm, with a thickness  4  of    0            thickness  4  of 0.5-1 mm. it
     --
                                              1                       1,192,748           
  n of drawings           [0008]      fig.    1            [0008]      fig. 1  is an obl
  2  of about 25-40 mm, and a width  3  of    1            d a width  3  of 1.5-4 mm, wi
     --
                                              2                       2,610,399           
                                              2                       2,676,595           
   u  shaped.             [0010]      fig.    2            [0010]      fig. 2   b  is a 
  nail plate.             [0009]      fig.    2            [0009]      fig. 2   a  is an
       fig. 2   b  is a side view of  fig.    2            de view of  fig. 2   a , illu
  uding rounded a  u  shape shown in  fig.    2            e shown in  fig. 2   a  or an
   flat, rectangular shape having a length    2            having a length  2  of about 

In [364]:
a = list( range(5) )
b = list( a )
a[4] = 10
b

[0, 1, 2, 3, 4]

In [365]:
def disqualify( legend, pattern, n=-1 ):
    for i, row in enumerate( legend ):
        if pattern.search( row['label'] ):
            legend[ i ]['number'] = n

KEYWORDS = ['january', 'february', 'march', 'april', 'may', 'june',\
            'july', 'august', 'september', 'october', 'november', 'december',\
            'fig', 'figure', 'figures', 'claim', 'claims', 'at', 'and',\
            'numeral', 'embodiment', 'invention', 'part']

KEYWORDS.extend( ['the', 'a', 'an', 'these', 'their', 'when', 'with', 'by', 'this', \
                   'have', 'having', 'has', 'is', 'are', 'over', 'its', 'of said', 'and', 'as',\
                   'of', 'in', 'to', 'but', 'another', 'through', 'on'] )
            
def firstScreening( legend ):
    """ Premier filtrage en regardant comment se termine le label,
        Critères éliminants:
        
            - ne se termine pas par une lettre (Typiquement Fig. 5)
            - se termine par un des mots clés ('claim') ou un mot commun
            
        change le numéro pour 998 ou 999..
    """
    newlegend = copy.deepcopy(legend) # copy (pour ne pas ecrasser Legend)
    
    pattern = regex.compile( r'[^a-z]$', regex.I )
    disqualify( newlegend, pattern )
   

    pattern = regex.compile( r'\W(%s)$' % '|'.join(KEYWORDS), regex.I )
    disqualify( newlegend, pattern )

    return newlegend

In [366]:
legend = firstScreening(legend) 
print_legend( legend )

   u  shaped.             [0010]      fig.   -1            [0010]      fig. 2   b  is a 
  extensions.             [0011]      fig.   -1            [0011]      fig. 3   a  is an
   v  shaped.             [0012]      fig.   -1            [0012]      fig. 3   b  is a 
  n of drawings           [0008]      fig.   -1            [0008]      fig. 1  is an obl
  nail plate.             [0009]      fig.   -1            [0009]      fig. 2   a  is an
       fig. 2   b  is a side view of  fig.   -1            de view of  fig. 2   a , illu
       fig. 3   b  is a side view of  fig.   -1            de view of  fig. 3   a , illu
  uding rounded a  u  shape shown in  fig.   -1            e shown in  fig. 2   a  or an
    or an angular  v  shape shown in  fig.   -1            e shown in  fig. 3   a , with
  2  of about 25-40 mm, and a width  3  of   -1            d a width  3  of 1.5-4 mm, wi
   3  of 1.5-4 mm, with a thickness  4  of   -1            thickness  4  of 0.5-1 mm. it
     --
             

## Coupe sur certain mot clé (the, a, an, ...)

In [367]:
def cut_legend_using_pattern( legend, pattern ):
    for i, row in enumerate( legend ):
        label = row['label']

        match = pattern.search(label)

        if match:
            newlabel = match.group(2)
            legend[i]['label'] = newlabel
            
SMALL_WORDS = ['the', 'a', 'an', 'these', 'their', 'when', 'with', 'by', 'this', 'that', \
               'have', 'having', 'has', 'is', 'are', 'should', 'over', 'its', 'of said', 'and', 'as',\
               'of', 'in', 'to', 'at',  'but', 'another', 'through', 'on', 'same', 'from',\
               'include', 'includes', 'beyond', 'between']

In [368]:
def cutLabels( legend ):
    """ coupe les labels sur certain mots commun ('the', 'a'... etc)
        et sur certains caractères spéciaux (. , ;)
    """
    newlegend = copy.deepcopy(legend) # copy (pour ne pas ecrasser Legend)
    
    pattern_small_words = regex.compile( r'^.* (%s) (.+)$' % '|'.join(SMALL_WORDS), regex.I )
    cut_legend_using_pattern( newlegend, pattern_small_words )
    
    pattern = regex.compile( '^.*([\.,;] )(.+)$' )
    cut_legend_using_pattern( newlegend, pattern )
    
    return newlegend

In [369]:
# test
legend = cutLabels(legend) 
print_legend( legend )

                                      fig.   -1            de view of  fig. 2   a , illu
                                      fig.   -1            de view of  fig. 3   a , illu
                                      fig.   -1            e shown in  fig. 2   a  or an
                                      fig.   -1            e shown in  fig. 3   a , with
                          [0010]      fig.   -1            [0010]      fig. 2   b  is a 
                          [0011]      fig.   -1            [0011]      fig. 3   a  is an
                          [0012]      fig.   -1            [0012]      fig. 3   b  is a 
       drawings           [0008]      fig.   -1            [0008]      fig. 1  is an obl
                          [0009]      fig.   -1            [0009]      fig. 2   a  is an
                              width  3  of   -1            d a width  3  of 1.5-4 mm, wi
                          thickness  4  of   -1            thickness  4  of 0.5-1 mm. it
     --
             

### Merge : regroupe les candidats ayant un label identique

Rq: utile pour le dev. mais pas pour l'identification

In [370]:
def get_positions( label, candidats ):
    positions = []
    for item in candidats:
        if item['label']==label:
            positions.extend( item['position'] )
    return positions

def get_context( label, candidats ):
    items = [ item for item in candidats if item['label']==label ]
    
    return items[0]['context']

def merge( legend ):
    # merge identical label
    
    numbers_unique = { item['number'] for item in legend  } # set

    new_legend = []
    for n in numbers_unique:

        candidats = [ line for line in legend if line['number']==n ]

        labels_unique = { item['label'] for item in candidats  }

        new_candidats = [ { 'number':n, 'label':label_u,\
                           'position':get_positions(label_u, candidats), 'context':get_context(label_u, candidats) }
                          for label_u in labels_unique ]
        
        new_legend.extend( new_candidats )

    return new_legend

In [371]:
print_legend( merge( legend ) )

                          [0010]      fig.   -1            [0010]      fig. 2   b  is a 
                          [0011]      fig.   -1            [0011]      fig. 3   a  is an
                          [0012]      fig.   -1            [0012]      fig. 3   b  is a 
       drawings           [0008]      fig.   -1            [0008]      fig. 1  is an obl
                          [0009]      fig.   -1            [0009]      fig. 2   a  is an
                              width  3  of   -1            d a width  3  of 1.5-4 mm, wi
                          thickness  4  of   -1            thickness  4  of 0.5-1 mm. it
                                      fig.   -1  (4)        de view of  fig. 2   a , illu
     --
                                              1                       1,192,748           
     --
                                    length    2            having a length  2  of about 
                                              2  (2)                   2,610,399           

## Identification du meilleur candidats

In [372]:
def candidates_for_onenumber( legend, i ):
    candidates = [ c for c in legend if c['number'] == i ]

    for c in candidates:
        c['reversed'] = c['label'].split(' ')[::-1]
        c['weight'] = len( c['position'] )
        
    return candidates

In [373]:
print( candidates_for_onenumber(legend, 34) )

[]


In [374]:
def increment_dico( dico, key, value ):
    if key in dico:
        dico[key] += value
    else:
        dico[key] = value
        
def append_dico( dico, key, value ):
    if key in dico:
        dico[key].append( value )
    else:
        dico[key] = [value]

def who_are_the_winners( scoreboard ):
    scoreMax = max( scoreboard.values() )
    winners = [ sb[0] for sb in scoreboard.items() if sb[1] == scoreMax ]
    return winners

In [375]:
def choose_best_label( remainingcandidates ):
    """ Choisi un candidat gagnant unique
    
        choisi le label mot par mot, en partant de la fin :
            - si un mot apparait plus souvent que tout les autres, alors il est choisi
            - si plusieurs mots apparaissent aussi souvent, alors le label est coupé ici
    """
    N = len( remainingcandidates )
    
    if N == 1:
        return remainingcandidates[0]['label']
    
    previouschoice=[]
    for i in range(20):
        current_indice = len(previouschoice)

        scoreboard = {}
        nextword2candidates ={}
        for c in remainingcandidates:
            if current_indice < len( c['reversed'] ):
                next_word = c['reversed'][ current_indice ]
            else:
                next_word = ''

            append_dico( nextword2candidates, next_word, c )
            increment_dico( scoreboard, next_word, c['weight'] )

            #print(next_word, c['weight'])

        #print( 'scoreboard: ', scoreboard )
        winners = who_are_the_winners( scoreboard )
        #print( 'winners:', winners )
        if len(winners) == 1 and winners[0]:
            choice = winners[0]
            remainingcandidates = nextword2candidates[choice]
            previouschoice.append( choice )
            #print(' ')

        else: # exaequo ou fin -> stop
            selectedlabel = ' '.join( previouschoice[::-1] )
            #print( selectedlabel )
            break

    return selectedlabel

In [376]:
# test
candidates = candidates_for_onenumber(legend, 33)
choose_best_label( candidates )

ValueError: max() arg is an empty sequence

In [None]:
allnumbers = sorted( { c['number'] for c in legend if c['number']>0 } ) # unique et triés
print( allnumbers )

In [None]:
final_legend = []
for numero in allnumbers:
    candidates = candidates_for_onenumber(legend, numero)
    thelabel = choose_best_label( candidates )

    line = {'numero':numero , 'label':thelabel  }
    final_legend.append( line )

In [None]:
for line in final_legend:
    print( '{numero:>5}  {label}'.format( **line ) )

# Remarques
### Comment selectionner le meilleurs candidats ?

 * si unique:  ok
       et si unique et len()==40... ? c.a.d. pas coupé
 * si max unique: ok
 * sinon : selectionne les qualifiés, puis...
 * exaequo...  construit un arbre de comptage ?




### To look
* patents number: 3,803,713  
* comment ne pas modifier le texte, pour pouvoir l'annoter ?
* paranthese: between the elongated plates ( 1 , 2 ).
* line 5 - 5
*      march   15                 march 15,   
en debut de la description ...
* generally... adverbes ?
* " respectivement A et B " ... respective
* letter sur les vieux brevets
* merge keywords and small_words ??
* pluriel

* sort of recursive... identifier un permet d'en déduire un autre...

## other things
* Find values like "110 volts"... 

In [None]:
# Find values
units = ['inch', 'volt', 'm', 'mm']
find_sci_values =  r" (\d{1,4}([\.,]\d{1,4})? (%s)\W)" % '|'.join(units)

print( regex.findall(find_sci_values, description) )
description = regex.sub(find_sci_values, '<value>', description  )

In [None]:
# Find values
units = ['inch', 'volt', 'm', 'mm']
find_sci_values =  r" (\d{1,4}([\.,]\d{1,4})? (%s)\W)" % '|'.join(units)

print( regex.findall(find_sci_values, description) )
description = regex.sub(find_sci_values, '<value>', description  )

In [None]:
# Find floats
find_sci_floats =  r" (\d{1,4}([\.,]\d{1,4})\W)" 

print( regex.findall(find_sci_floats, description) )
description = regex.sub(find_sci_floats, '<value>', description  )

In [None]:
# " between 0.4 to 2.5 " 
betweenpattern = r"(between [\d\.,]{1,4} (to|and) [\d\.,]{1,4}\W)"

print( regex.findall(betweenpattern, description) )
description = regex.sub(betweenpattern, '<between>', description  )