# Pré-traitement des données 'Google'

* requète sur `BigQuerry`:
        /* 26Go proceceed */

        #standardSQL

        SELECT
          root.publication_number,
          publication_date,
          root.title_localized,
          root.cpc,
          root.citation
        FROM
          `patents-public-data.patents.publications` root,
          UNNEST( cpc ) AS cpc
        WHERE 
         cpc.code like 'A45D29/02' and
         country_code = 'US'
        LIMIT
          1000

In [133]:
import json
import pickle

In [134]:
import datetime as dt

In [155]:
# load the data
file_path = './data/fulldata_gglBigQuery.json'

data_brut = []
with open(file_path) as f:
    for line in f:
        entry = json.loads(line)
        data_brut.append( entry )
        
print( len(data_brut) )

408


In [171]:
print( data_brut[10].keys() )

dict_keys(['cpc', 'publication_date', 'citation', 'priority_claim', 'description_localized', 'inventor_harmonized', 'publication_number', 'title_localized', 'abstract_localized', 'assignee_harmonized'])


## Créer un dictionnaire { patent_number : infos, ... }

In [151]:
# formate le numéro de brevet (pour ggl):
# 6 ou 7 digits
# ou l'année + 7 digits
# si on trouve année+6digits ce n'est pas bon

# voir: https://www.uspto.gov/patents-application-process/applying-online/patent-number
# http://www.bpmlegal.com/howtopat1.html

def format_patentnumber( pubnumber ):
    
    number = pubnumber.split('-')[1]
    
    if len( number ) > 7:  # en vrai ==10
        number = number[0:4] + number[4:].zfill(7)

    number = 'US' + number
    
    return number

In [172]:
data = {}
k = 0  # debug
for entry in data_brut:
   
    
    pubnum = entry['publication_number']
    patent_number = format_patentnumber( pubnum )
    
    if patent_number in data: # supprime les doublons
        continue

    new_entry = {}
    
    new_entry['publication_number_raw'] = pubnum
    new_entry['patent_number'] = patent_number
    
    # date
    new_entry['year'] = int(  entry['publication_date'][0:4] )   
    new_entry['month'] = int( entry['publication_date'][4:6] )
    new_entry['day'] = int(   entry['publication_date'][6:]  )
    date = dt.date(new_entry['year'], new_entry['month'], new_entry['day'])
    
    new_entry['date_str'] = entry['publication_date']
    new_entry['date_formatted'] = date.strftime( '%B %d, %Y' )
    
    # info
    new_entry['title'] = entry['title_localized'][0]['text'].strip('.')
    
    # abstract
    if len( entry['abstract_localized'] )>0:
        new_entry['abstract'] = entry['abstract_localized'][0]['text']
    else:
        new_entry['abstract'] = ''
        
    # description text
    if len( entry['description_localized'] )>0:
        new_entry['description'] = entry['description_localized'][0]['text']
    else:
        new_entry['description'] = ''
    
    # inventor & assignee
    new_entry['inventor'] = entry['inventor_harmonized']
    new_entry['assignee'] = entry['assignee_harmonized']
    
    # citations :
    citation = [] 
    for cit in entry['citation']:
        if not cit['publication_number'] : continue
        infos = {}
        infos['publication_number_raw'] = cit['publication_number'] 
        infos['patent_number'] = format_publicationnumber( cit['publication_number']  )

        infos['category'] =  cit['category'] 

        citation.append( infos )
        
    new_entry['citation'] = citation

    # CPC
    new_entry['cpc'] = entry['cpc']
    
    # save
    data[ patent_number ] = new_entry
    
    k+=1  # debug
print(len(data))

379


In [173]:
# test
print( list(data.values())[2].keys() )

dict_keys(['title', 'cpc', 'year', 'date_str', 'date_formatted', 'publication_number_raw', 'description', 'abstract', 'citation', 'assignee', 'month', 'patent_number', 'inventor', 'day'])


## Mise en forme des noms

In [158]:
import re

In [159]:
def format_name( name ):
    # ajoute le point pour une initiale (lettre seule)
    singleletter = re.compile( r'\b(?P<letter>[A-Z])(?!\.)\b' )
    name = singleletter.sub('\g<letter>.', name)

    # passe en minuscule sauf premiere lettre
    wordpattern = re.compile( r'\b(?P<word>\w+)\b' )
    titlefun = lambda match:  match.group('word').title()
    name = wordpattern.sub(titlefun, name)

    # JR -> Jr.
    JRpattern = re.compile( r'\b(?P<jr>JR)(?!\.)\b', re.I )
    name = JRpattern.sub('Jr.', name)
    
    return name

In [160]:
# test
name = 'FRIEDMAN JR DAVID Jr. H hello B'
format_name( name )
name

'FRIEDMAN JR DAVID Jr. H hello B'

In [162]:
# pour les inventeurs:
for e in data.values():
    for i in e['inventor']:
        i['name_formatted'] = format_name( i['name'] )

In [174]:
# pour les 'assignees':
for e in data.values():
    for i in e['assignee']:
        i['name_formatted'] = format_name( i['name'] )

## Ajoute les entrées : 'cited' and 'citedby'

In [175]:
def countplusone( d, key ):
    # compte les occurences 
    if key in d:
        d[ key ]+=1
    else:
        d[ key ] = 1

In [176]:
# init
ghost = {}

for entry in data.values():
    entry['cited'] = []
    entry['citedby'] = []
    
# loop
for patent_number, entry in data.items():
    
    cited_number_list = [ c['patent_number'] for c in entry['citation']  ]
    
    for cited_number in cited_number_list:
        if cited_number in data:
            entry['cited'].append( cited_number )
            data[ cited_number ]['citedby'].append( patent_number )
        else:
            countplusone( ghost, cited_number ) 
            
    
print( len(ghost) )  # nombre des brevets cités mais non dans DATA

1086


In [180]:
# Statistique des brevets 'ghost'
print( sorted( ghost.items(), key= lambda x:x[1], reverse = True)[:10] )

[('US6220251', 17), ('US3744131', 16), ('US5546658', 16), ('US3838507', 15), ('US6523545', 14), ('US5392518', 14), ('US3903596', 14), ('US2955354', 13), ('US4856190', 12), ('US4196514', 12)]


In [148]:
# test
print( list(data.values())[2]['cited'] )

['US846565', 'US1085569', 'US1363164', 'US3089239', 'US4637137', 'US5323537', 'US20050160604', 'US20050172488', 'US20070067995']


In [181]:
pickle.dump( data, open('web/data/patent_infos.pickle','wb') )

## Cherche les images  et enregistre les meta-données

In [219]:
import os
import re

In [228]:
def patentid_from_figname( figname ):
    return figname.split('-')[0]
 
FIGURESDIR = 'web/static/figures_extracted/'
FIGURESLIST =  os.listdir( FIGURESDIR )

print('nombre de figures:', len(FIGURESLIST))

nombre de figures: 2586


In [229]:
FIGURESLIST[0]

'US20060143923-fig18.png'

In [230]:
# init 
for entry in data.values():
    entry['figures'] = []

pattern = re.compile( r"^(US[0-9RE]+)-fig([0-9]+).png$" )

# loop
nopatentforfigure = {}
for figname in FIGURESLIST:
    matchs = pattern.match( figname )
    if not matchs:
        print( 'erreur %s' % figname )
    else:
        fignum = matchs.group(2)
        patnum = matchs.group(1)
        
        figinfo = {'filename':figname, 'number':fignum}
        
        if patnum in data:
            data[patnum]['figures'].append( figinfo )
        else:
            if patnum in nopatentforfigure:
                nopatentforfigure[patnum].append( figinfo )
            else:
                nopatentforfigure[patnum] = [ figinfo ]
                
print('nombre de brevet hors DATA ayant une figure : ', len(nopatentforfigure))

nombre de brevet hors DATA ayant une figure :  0


In [231]:
# Brevet sans figures :
patentwithoutfigure = []
for patnum, entry in data.items():
    nfig = len( entry['figures'] )
    if nfig == 0:
        patentwithoutfigure.append( patnum )
        
print( 'nombre de brevets sans figures : %i'% len(patentwithoutfigure))

# enregistre la liste - pour effectuer les traitements 
pickle.dump( data, open('extract_image/patentwithoutfigure.pickle','wb') )

nombre de brevets sans figures : 60


In [233]:
# Trie les figures par numéro (à priori l'ordre d'aparition dans le brevet)

for entry in data.values():
    entry['figures'] = sorted(entry['figures'], key= lambda x: int( x['number'] ) )

In [234]:
pickle.dump( data, open('web/data/patent_infos.pickle','wb') )

https://stackoverflow.com/a/19035508/8069403

    >>> t = magic.from_file('teste.png')
    >>> t
    'PNG image data, 782 x 602, 8-bit/color RGBA, non-interlaced'
    >>> re.search('(\d+) x (\d+)', t).groups()
    ('782', '602')

In [226]:
sorted( ['910', '2',  '4'])

['2', '4', '910']

In [186]:
wand.from_file('g')

AttributeError: module 'wand' has no attribute 'from_file'