# Pré-traitement des données 'Google'

* requête sur `BigQuery` :
        /* 26 Go processed */

        #standardSQL

        SELECT
          root.publication_number,
          publication_date,
          root.title_localized,
          root.cpc,
          root.citation
        FROM
          `patents-public-data.patents.publications` root,
          UNNEST( cpc ) AS cpc
        WHERE 
         cpc.code like 'A45D29/02' and
         country_code = 'US'
        LIMIT
          1000

In [134]:
import json
import pickle

In [135]:
import datetime as dt

In [136]:
# Load the raw data: the file holds one JSON document per line.
file_path = './data/fulldata_gglBigQuery.json'

# The encoding is given explicitly: JSON text is UTF-8, while the default
# used by open() depends on the platform and could mis-decode non-ASCII text.
with open(file_path, encoding='utf-8') as f:
    data_brut = [ json.loads(line) for line in f ]

print( len(data_brut) )

408


In [137]:
# Show the field names available in one raw record (index 10 picked as a sample).
print( data_brut[10].keys() )

dict_keys(['abstract_localized', 'assignee_harmonized', 'cpc', 'title_localized', 'inventor_harmonized', 'publication_number', 'description_localized', 'publication_date', 'priority_claim', 'citation'])


## Créer un dictionnaire { patent_number : infos, ... }

In [138]:
# Normalise the patent number (for Google):
# - 6 or 7 digits,
# - or the year followed by 7 digits;
# a year followed by only 6 digits is not valid and is zero-padded.
#
# see: https://www.uspto.gov/patents-application-process/applying-online/patent-number
# http://www.bpmlegal.com/howtopat1.html

def format_patentnumber( pubnumber ):
    """Build a 'US'-prefixed patent number from a dash-separated publication number.

    The second dash-separated field of `pubnumber` is used as the number.
    When it is longer than 7 characters, its first 4 characters are kept as-is
    and the remainder is left-padded with zeros up to 7 characters.
    """
    serial = pubnumber.split('-')[1]

    if len( serial ) <= 7:
        return 'US' + serial

    # in practice the long form has 10 characters (year + 6 digits)
    year, rest = serial[:4], serial[4:]
    return 'US' + year + rest.zfill(7)

In [139]:
data = {}
k = 0  # debug: counts the entries actually stored
for entry in data_brut:

    raw_number = entry['publication_number']
    patent_number = format_patentnumber( raw_number )

    # drop duplicates: only the first record of a given patent number is kept
    if patent_number in data:
        continue

    # date: the string is sliced as 4 characters (year), 2 (month), rest (day)
    date_str = entry['publication_date']
    year, month, day = int( date_str[0:4] ), int( date_str[4:6] ), int( date_str[6:] )
    date = dt.date( year, month, day )

    # abstract and description: first localized text, or '' when there is none
    abstracts = entry['abstract_localized']
    descriptions = entry['description_localized']

    # citations: only those carrying a publication number are kept
    citation = [
        {
            'publication_number_raw': cit['publication_number'],
            'patent_number': format_patentnumber( cit['publication_number'] ),
            'category': cit['category'],
        }
        for cit in entry['citation']
        if cit['publication_number']
    ]

    # save
    data[ patent_number ] = {
        'publication_number_raw': raw_number,
        'patent_number': patent_number,
        'year': year,
        'month': month,
        'day': day,
        'date_str': date_str,
        'date_formatted': date.strftime( '%B %d, %Y' ),
        # title without leading/trailing dots
        'title': entry['title_localized'][0]['text'].strip('.'),
        'abstract': abstracts[0]['text'] if len( abstracts ) > 0 else '',
        'description': descriptions[0]['text'] if len( descriptions ) > 0 else '',
        # inventor & assignee
        'inventor': entry['inventor_harmonized'],
        'assignee': entry['assignee_harmonized'],
        'citation': citation,
        # CPC
        'cpc': entry['cpc'],
    }

    k += 1  # debug
print(len(data))

379


In [140]:
# test: show the keys of one processed entry
print( list(data.values())[2].keys() )

dict_keys(['date_str', 'day', 'assignee', 'date_formatted', 'description', 'month', 'cpc', 'year', 'publication_number_raw', 'abstract', 'patent_number', 'inventor', 'citation', 'title'])


## Mise en forme des noms

In [141]:
import re

In [142]:
# Patterns are compiled once here instead of on every call.
# A lone capital letter (an initial) that is not already followed by a dot.
_INITIAL_RE = re.compile( r'\b(?P<letter>[A-Z])(?!\.)\b' )
# Any run of word characters.
_WORD_RE = re.compile( r'\b(?P<word>\w+)\b' )
# 'JR' in any letter case, not already followed by a dot.
_JR_RE = re.compile( r'\b(?P<jr>JR)(?!\.)\b', re.I )


def format_name( name ):
    """Return a display version of an upper-case name string.

    Three substitutions are applied in order:
      1. a dot is appended to single-letter initials ('H' -> 'H.');
      2. every word is title-cased ('FRIEDMAN' -> 'Friedman');
      3. 'Jr' without a dot becomes 'Jr.'.

    `name` itself is not modified; the formatted string is returned.
    """
    # add the dot after an initial (single letter)
    # raw string: '\g' inside a plain literal is an invalid escape sequence
    name = _INITIAL_RE.sub( r'\g<letter>.', name )

    # lower-case everything except the first letter of each word
    name = _WORD_RE.sub( lambda match: match.group('word').title(), name )

    # JR -> Jr.
    name = _JR_RE.sub( 'Jr.', name )

    return name

In [143]:
# test
# NOTE(review): the value returned by format_name is discarded here, and the
# last expression displays `name`, i.e. the original unformatted string.
name = 'FRIEDMAN JR DAVID Jr. H hello B'
format_name( name )
name

'FRIEDMAN JR DAVID Jr. H hello B'

In [144]:
# for the inventors: store a formatted copy of each name next to the raw one
for patent in data.values():
    inventors = patent['inventor']
    for inventor in inventors:
        inventor['name_formatted'] = format_name( inventor['name'] )

In [145]:
# for the 'assignees': store a formatted copy of each name next to the raw one
for patent in data.values():
    assignees = patent['assignee']
    for assignee in assignees:
        assignee['name_formatted'] = format_name( assignee['name'] )

## Ajoute les entrées : 'cited' et 'citedby'

In [146]:
def countplusone( d, key ):
    """Increment the counter stored in `d` under `key`, starting at 1 when absent."""
    d[ key ] = d.get( key, 0 ) + 1

In [147]:
# init
ghost = {}  # cited patent number -> how many times it is cited, when absent from `data`

for entry in data.values():
    entry['cited'], entry['citedby'] = [], []

# loop: link each patent to the patents it cites, in both directions
for patent_number, entry in data.items():

    for citation in entry['citation']:
        cited_number = citation['patent_number']

        if cited_number not in data:
            countplusone( ghost, cited_number )
            continue

        entry['cited'].append( cited_number )
        data[ cited_number ]['citedby'].append( patent_number )


print( len(ghost) )  # number of patents that are cited but not in DATA

1086


In [148]:
# Statistics on the 'ghost' patents: the ten with the highest count
print( sorted( ghost.items(), key= lambda x:x[1], reverse = True)[:10] )

[('US6220251', 17), ('US3744131', 16), ('US5546658', 16), ('US3838507', 15), ('US3903596', 14), ('US5392518', 14), ('US6523545', 14), ('US2955354', 13), ('US4196514', 12), ('US4856190', 12)]


In [149]:
# test: show the 'cited' list of one entry
print( list(data.values())[20]['cited'] )

['US702516', 'US3093147']


In [150]:
# Save the patent dictionary; the `with` block closes the file once written
# (the file object was previously never closed).
with open('web/data/patent_infos.pickle','wb') as f:
    pickle.dump( data, f )

## Cherche les images existantes et enregistre les métadonnées

In [151]:
import os
import re

In [152]:
def patentid_from_figname( figname ):
    """Return the part of `figname` located before its first '-'."""
    prefix, _dash, _rest = figname.partition('-')
    return prefix
 
# Directory of the figure images, and the names of the entries found in it
FIGURESDIR = 'web/static/figures_extracted/'
FIGURESLIST =  os.listdir( FIGURESDIR )

print('nombre de figures:', len(FIGURESLIST))

nombre de figures: 3361


In [153]:
# Display one file name as an example of the naming scheme
FIGURESLIST[0]

'US9204703-fig7.png'

In [154]:
# init
for entry in data.values():
    entry['figures'] = []

# Expected file name: 'US<letters/digits>-fig<digits>.png'.
# The dot is escaped so that only a literal '.png' suffix matches
# (an unescaped '.' would accept any character at that position).
pattern = re.compile( r"^(US[0-9A-Z]+)-fig([0-9]+)\.png$" )

# loop
nopatentforfigure = {}  # patent number absent from `data` -> its figures
for figname in FIGURESLIST:
    matchs = pattern.match( figname )
    if not matchs:
        # file name that does not follow the expected scheme
        print( 'erreur %s' % figname )
        continue

    patnum, fignum = matchs.groups()
    figinfo = {'filename':figname, 'number':fignum}

    if patnum in data:
        data[patnum]['figures'].append( figinfo )
    else:
        nopatentforfigure.setdefault( patnum, [] ).append( figinfo )

print('nombre de brevet hors DATA ayant une figure : ', len(nopatentforfigure))

nombre de brevet hors DATA ayant une figure :  0


In [155]:
# Patents without any figure:
patentwithoutfigure = [ patnum for patnum, entry in data.items() if not entry['figures'] ]

print( 'nombre de brevets sans figures : %i'% len(patentwithoutfigure))

# Save the list, to be used for further processing.
# The list itself is pickled (not the whole `data` dict), as the file name
# and the comment above indicate; the `with` block closes the file.
# NOTE(review): confirm that the reader of this pickle expects a list of patent numbers.
with open('extract_image/patentwithoutfigure.pickle','wb') as f:
    pickle.dump( patentwithoutfigure, f )

nombre de brevets sans figures : 3


In [156]:
# Sort the figures by number (presumably their order of appearance in the patent)

for entry in data.values():
    entry['figures'].sort( key=lambda fig: int( fig['number'] ) )

## Ajoute les dimensions de l'image

In [157]:
# https://stackoverflow.com/a/19035508/8069403
import magic

In [158]:
def add_image_size( figure ):
    """Add 'width' and 'height' (as ints) to a figure info dict, in place.

    The file FIGURESDIR + figure['filename'] is described with
    `magic.from_file`, and the first '<digits> x <digits>' pair found in that
    description is taken as (width, height).

    Raises:
        ValueError: when the description contains no such pair.
    """
    filepath = FIGURESDIR+figure['filename']

    filebegining = magic.from_file(filepath)
    # raw string: '\d' inside a plain literal is an invalid escape sequence
    size = re.search(r'(\d+) x (\d+)', filebegining)
    if size is None:
        # explicit error instead of an AttributeError on None
        raise ValueError(
            'no image size found for %s in %r' % (filepath, filebegining) )
    w, h = size.groups()

    figure['width'] = int(w)
    figure['height'] = int(h)

In [159]:
# Loop over every figure of every patent to record its size
for patent in data.values() :
    figures = patent['figures']
    for fig in figures:
        add_image_size( fig )

## Ajoute la légende

In [160]:
from extract_legend.extract_legend import  extract_legend

In [161]:
import numpy as np

def isThereOddNumber( legend ):
    """Tell whether at least one line of `legend` has an odd 'numero'."""
    numbers = np.array( [ entry['numero'] for entry in legend ] )
    remainders = numbers % 2  # non-zero where the number is odd
    return remainders.any()

In [162]:
def add_consecutive_field( legend ):
    """Flag each legend line with whether the next line's number follows it.

    Sets line['consecutive'] in place: True when the 'numero' of the next
    line equals this line's 'numero' plus the expected step, False otherwise.
    The last line is always flagged True. An empty legend is left untouched.
    """
    if not legend:
        return  # nothing to flag; also avoids indexing an empty list below

    # some legends are numbered two by two, hence a step of 2
    # when the legend contains no odd number
    increment = 1 if isThereOddNumber(legend) else 2

    for line, following in zip( legend, legend[1:] ):
        line['consecutive'] = bool( line['numero'] + increment == following['numero'] )

    legend[-1]['consecutive'] = True

In [163]:
# Loop: extract the legend from each description and store both results
for patent in  data.values():
    parsed, raw = extract_legend( patent['description'] )
    # the 'consecutive' flags are only added to a non-empty legend
    if parsed:
        add_consecutive_field( parsed )
    patent['legend'] = parsed
    patent['raw_legend'] = raw

## Group by year

In [164]:
# Distinct 'year' values found in the data, in ascending order
years = sorted( { patent['year'] for patent in data.values() } )
# preview of the first ten
years[:10]

[1873, 1876, 1879, 1881, 1884, 1885, 1889, 1890, 1891, 1893]

In [165]:
# One record per year, in the order of `years`, each with an empty patent list
datagroupedbyyear = [ {'year':y, 'patents':[]} for y in years ]

In [166]:
# Group the patents by year: one {'year', 'patents'} record per distinct year,
# in ascending year order.
years = sorted( { patent['year'] for patent in data.values() } )
datagroupedbyyear = [ {'year':y, 'patents':[]} for y in years ]

# Map each year to the list of its record once, instead of running a linear
# `years.index` search for every patent.
patents_of_year = { group['year']: group['patents'] for group in datagroupedbyyear }

for patent in data.values():
    patents_of_year[ patent['year'] ].append( patent )

In [167]:
# init: one empty list per distinct year
years = sorted( { patent['year'] for patent in data.values() } )
year2patent = dict( (y, []) for y in years )

# Loop: file each patent under its year, with its (year, month, day) date
for patent in data.values():
    date = ( patent['year'], patent['month'], patent['day'] )
    summary = {'patent_number': patent['patent_number'], 'date':date}
    year2patent[ patent['year'] ].append( summary )

# Sort each year's patents by date
for patents in year2patent.values():
    patents.sort( key=lambda x:x['date'] )

In [168]:
# save: the `with` block closes the file once the pickle is written
# (the file object was previously never closed)
with open('web/data/patent_infos.pickle','wb') as f:
    pickle.dump( data, f )