# Pré-traitement des données 'Google'

* requète sur `BigQuerry`:
        /* 26Go proceceed */

        #standardSQL

        SELECT
          root.publication_number,
          publication_date,
          root.title_localized,
          root.cpc,
          root.citation
        FROM
          `patents-public-data.patents.publications` root,
          UNNEST( cpc ) AS cpc
        WHERE 
         cpc.code like 'A45D29/02' and
         country_code = 'US'
        LIMIT
          1000

In [133]:
import json
import pickle

In [134]:
import datetime as dt

In [135]:
# load the data
file_path = './data/fulldata_gglBigQuery.json'

data_brut = []
with open(file_path) as f:
    for line in f:
        entry = json.loads(line)
        data_brut.append( entry )
        
print( len(data_brut) )

408


In [136]:
# le format du numéro de brevet (pour ggl):
# 6 ou 7 digits
# ou l'année + 7 digits
# si on trouve année+6digits ce n'est pas bon

# https://www.uspto.gov/patents-application-process/applying-online/patent-number
# http://www.bpmlegal.com/howtopat1.html

def format_publicationnumber( pubnumber ):
    
    number = pubnumber.split('-')[1]
    
    if len( number ) > 7:  # en vrai ==10
        number = number[0:4] + number[4:].zfill(7)

    number = 'US' + number
    
    return number

In [137]:
data_brut[0].keys()

dict_keys(['cpc', 'publication_date', 'citation', 'priority_claim', 'description_localized', 'inventor_harmonized', 'publication_number', 'title_localized', 'abstract_localized', 'assignee_harmonized'])

In [138]:
data_brut[0]

{'abstract_localized': [{'language': 'en',
   'text': 'An ingrown toenail cutter ( 10, 10 A) including a handle or shank ( 12, 12 A) which at one end has a curved or angled part ( 14, 52 ) which extends away from the handle ( 12, 12 A) and said curved or angled part ( 14, 52 ) has a cutting edge ( 15, 54 ) on an inner side thereof ( 14 A) and there is also provided a terminal abutment ( 17, 56 ) at a free end of the curved or angled part ( 14, 52 ) on the inner side ( 14 A) adjacent to cutting edge ( 15, 54 ) wherein the cutting edge ( 15,54 ) is oriented at an angle of  15° - 60°  to a longitudinal axis of the handle or shank.'}],
 'assignee_harmonized': [{'country_code': 'AU', 'name': 'HEDGER ALLAN'}],
 'citation': [{'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-350720-A',
   'type': ''},
  {'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-448

In [139]:
data = {}
k = 0
for entry in data_brut:
   
    
    pubnum = entry['publication_number']
    patent_number = format_publicationnumber( pubnum )
    
    if patent_number in data: # supprime les doublons
        continue

    new_entry = {}
    
    new_entry['publication_number_raw'] = pubnum
    new_entry['patent_number'] = patent_number
    
    # date
    new_entry['year'] = int(  entry['publication_date'][0:4] )   
    new_entry['month'] = int( entry['publication_date'][4:6] )
    new_entry['day'] = int(   entry['publication_date'][6:]  )
    date = dt.date(new_entry['year'], new_entry['month'], new_entry['day'])
    
    new_entry['date_str'] = entry['publication_date']
    new_entry['date_formatted'] = date.strftime( '%B %d, %Y' )
    
    # info
    new_entry['title'] = entry['title_localized'][0]['text'].strip('.')
    
    # abstract
    if len( entry['abstract_localized'] )>0:
        new_entry['abstract'] = entry['abstract_localized'][0]['text']
    else:
        new_entry['abstract'] = ''
        
    # description text
    if len( entry['description_localized'] )>0:
        new_entry['description'] = entry['description_localized'][0]['text']
    else:
        new_entry['description'] = ''
    
    # inventor
    new_entry['inventor'] = entry['inventor_harmonized']
    
    # citations :
    citation = [] 
    for cit in entry['citation']:
        if not cit['publication_number'] : continue
        infos = {}
        infos['publication_number_raw'] = cit['publication_number'] 
        infos['patent_number'] = format_publicationnumber( cit['publication_number']  )

        infos['category'] =  cit['category'] 

        citation.append( infos )
        
    new_entry['citation'] = citation

    # CPC
    new_entry['cpc'] = entry['cpc']
    
    # save
    data[ patent_number ] = new_entry
    
    k+=1  # debug
print(len(data))

379


In [140]:
# test
print( list(data.items())[2] )

('US20130067748', {'title': 'Apparatus for safely clipping the nails of young children and method of use thereof', 'cpc': [{'code': 'A45D29/02', 'first': 'true', 'inventive': 'true', 'tree': []}, {'code': 'A45D29/02', 'first': 'true', 'inventive': 'true', 'tree': []}], 'year': 2013, 'date_str': '20130321', 'date_formatted': 'March 21, 2013', 'publication_number_raw': 'US-2013067748-A1', 'description': 'CROSS-REFERENCE TO RELATED APPLICATIONS \n       [0001]    This application claims the benefit of and is a continuation of prior U.S. Provisional Application No. 61/494,357, filed 13 Jun. 2011. \n     \n    \n     BACKGROUND OF THE INVENTION \n       [0002]    The present invention generally relates to apparatus for clipping nails and, more specifically, to a nail clipper apparatus for safely clipping the fingernails or toenails of young children and others who may require care in the trimming of nails, such as the elderly, persons with poor eyesight, or disabled persons. \n         [000

## Format names

In [141]:
import re

In [142]:
def format_name( name ):
    # ajoute le point pour une initiale (lettre seule)
    singleletter = re.compile( r'\b(?P<letter>[A-Z])(?!\.)\b' )
    name = singleletter.sub('\g<letter>.', name)

    # passe en minuscule sauf premiere lettre
    wordpattern = re.compile( r'\b(?P<word>\w+)\b' )
    titlefun = lambda match:  match.group('word').title()
    name = wordpattern.sub(titlefun, name)

    # JR -> Jr.
    JRpattern = re.compile( r'\b(?P<jr>JR)(?!\.)\b', re.I )
    name = JRpattern.sub('Jr.', name)
    
    return name

In [143]:
# test
name = 'FRIEDMAN JR DAVID Jr. H hello B'
format_name( name )
name

'FRIEDMAN JR DAVID Jr. H hello B'

In [144]:
for e in data.values():
    for i in e['inventor']:
        i['name_formatted'] = format_name( i['name'] )

## Add 'cited' and 'citedby'

In [145]:
def countplusone( d, key ):
    if key in d:
        d[ key ]+=1
    else:
        d[ key ] = 1

In [146]:
# init
ghost = {}

for entry in data.values():
    entry['cited'] = []
    entry['citedby'] = []
    
# loop
for patent_number, entry in data.items():
    
    cited_number_list = [ c['patent_number'] for c in entry['citation']  ]
    
    for cited_number in cited_number_list:
        if cited_number in data:
            entry['cited'].append( cited_number )
            data[ cited_number ]['citedby'].append( patent_number )
        else:
            countplusone( ghost, cited_number ) 
            
    
    
print( len(ghost) )

1086


In [147]:
pickle.dump( data, open('web/data/patent_infos.pickle','wb') )

In [148]:
# test
print( list(data.values())[2]['cited'] )

['US846565', 'US1085569', 'US1363164', 'US3089239', 'US4637137', 'US5323537', 'US20050160604', 'US20050172488', 'US20070067995']


## search images

https://stackoverflow.com/a/19035508/8069403

    >>> t = magic.from_file('teste.png')
    >>> t
    'PNG image data, 782 x 602, 8-bit/color RGBA, non-interlaced'
    >>> re.search('(\d+) x (\d+)', t).groups()
    ('782', '602')