# Pré-traitement des données 'Google'

* requête sur `BigQuery` :
        /* 26 GB processed */

        #standardSQL

        SELECT
          root.publication_number,
          publication_date,
          root.title_localized,
          root.cpc,
          root.citation
        FROM
          `patents-public-data.patents.publications` root,
          UNNEST( cpc ) AS cpc
        WHERE 
         cpc.code like 'A45D29/02' and
         country_code = 'US'
        LIMIT
          1000

In [54]:
import json
import pickle

In [82]:
# Load the raw export: the file holds one JSON object per line.
file_path = './data/fulldata_gglBigQuery.json'

data_brut = []
# Decode explicitly as UTF-8 (the JSON interchange encoding) so the result
# does not depend on the platform's default locale encoding.
with open(file_path, encoding='utf-8') as f:
    for line in f:
        if not line.strip():  # tolerate blank lines, e.g. a trailing newline
            continue
        data_brut.append( json.loads(line) )

print( len(data_brut) )

408


In [83]:
# Patent number format reference:
# https://www.uspto.gov/patents-application-process/applying-online/patent-number
# http://www.bpmlegal.com/howtopat1.html

def format_publicationnumber( pubnumber ):
    """Normalize a Google-style publication number into a compact identifier.

    The input looks like 'CC-NUMBER-KIND' (e.g. 'US-6878024-B1'); the result
    is the country code followed by the number, without the kind code
    (e.g. 'US6878024').

    US numbers come in two shapes:
      * granted patents: a plain serial number (6, 7 or 8 digits), possibly
        with a letter prefix; these are kept unchanged;
      * published applications: the year followed by a 7-digit serial. The
        source data sometimes carries only 6 digits after the year, so the
        serial is left-padded with zeros to 7 digits.

    Numbers from other countries are never padded, and keep their own
    country code instead of being relabelled as 'US'.

    Parameters
    ----------
    pubnumber : str
        Publication number containing at least one '-' separator.

    Returns
    -------
    str
        Country code + normalized number.

    Raises
    ------
    IndexError
        If `pubnumber` contains no '-' separator.
    """
    parts = pubnumber.split('-')
    country, number = parts[0], parts[1]

    # Only all-digit US numbers longer than 8 digits can be year + serial;
    # 8-digit numbers are ordinary grants (>= 10,000,000) and must not be split.
    if country == 'US' and number.isdigit() and len( number ) > 8:
        number = number[0:4] + number[4:].zfill(7)

    return country + number

In [86]:
# List the field names available on the first raw record.
data_brut[0].keys()

dict_keys(['assignee_harmonized', 'priority_claim', 'description_localized', 'inventor_harmonized', 'cpc', 'publication_number', 'publication_date', 'citation', 'title_localized', 'abstract_localized'])

In [88]:
# Display the first raw record in full to see how its fields are nested.
data_brut[0]

{'abstract_localized': [{'language': 'en',
   'text': 'An ingrown toenail cutter ( 10, 10 A) including a handle or shank ( 12, 12 A) which at one end has a curved or angled part ( 14, 52 ) which extends away from the handle ( 12, 12 A) and said curved or angled part ( 14, 52 ) has a cutting edge ( 15, 54 ) on an inner side thereof ( 14 A) and there is also provided a terminal abutment ( 17, 56 ) at a free end of the curved or angled part ( 14, 52 ) on the inner side ( 14 A) adjacent to cutting edge ( 15, 54 ) wherein the cutting edge ( 15,54 ) is oriented at an angle of  15° - 60°  to a longitudinal axis of the handle or shank.'}],
 'assignee_harmonized': [{'country_code': 'AU', 'name': 'HEDGER ALLAN'}],
 'citation': [{'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-350720-A',
   'type': ''},
  {'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-448

In [100]:
def _first_text( localized ):
    """Return the 'text' of the first localized item, or '' if there is none."""
    return localized[0]['text'] if localized else ''


# Build `data`: one cleaned record per normalized patent number.
data = {}
for entry in data_brut:

    pubnum = entry['publication_number']
    patent_number = format_publicationnumber( pubnum )

    # Drop duplicates: only the first record seen for a given number is kept.
    # NOTE(review): duplicates presumably come from the UNNEST(cpc) join in the
    # query and/or several kind codes sharing one number - confirm.
    if patent_number in data:
        continue

    # publication_date is sliced as a 'YYYYMMDD' string
    date_str = entry['publication_date']

    # citations: keep only those that reference a publication number
    citation = []
    for cit in entry['citation']:
        if not cit['publication_number']:
            continue
        citation.append( {
            'publication_number_raw': cit['publication_number'],
            'patent_number': format_publicationnumber( cit['publication_number'] ),
            'category': cit['category'],
        } )

    data[ patent_number ] = {
        'publication_number_raw': pubnum,
        'patent_number': patent_number,
        # date
        'year': int( date_str[0:4] ),
        'month': int( date_str[4:6] ),
        'day': int( date_str[6:] ),
        'date_str': date_str,
        # texts: first localized version, '' when the list is empty
        'title': _first_text( entry['title_localized'] ),
        'abstract': _first_text( entry['abstract_localized'] ),
        'description': _first_text( entry['description_localized'] ),
        # copied through unchanged
        'inventor': entry['inventor_harmonized'],
        'citation': citation,
        'cpc': entry['cpc'],
    }

print(len(data))

379


In [101]:
# Spot-check: print the third (patent_number, record) pair.
sample_items = list( data.items() )
print( sample_items[2] )

('US20140250692', {'year': 2014, 'abstract': 'An improved apparatus for use with nails or claws comprising an adjustable pressure activated clipper, which effectively closes upon successful capture and engagement of a nail or claw due to pressure realized on the surface of the clipper&#39;s articulating trigger by the nail or claw. In operation, as pressure is applied to the internal surface portion of the trigger by the user, the trigger automatically articulates and releases the refracted blade to cut the nail or claw at the desired length.', 'month': 9, 'title': 'Pressure Activated Clipper', 'citation': [{'publication_number_raw': 'US-4321764-A', 'category': 'PRS', 'patent_number': 'US4321764'}, {'publication_number_raw': 'US-4690091-A', 'category': 'PRS', 'patent_number': 'US4690091'}, {'publication_number_raw': 'US-5727318-A', 'category': 'PRS', 'patent_number': 'US5727318'}, {'publication_number_raw': 'US-6878024-B1', 'category': 'PRS', 'patent_number': 'US6878024'}, {'publicatio

## Add 'cited' and 'citedby'

In [102]:
def countplusone( d, key ):
    """Increment the counter stored under `key` in `d`, starting at 1 if absent.

    The dictionary is modified in place; nothing is returned.
    """
    d[ key ] = d.get( key, 0 ) + 1

In [103]:
# `ghost` counts how often each patent number that is absent from `data` gets cited.
ghost = {}

# Start every record with empty link lists, so re-running the cell rebuilds them.
for record in data.values():
    record['cited'] = []
    record['citedby'] = []

# Walk every citation and link both ends when the cited patent is in `data`.
for citing_number, record in data.items():
    for reference in record['citation']:
        cited_number = reference['patent_number']

        if cited_number not in data:
            # cited patent is outside the dataset: only count it
            countplusone( ghost, cited_number )
            continue

        record['cited'].append( cited_number )
        data[ cited_number ]['citedby'].append( citing_number )

print( len(ghost) )

1086


In [104]:
# Save the processed records. The context manager guarantees the file is
# flushed and closed, even if pickling raises.
with open('web/data/patent_infos.pickle', 'wb') as f:
    pickle.dump( data, f )

In [78]:
# Spot-check: print the 'cited' list of the third record.
third_entry = list( data.values() )[2]
print( third_entry['cited'] )

['US5727318', 'US20090223055']
