# Pré-traitement des données 'Google'

* requête sur `BigQuery` :
        /* 26 Go processed */

        #standardSQL

        SELECT
          root.publication_number,
          publication_date,
          root.title_localized,
          root.cpc,
          root.citation
        FROM
          `patents-public-data.patents.publications` root,
          UNNEST( cpc ) AS cpc
        WHERE 
         cpc.code like 'A45D29/02' and
         country_code = 'US'
        LIMIT
          1000

In [1]:
import json

In [6]:
# Load the raw export: the file is read as newline-delimited JSON,
# one record per line, and every decoded record is appended to `data_brut`.
file_path = './data/results-getsAll_20171119-115135.json'

data_brut = []
# Decode explicitly as UTF-8 rather than relying on the platform default
# encoding, which is not the same on every operating system.
with open(file_path, encoding='utf-8') as f:
    for line in f:
        if not line.strip():  # tolerate blank lines (e.g. a trailing empty line)
            continue
        data_brut.append( json.loads(line) )

In [19]:
# Patent number format (for "ggl" -- presumably Google; confirm):
#   - 6 or 7 digits,
#   - or the year + 7 digits.
# A year followed by only 6 digits is not a valid form.

# https://www.uspto.gov/patents-application-process/applying-online/patent-number
# http://www.bpmlegal.com/howtopat1.html

def format_publicationnumber( pubnumber ):
    """Turn a dash-separated publication number into a compact patent number.

    The input is split on '-'; the first field is used as the country code
    and the second as the number. Any further field (e.g. the kind code in
    'US-2006260629-A1') is dropped. The result is the country code
    immediately followed by the number, e.g. 'US20060260629'.

    For US numbers made only of digits and longer than 7 characters, the
    first 4 characters are kept as the year and the remainder is left-padded
    with zeros to 7 characters (year + 6 digits becomes year + 7 digits).
    Numbers from other countries, and non-numeric numbers, are left as they
    are: the year-padding rule above is specific to the US numbering.

    Parameters
    ----------
    pubnumber : str
        Publication number with '-' separated fields, e.g. 'US-4117854-A'.

    Returns
    -------
    str
        Country code + (possibly zero-padded) number, e.g. 'US4117854'.

    Raises
    ------
    IndexError
        If `pubnumber` contains no '-' separator.
    """
    parts = pubnumber.split('-')
    country = parts[0]
    number = parts[1]

    # Only plain numeric US numbers follow the "year + serial" layout; applying
    # the split to anything else would cut the number at an arbitrary place.
    if country == 'US' and number.isdigit() and len( number ) > 7:  # in practice == 10
        number = number[0:4] + number[4:].zfill(7)

    # Keep the real country code: prefixing a foreign number with 'US' would
    # make it look like (and possibly collide with) a US patent number.
    return country + number

In [51]:
# Build `data`: one cleaned entry per formatted patent number.
data = {}
k = 0  # debug counter: number of entries actually stored
for entry in data_brut:

    pubnum = entry['publication_number']
    patent_number = format_publicationnumber( pubnum )

    if patent_number in data:  # drop duplicates: only the first occurrence is kept
        continue

    new_entry = {}

    new_entry['publication_number_raw'] = pubnum

    # date: the value is sliced as a 'YYYYMMDD' string
    pub_date = entry['publication_date']
    new_entry['year'] = int( pub_date[0:4] )
    new_entry['month'] = int( pub_date[4:6] )
    new_entry['day'] = int( pub_date[6:] )

    new_entry['date_str'] = pub_date

    # info: text of the first localized title, or None when the list is empty
    titles = entry['title_localized']
    new_entry['title'] = titles[0]['text'] if titles else None

    # citations: keep only the ones that carry a publication number
    new_entry['citation'] = [
        {
            'publication_number_raw': cit['publication_number'],
            'patent_number': format_publicationnumber( cit['publication_number'] ),
            'category': cit['category'],
        }
        for cit in entry['citation']
        if cit['publication_number']
    ]

    # CPC
    new_entry['cpc'] = entry['cpc']

    # save
    data[ patent_number ] = new_entry

    k += 1  # debug
print(len(data))

379


In [53]:
# Display one raw record (index 40) of the loaded data for inspection.
# NOTE(review): this rebinds `entry`, which is also the loop variable of the
# `for entry in data_brut` loop.
entry = data_brut[40]
entry

{'citation': [{'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-853832-A',
   'type': ''},
  {'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-4117854-A',
   'type': ''},
  {'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-4753253-A',
   'type': ''},
  {'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-5161552-A',
   'type': ''},
  {'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-2006260629-A1',
   'type': ''},
  {'application_number': '',
   'category': 'PRS',
   'filing_date': '0',
   'npl_text': '',
   'publication_number': 'US-7475687-B2',
   'type': ''}],
 'cpc': [{'code': 'A45D29/06',
   'first': 'false',
   'inventive': 'true',
   'tre