In [37]:
import json
import os
import pandas as pd
from src.utils.UsefulPaths import Paths
def _two_decimals(value):
    """Render a float with exactly two digits after the decimal point."""
    return '%.2f' % value


# Show every float in DataFrame/Series output with two decimals.
pd.set_option('display.float_format', _two_decimals)

In [2]:
# Project helper exposing the file locations used throughout this notebook.
paths = Paths()

# Load the subsector definitions from JSON. The encoding is given explicitly
# because JSON is UTF-8, whereas open() would otherwise fall back to the
# platform's locale encoding and could mis-decode non-ASCII text.
with open(paths.json_subsectors, 'r', encoding='utf-8') as file:
    subsectors = json.load(file)

In [3]:
# Read the raw abstracts table from the CSV located at paths.raw_abstract.
df_abstract = pd.read_csv(filepath_or_buffer=paths.raw_abstract)

In [4]:
# Preview the first five rows of the abstracts table.
df_abstract.head(n=5)

Unnamed: 0,publication_number,abstract
0,20080063564,Embodiments of techniques for determining the ...
1,20080025285,A method for supporting frequency hopping of a...
2,20080056857,To correct any positional misalignment of a su...
3,20080031117,A holographic optical accessing system include...
4,20080056179,Transmitting an acknowledgement/negative ackno...


In [5]:
# Number of whitespace-separated tokens in each abstract.
# Missing abstracts are treated as empty text and therefore count 0 words;
# stringifying them first would turn NaN into the token "nan" and count 1.
df_abstract['word_count'] = (
    df_abstract['abstract']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
    .astype('int64')
)

In [19]:
# Summary statistics of the per-abstract word counts.
df_abstract['word_count'].describe()

count   4184916.00
mean        106.39
std          40.64
min           1.00
25%          77.00
50%         109.00
75%         139.00
max        1449.00
Name: word_count, dtype: float64

In [6]:
# (rows, columns) of the abstracts table, including the added word_count column.
df_abstract.shape

(4184916, 3)

In [7]:
# Read the raw patents table, asking pandas to parse the two date columns.
date_columns = ['grant_date', 'app_date']
df_raw_patents = pd.read_csv(paths.raw_raw_patents, parse_dates=date_columns)

In [8]:
# Remove the two location helper columns; they are not part of the final column set.
df_raw_patents = df_raw_patents.drop(columns=['GoogleCity', 'CityCountry'])

# Bring the remaining mixed-case column names in line with the snake_case ones.
df_raw_patents = df_raw_patents.rename(
    columns={
        'class_IPC_concat': 'class_ipc_concat',
        'class_IPC_distinct_count': 'class_ipc_distinct_count',
        'CountryName': 'country_name',
        'Ecosystem': 'ecosystem',
    }
)

# Build app_name with vectorised column operations instead of a row-wise
# apply(axis=1), which runs a Python function once per row.
#   both parts present -> "<first> <last>"
#   only one present   -> that part on its own (a missing first name no longer
#                         leaks the literal text "nan" into the result)
#   neither present    -> stays missing
first_name = df_raw_patents['name_first']
last_name = df_raw_patents['name_last']
has_first = first_name.notna()
has_last = last_name.notna()
full_name = first_name.astype(str) + ' ' + last_name.astype(str)
single_name = first_name.where(has_first, last_name)
df_raw_patents['app_name'] = full_name.where(has_first & has_last, single_name)

# Keep only the output columns, in their final order. name_first and name_last
# are not listed, so this selection also removes them.
output_columns = [
    'patent_id',
    'app_name',
    'app_year',
    'city',
    'country',
    'country_name',
    'ecosystem',
    'grant_date',
    'grant_year',
    'app_date',
    'class_concat',
    'class_distinct_count',
    'class_ipc_concat',
    'class_ipc_distinct_count',
]
df_raw_patents = df_raw_patents[output_columns]

# Drop fully identical rows; plain assignment is used rather than inplace=True.
df_raw_patents = df_raw_patents.drop_duplicates()

In [9]:
# (rows, columns) of the cleaned, de-duplicated patents table.
df_raw_patents.shape

(9829883, 14)

In [15]:
# Inner-join abstracts to patent records on the publication/patent identifier,
# then keep a single row per publication: if a publication matches several
# patent rows, only the first one in join order survives.
df_merged = (
    df_abstract
    .merge(
        df_raw_patents,
        how='inner',
        left_on='publication_number',
        right_on='patent_id',
    )
    .drop_duplicates(subset='publication_number', keep='first')
)
df_merged

Unnamed: 0,publication_number,abstract,word_count,patent_id,app_name,app_year,city,country,country_name,ecosystem,grant_date,grant_year,app_date,class_concat,class_distinct_count,class_ipc_concat,class_ipc_distinct_count
0,20120154258,A display device having at least a plurality o...,88,20120154258,Mitsuru Asano,2012,Kanagawa,JP,Japan,Tokyo,2012-06-21 00:00:00+00:00,2012,2012-02-29 00:00:00+00:00,G09G,1.00,G09G,1.00
1,20120179978,Method and apparatus for previewing new events...,138,20120179978,Christopher Wormald,2012,Kitchener,CA,Canada,Waterloo,2012-07-12 00:00:00+00:00,2012,2012-01-26 00:00:00+00:00,"G06F,H04W",2.00,"G06F,H04W",2.00
4,20120178784,It has been found that inhibitors of the renni...,83,20120178784,Hugh Montgomery,2012,London,GB,United Kingdom,Greater London;London,2012-07-12 00:00:00+00:00,2012,2012-01-30 00:00:00+00:00,"A61P,A61K",2.00,A61K,1.00
7,20120160478,"A well tool can include a flow path, and a flo...",59,20120160478,Bradley TODD,2012,Duncan,US,United States,,2012-06-28 00:00:00+00:00,2012,2012-02-27 00:00:00+00:00,"E21B,C01B",2.00,"C01B,E21B",2.00
11,20120176344,"According to one disclosed method, coordinates...",149,20120176344,Atid Shamaie,2012,Ottawa,CA,Canada,Ottawa,2012-07-12 00:00:00+00:00,2012,2012-03-20 00:00:00+00:00,G06F,1.00,G06F,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7854878,20200299065,A conveyor including a conveyor belt and a fir...,192,20200299065,Carter Pedersen,2020,Wahpeton,US,United States,,2020-09-24 00:00:00+00:00,2020,2020-03-04 00:00:00+00:00,B65G,1.00,B65G,1.00
7854882,20200336806,A user interface (UI) includes a video display...,141,20200336806,Dongwook YOON,2019,Vancouver,CA,Canada,Vancouver,2020-10-22 00:00:00+00:00,2020,2019-04-19 00:00:00+00:00,H04N,1.00,H04N,1.00
7854885,20200280504,A wireless network operating system for commun...,165,20200280504,Tommaso MELODIA,2018,Newton,US,United States,Boston,2020-09-03 00:00:00+00:00,2020,2018-11-29 00:00:00+00:00,"H04B,H04L,H04W",3.00,"H04B,H04L,H04W",3.00
7854887,20200302226,"In a method for failure detection, operational...",145,20200302226,Ahmed Adeniran,2019,Dhahran,SA,Saudi Arabia,,2020-09-24 00:00:00+00:00,2020,2019-07-03 00:00:00+00:00,"G06K,G06F",2.00,"G06F,G06K",2.00


In [16]:
# Keep rows whose application date is strictly after the cutoff timestamp and
# whose abstract has at least 77 words (77 equals the 25% quantile reported by
# the word_count describe() output earlier in this notebook).
filed_after_cutoff = df_merged['app_date'] > '2018-01-01 00:00:00+00:00'
long_enough = df_merged['word_count'] >= 77
df = df_merged.loc[filed_after_cutoff & long_enough]
df.shape

(553664, 17)

In [42]:
# Write the filtered table to the processed-data directory, without the index column.
abstract_patents_path = os.path.join(paths.data_processed, 'abstract_patents.csv')
df.to_csv(abstract_patents_path, index=False)

In [43]:
# Export a small sample (the first slice_number rows by position) next to the
# full file, without the index column.
slice_number = 1000
df_sliced = df.iloc[:slice_number]
sliced_path = os.path.join(paths.data_processed, 'abstract_patents_sliced.csv')
df_sliced.to_csv(sliced_path, index=False)