In [1]:
import json
import os
import pandas as pd
from src.utils.UsefulPaths import Paths
from src.utils.SpacyUtils import SpacyUtil
# Render floats with two decimals in every DataFrame/Series repr below.
pd.set_option('display.float_format', '{:.2f}'.format)

In [2]:
# Project helper exposing the data/config file locations used in this notebook.
paths = Paths()

# Load the subsector definitions. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's locale default (e.g. cp1252 on
# Windows), which can corrupt or reject non-ASCII characters in the JSON.
with open(paths.json_subsectors, 'r', encoding='utf-8') as file:
    subsectors = json.load(file)

In [3]:
# Raw patent abstracts, read from the CSV located by the project paths helper.
df_abstract = pd.read_csv(filepath_or_buffer=paths.raw_abstract)

In [4]:
# Peek at the first five rows (head() defaults to n=5).
df_abstract.head()

Unnamed: 0,publication_number,abstract
0,20080063564,Embodiments of techniques for determining the ...
1,20080025285,A method for supporting frequency hopping of a...
2,20080056857,To correct any positional misalignment of a su...
3,20080031117,A holographic optical accessing system include...
4,20080056179,Transmitting an acknowledgement/negative ackno...


In [5]:
# Whitespace-delimited word count per abstract.
# Missing abstracts are counted as 0 words: without the guard, str(NaN) is the
# literal text 'nan', which would be reported as a one-word abstract.
df_abstract['word_count'] = df_abstract['abstract'].apply(
    lambda text: len(str(text).split()) if pd.notna(text) else 0
)

In [6]:
# Summary statistics of abstract length in words.
df_abstract['word_count'].describe()

count   4184916.00
mean        106.39
std          40.64
min           1.00
25%          77.00
50%         109.00
75%         139.00
max        1449.00
Name: word_count, dtype: float64

In [7]:
# (rows, columns) of the abstracts frame, including the derived word_count column.
df_abstract.shape

(4184916, 3)

In [8]:
# Raw patent records; the grant and application dates are parsed into
# datetimes at load time so they can be compared as dates later on.
df_raw_patents = pd.read_csv(
    paths.raw_raw_patents,
    parse_dates=['grant_date', 'app_date'],
)

In [9]:
def _compose_app_name(first_name, last_name):
    """Build the applicant display name from first- and last-name Series.

    Args:
        first_name: Series of first names (may contain missing values).
        last_name: Series of last names (may contain missing values),
            aligned with ``first_name``.

    Returns:
        A Series holding ``"<first> <last>"`` where both parts are present,
        the single available part where only one is present, and a missing
        value where both are missing.
    """
    both_present = first_name.notna() & last_name.notna()
    # Vectorized concatenation; rows lacking either part are replaced below,
    # so a missing first name never surfaces as the literal text 'nan'.
    full_name = first_name.astype(str) + ' ' + last_name.astype(str)
    return full_name.where(both_present, first_name.fillna(last_name))


# Normalize column names, derive app_name, keep only the columns used
# downstream, and remove exact duplicate rows.
# The explicit column selection also discards 'GoogleCity', 'CityCountry',
# 'name_first' and 'name_last', so no separate drop calls are needed.
df_raw_patents = (
    df_raw_patents
    .rename(
        columns={
            'class_IPC_concat': 'class_ipc_concat',
            'class_IPC_distinct_count': 'class_ipc_distinct_count',
            'CountryName': 'country_name',
            'Ecosystem': 'ecosystem',
        }
    )
    .assign(
        app_name=lambda frame: _compose_app_name(frame['name_first'], frame['name_last'])
    )
    .loc[:, [
        'patent_id',
        'app_name',
        'app_year',
        'city',
        'country',
        'country_name',
        'ecosystem',
        'grant_date',
        'grant_year',
        'app_date',
        'class_concat',
        'class_distinct_count',
        'class_ipc_concat',
        'class_ipc_distinct_count',
    ]]
    .drop_duplicates()
)

In [10]:
# (rows, columns) of the cleaned, de-duplicated patent records.
df_raw_patents.shape

(9829883, 14)

In [11]:
# Attach patent metadata to each abstract (publication_number == patent_id).
# A publication can match several patent rows; only the first match per
# publication_number is kept, and the now-redundant join key is removed.
df_merged = (
    df_abstract
    .merge(
        df_raw_patents,
        how='inner',
        left_on='publication_number',
        right_on='patent_id',
    )
    .drop_duplicates(subset='publication_number', keep='first')
    .drop(columns='patent_id')
)
df_merged

Unnamed: 0,publication_number,abstract,word_count,app_name,app_year,city,country,country_name,ecosystem,grant_date,grant_year,app_date,class_concat,class_distinct_count,class_ipc_concat,class_ipc_distinct_count
0,20120154258,A display device having at least a plurality o...,88,Mitsuru Asano,2012,Kanagawa,JP,Japan,Tokyo,2012-06-21 00:00:00+00:00,2012,2012-02-29 00:00:00+00:00,G09G,1.00,G09G,1.00
1,20120179978,Method and apparatus for previewing new events...,138,Christopher Wormald,2012,Kitchener,CA,Canada,Waterloo,2012-07-12 00:00:00+00:00,2012,2012-01-26 00:00:00+00:00,"G06F,H04W",2.00,"G06F,H04W",2.00
4,20120178784,It has been found that inhibitors of the renni...,83,Hugh Montgomery,2012,London,GB,United Kingdom,Greater London;London,2012-07-12 00:00:00+00:00,2012,2012-01-30 00:00:00+00:00,"A61P,A61K",2.00,A61K,1.00
7,20120160478,"A well tool can include a flow path, and a flo...",59,Bradley TODD,2012,Duncan,US,United States,,2012-06-28 00:00:00+00:00,2012,2012-02-27 00:00:00+00:00,"E21B,C01B",2.00,"C01B,E21B",2.00
11,20120176344,"According to one disclosed method, coordinates...",149,Atid Shamaie,2012,Ottawa,CA,Canada,Ottawa,2012-07-12 00:00:00+00:00,2012,2012-03-20 00:00:00+00:00,G06F,1.00,G06F,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7854878,20200299065,A conveyor including a conveyor belt and a fir...,192,Carter Pedersen,2020,Wahpeton,US,United States,,2020-09-24 00:00:00+00:00,2020,2020-03-04 00:00:00+00:00,B65G,1.00,B65G,1.00
7854882,20200336806,A user interface (UI) includes a video display...,141,Dongwook YOON,2019,Vancouver,CA,Canada,Vancouver,2020-10-22 00:00:00+00:00,2020,2019-04-19 00:00:00+00:00,H04N,1.00,H04N,1.00
7854885,20200280504,A wireless network operating system for commun...,165,Tommaso MELODIA,2018,Newton,US,United States,Boston,2020-09-03 00:00:00+00:00,2020,2018-11-29 00:00:00+00:00,"H04B,H04L,H04W",3.00,"H04B,H04L,H04W",3.00
7854887,20200302226,"In a method for failure detection, operational...",145,Ahmed Adeniran,2019,Dhahran,SA,Saudi Arabia,,2020-09-24 00:00:00+00:00,2020,2019-07-03 00:00:00+00:00,"G06K,G06F",2.00,"G06F,G06K",2.00


In [12]:
# Keep applications dated after 2018-01-01 whose abstract has at least 77
# words (77 is the 25th percentile reported by word_count.describe() above).
# NOTE(review): the comparison is strict, and the displayed app_date values
# sit at midnight, so applications dated exactly 2018-01-01 are excluded —
# confirm this is intended.
df = df_merged.loc[
    (df_merged['app_date'] > '2018-01-01 00:00:00+00:00')
    & (df_merged['word_count'] >= 77)
]
df.shape

(553664, 16)

In [13]:
# Persist the filtered dataset (index omitted) to the processed-data folder.
df.to_csv(
    path_or_buf=os.path.join(paths.data_processed, 'abstract_patents.csv'),
    index=False,
)

In [14]:
# Work on the first `slice_number` rows only. The slice is positional (not a
# random sample) and is copied so that columns can be added to it safely.
slice_number = 25000
df_sliced = df.iloc[:slice_number].copy()
df_sliced.to_csv(
    os.path.join(paths.data_processed, f'abstract_patents_sliced_{slice_number}.csv'),
    index=False,
)

In [15]:
# Configure the project's spaCy wrapper with the small English model and the
# lemma / stop-word / lower-casing options enabled; number removal is off.
spacy_util = SpacyUtil(
    model='en_core_web_sm',
    lemma=True,
    remove_stopwords=True,
    lower=True,
    remove_numbers=False,
)

# Run the preprocessing on every abstract in the slice.
df_sliced['processed_abstract'] = df_sliced['abstract'].apply(
    spacy_util.preprocess_text
)

In [17]:
# Number of whitespace-separated tokens left after preprocessing.
df_sliced['token_count'] = df_sliced['processed_abstract'].map(
    lambda processed: len(str(processed).split())
)

In [18]:
# Display the slice, now including processed_abstract and token_count.
df_sliced

Unnamed: 0,publication_number,abstract,word_count,app_name,app_year,city,country,country_name,ecosystem,grant_date,grant_year,app_date,class_concat,class_distinct_count,class_ipc_concat,class_ipc_distinct_count,processed_abstract,token_count
4655632,20180152842,"A method, and a mobile device adapted thereto,...",80,Moorim Kim,2018,Seoul,KR,"Korea, Republic of",Seoul,2018-05-31 00:00:00+00:00,2018,2018-01-29 00:00:00+00:00,"H04M,H04W,G06F",3.00,"G06F,H04M,H04W",3.00,method mobile device adapt thereto verify user...,46
4655716,20180140889,A dumbbell with a selectable number of weight ...,131,Per HÖGLUND,2018,Jonkoping,SE,Sweden,Jonkoping,2018-05-24 00:00:00+00:00,2018,2018-01-03 00:00:00+00:00,A63B,1.00,A63B,1.00,dumbbell selectable number weight disk include...,76
4655724,20180143803,A virtual assistant AI system that may be conn...,83,David Placa,2018,San Francisco,US,United States,Silicon Valley,2018-05-24 00:00:00+00:00,2018,2018-01-16 00:00:00+00:00,G06F,1.00,G06F,1.00,virtual assistant ai system connect wide varie...,49
4655776,20180169402,Connector assemblies that are separate from me...,101,Patrick Wells,2018,Columbia Heights,US,United States,Minneapolis,2018-06-21 00:00:00+00:00,2018,2018-02-15 00:00:00+00:00,"H01R,A61N",2.00,"A61N,H01R",2.00,connector assembly separate medical lead exten...,60
4655913,20180163931,"An LED tube lamp, comprising a lamp tube, whic...",130,Aiming Xiong,2018,Jiaxing,CN,China,Hangzhou,2018-06-14 00:00:00+00:00,2018,2018-02-06 00:00:00+00:00,"F21Y,F21K,F21V,H05B",4.00,"F21K,F21V,F21Y,H05B",4.00,led tube lamp comprise lamp tube include light...,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5268976,20180271212,"A comfort system for Cowboy and work boots, in...",106,Billy Lovell,2018,Azle,US,United States,Dallas,2018-09-27 00:00:00+00:00,2018,2018-05-16 00:00:00+00:00,A43B,1.00,A43B,1.00,comfort system cowboy work boot include insole...,64
5268979,20180211124,An automated ridesharing dispatch system inclu...,145,Daniel Ramot,2018,Kfar Saba,IL,Israel,Tel Aviv,2018-07-26 00:00:00+00:00,2018,2018-03-07 00:00:00+00:00,"G08G,G01C,G06Q,G06K,B60N",5.00,"B60N,G01C,G06K,G06Q,G08G",5.00,automate ridesharing dispatch system include c...,78
5268983,20180205352,An audio amplifier system includes a delta-sig...,154,Robert McKENZIE,2018,Toronto,CA,Canada,Toronto,2018-07-19 00:00:00+00:00,2018,2018-01-17 00:00:00+00:00,"G06F,H03L,H04B,H03M,H03F,H04L",6.00,"G06F,H03F,H03L,H03M,H04B,H04L",6.00,audio amplifier system include delta sigma mod...,108
5268986,20180289740,A dietary nutritional supplement that provides...,119,Mark Force,2018,Ashland,US,United States,,2018-10-11 00:00:00+00:00,2018,2018-06-13 00:00:00+00:00,A61K,1.00,A61K,1.00,dietary nutritional supplement provide relief ...,75


In [19]:
# Export progressively larger prefixes of the processed slice, one CSV per
# size. A single loop replaces four copy-pasted statements whose f-strings
# had no placeholders; the file names produced are unchanged.
for subset_size in (1000, 5000, 10000, 15000):
    df_sliced[:subset_size].to_csv(
        os.path.join(paths.data_processed, f'abstract_patents_sliced_{subset_size}.csv'),
        index=False,
    )