In [1]:
import re
import spacy
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm
from itertools import chain
from ast import literal_eval
from tqdm.notebook import tqdm
import multiprocessing
tqdm.pandas()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the raw patent table from data/patents.csv.
# NOTE(review): the frame preview further down lists an "Unnamed: 0" column,
# which suggests the CSV carries a saved index column — confirm whether it
# should be read with index_col=0 or dropped.
df = pd.read_csv('data/patents.csv')

In [3]:
# Matches HTML comments and opening / closing / self-closing tags (plus
# declarations such as <!DOCTYPE ...> and <?xml ...?>). A letter is required
# right after the opening bracket, so stray angle brackets in running text
# such as "x < 5 and y > 3" are left alone. [^<>] also matches newlines, so a
# tag that is wrapped over several lines is still removed.
_HTML_TAG_RE = re.compile(r'<!--.*?-->|<[!?/]?[A-Za-z][^<>]*>', re.DOTALL)


def filter_tags(text):
    """Remove HTML tags from ``text``.

    Args:
        text: The raw string. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing cell, or ``None``) is treated
            as an empty text instead of raising ``TypeError``.

    Returns:
        ``text`` with every HTML tag and comment deleted (nothing is inserted
        in their place); ``''`` for non-string input.
    """
    if not isinstance(text, str):
        return ''
    return _HTML_TAG_RE.sub('', text)

In [4]:
# Strip markup from the abstracts. Missing abstracts are replaced with '' first
# so that filter_tags always receives a string.
df['Abstract'] = df['Abstract'].fillna('').apply(filter_tags)

# Build one lowercase "title. abstract" string per patent. A missing title
# would otherwise turn the whole concatenation into NaN, so it is blanked too;
# every row of CombinedText is therefore a real string.
df['CombinedText'] = (df['Title'].fillna('') + '. ' + df['Abstract']).str.lower()

# Lemmatization

spaCy's `nlp.pipe` is run across several worker processes to make lemmatizing the corpus faster.

In [5]:
# Worker processes handed to nlp.pipe: all cores but two, and never fewer than one.
core_count = max(multiprocessing.cpu_count() - 2, 1)

# Only token lemmas are read from the processed docs, so the dependency parser
# and the named-entity recogniser are disabled; the tagger / attribute ruler /
# lemmatizer components that produce the lemmas stay active. Disabled
# components remain loaded and can be re-enabled with nlp.enable_pipe(...).
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])  # Can use other spacy models for lemmatization

In [6]:
def lemmatize_text(text, nlp):
    """Lemmatize one text and return the lemmas as a single string.

    Args:
        text: The string to process.
        nlp: A callable that turns ``text`` into an iterable of tokens, each
            exposing a ``lemma_`` attribute.

    Returns:
        The lemmas of all tokens, in order, joined by single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [7]:
# Stream every combined title/abstract through the spaCy pipeline using
# core_count worker processes, and keep just the lemma of each token:
# the result is one list of lemmas per patent, in the frame's row order.
patents = df['CombinedText']
lemmatized_patents = [
    [tok.lemma_ for tok in doc]
    for doc in tqdm(
        nlp.pipe(patents, n_process=core_count, batch_size=64),
        total=len(patents),
    )
]

  0%|          | 0/578401 [00:00<?, ?it/s]

# Clean Tokens

In [11]:
from spacy.lang.en.stop_words import STOP_WORDS
# Local set copy of spaCy's English stop words; clean_tokens tests tokens against it.
stop_words = set(STOP_WORDS)

In [12]:
# Two-letter tokens that are kept even though every other token of length two
# or less is discarded.
# NOTE(review): 'it' is normally also an English stop word; if the stop-word
# set contains it, this whitelist entry has no effect — confirm intent.
_TWO_CHAR_KEEP = frozenset(['ai', 'ip', 'it', 'pc', 'io', 'ar', 'vr', 'ml', 'qc', 'rf', 'ux', 'ui'])

# A valid token consists solely of ASCII letters and hyphens and contains at
# least one letter (so '-', '--', '---' ... are rejected).
_TOKEN_RE = re.compile(r'-*[a-zA-Z][a-zA-Z-]*')


def clean_tokens(tokens, stopwords=None):
    """Drop stop words, non-alphabetic tokens and very short tokens.

    A token survives when all of the following hold:

    * it is not contained in ``stopwords``;
    * it is made up only of ASCII letters and hyphens, with at least one
      letter (the whole token must match — a trailing newline is not
      tolerated);
    * it is longer than two characters, or it is one of the whitelisted
      two-letter abbreviations in ``_TWO_CHAR_KEEP``.

    Args:
        tokens: Iterable of token strings.
        stopwords: Optional collection of tokens to discard. When omitted, the
            module-level ``stop_words`` set is used, which keeps existing
            one-argument calls working unchanged.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    if stopwords is None:
        stopwords = stop_words
    return [
        word for word in tokens
        if word not in stopwords
        and _TOKEN_RE.fullmatch(word)
        and (len(word) > 2 or word in _TWO_CHAR_KEEP)
    ]

def split_hyphenated_tokens(tokens):
    """Break hyphenated tokens into their non-empty parts.

    Tokens such as ``ml-`` keep only the ``ml`` part, and a hyphenated word
    such as ``word-word2`` becomes two separate tokens. Tokens that contain no
    hyphen are passed through untouched.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list holding the resulting tokens in their original order.
    """
    def pieces(token):
        # Hyphen-free tokens are handed back exactly as they came in.
        if "-" not in token:
            return [token]
        # Leading, trailing or doubled hyphens produce '' fragments; drop them.
        return [piece for piece in token.split("-") if piece]

    return [piece for token in tokens for piece in pieces(token)]

In [13]:
# Quick sanity check of split_hyphenated_tokens on an inner hyphen, a trailing
# hyphen, a leading hyphen and hyphen-free tokens; the cell's last expression
# displays the result.
test = ["word-word2", "tech-", "-data", "Io", "word"]
split_hyphenated_tokens(test)

['word', 'word2', 'tech', 'data', 'Io', 'word']

In [14]:
# Split hyphenated lemmas in every patent's token list.
dehyphenated_tokens = list(map(split_hyphenated_tokens, lemmatized_patents))

In [15]:
# Apply the stop-word / character / length filter to every patent's token list.
cleaned_tokens = list(map(clean_tokens, dehyphenated_tokens))

In [16]:
# Collect the rare tokens: every token whose corpus-wide frequency is below
# min_threshold (with the value 2 these are the tokens that occur exactly once).
min_threshold = 2
# Counter consumes the chained iterator directly, so no flattened copy of the
# whole corpus has to be built in memory first.
counts = Counter(chain.from_iterable(cleaned_tokens))
filtered_counts = {token: count for token, count in counts.items() if count < min_threshold}
tokens_to_remove = set(filtered_counts)

In [17]:
def remove_low(tokens):
    """Return ``tokens`` without the entries listed in ``tokens_to_remove``.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept = []
    for word in tokens:
        if word in tokens_to_remove:
            continue
        kept.append(word)
    return kept

In [18]:
# Drop the rare tokens from every patent and store the result, one token list
# per row, in a new column of the frame.
no_low_tokens = list(map(remove_low, cleaned_tokens))
df['LemmatizedTokens'] = no_low_tokens

In [19]:
# Display the frame, which now includes the LemmatizedTokens column.
df

Unnamed: 0,PatentNumber,EarliestPubDate,Title,Abstract,PubYear,CombinedText,LemmatizedTokens
0,TW126293 B,19900101,With expert system for process control capabil...,"With the rules of an expert system, which is d...",1990,with expert system for process control capabil...,"[expert, system, process, control, capability,..."
1,US4891782 A,19900102,PARALLEL NEURAL NETWORK FOR A FULL BINARY ADDER,A method for performing the addition of two N-...,1990,parallel neural network for a full binary adde...,"[parallel, neural, network, binary, adder, met..."
2,US4891762 A,19900102,"METHOD AND APPARATUS FOR TRACKING, MAPPING AND...",A method and apparatus for the identification ...,1990,"method and apparatus for tracking, mapping and...","[method, apparatus, tracking, mapping, recogni..."
3,US5004932 A,19900103,UNIT CIRCUIT FOR CONSTRUCTING A NEURAL NETWORK...,A semiconductor integrated circuit for constru...,1990,unit circuit for constructing a neural network...,"[unit, circuit, construct, neural, network, se..."
4,US5058168 A,19900103,OVERFLOW SPEECH DETECTING APPARATUS FOR SPEECH...,Time-serial pattern data of feature parameters...,1990,overflow speech detecting apparatus for speech...,"[overflow, speech, detect, apparatus, speech, ..."
...,...,...,...,...,...,...,...
578396,SE2250815A1,20231231,A SYSTEM AND METHOD FOR FIRE DETECTION,The present disclosure relates to a system (10...,2023,a system and method for fire detection. the pr...,"[system, method, fire, detection, present, dis..."
578397,DZ12209A,20231231,AN INTELLIGENT ELECTRONIC SYSTEM TO FIGHT AGAI...,Bird damage can be a serious problem for grain...,2023,an intelligent electronic system to fight agai...,"[intelligent, electronic, system, fight, bird,..."
578398,BA233551A,20231231,RAILWAY TRAIN EQUIPMENT FOR OBJECT DETECTION A...,The device of a railway train for the detectio...,2023,railway train equipment for object detection a...,"[railway, train, equipment, object, detection,..."
578399,IR110375B,20231231,INTELLIGENT WATER AND FLUID CONSUMPTION CONTRO...,The intelligent water and fluid consumption co...,2023,intelligent water and fluid consumption contro...,"[intelligent, water, fluid, consumption, contr..."


# Bigrams

In [20]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [21]:
# Flatten the per-patent token lists into one long token sequence.
all_tokens = [token for patent_tokens in no_low_tokens for token in patent_tokens]

In [22]:
# Count bigrams patent by patent. from_documents never pairs the last token of
# one patent with the first token of the next one, which counting over a
# single flattened token stream would do.
bigram_finder = BigramCollocationFinder.from_documents(no_low_tokens)
bigram_finder.apply_freq_filter(3)  # Filter bigrams that occur less than 3 times
# Keep the 200 best-scoring bigrams, ranked by likelihood ratio.
bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 200)

In [29]:
# Display the ranked bigrams for inspection.
bigrams

[('neural', 'network'),
 ('present', 'invention'),
 ('artificial', 'intelligence'),
 ('machine', 'learning'),
 ('storage', 'medium'),
 ('invention', 'relate'),
 ('real', 'time'),
 ('invention', 'disclose'),
 ('unmanned', 'aerial'),
 ('method', 'comprise'),
 ('deep', 'learning'),
 ('system', 'method'),
 ('machine', 'learn'),
 ('following', 'step'),
 ('aerial', 'vehicle'),
 ('present', 'disclosure'),
 ('comprise', 'step'),
 ('method', 'apparatus'),
 ('computer', 'readable'),
 ('convolutional', 'neural'),
 ('comprise', 'following'),
 ('speech', 'recognition'),
 ('electronic', 'device'),
 ('learning', 'model'),
 ('natural', 'language'),
 ('invention', 'provide'),
 ('technical', 'field'),
 ('computer', 'program'),
 ('method', 'include'),
 ('embodiment', 'present'),
 ('method', 'system'),
 ('feature', 'vector'),
 ('readable', 'storage'),
 ('method', 'device'),
 ('accord', 'embodiment'),
 ('time', 'series'),
 ('point', 'cloud'),
 ('computer', 'implement'),
 ('feature', 'extraction'),
 ('follo

In [28]:
# Hand-curated bigrams to exclude from the final bigram set, e.g.
# ('present', 'invention') or ('method', 'comprise').
# A manual list is not ideal, but acceptable here.
bad_bigrams =[
    ('accord', 'embodiment'),
    ('accord', 'present'),
    ('apparatus', 'include'),
    ('belong', 'field'),
    ('comprise', 'follow'),
    ('comprise', 'following'),
    ('configure', 'receive'),
    ('disclosure', 'provide'),
    ('disclosure', 'relate'),
    ('embodiment', 'present'),
    ('far', 'include'),
    ('implement', 'method'),
    ('include', 'receive'),
    ('invention', 'disclose'),
    ('invention', 'provide'),
    ('invention', 'relate'),
    ('method', 'apparatus'),
    ('method', 'comprise'),
    ('method', 'include'),
    ('method', 'thereof'),
    ('present', 'application'),
    ('present', 'disclosure'),
    ('present', 'invention'),
    ('question', 'answer'),
    ('relate', 'method'),
    ('solve', 'problem'),
    ('step', 'acquire'),
    ('step', 'firstly'),
    ('step', 'obtain'),
    ('system', 'include'),
    ('use', 'artificial'),
    ('use', 'machine')
]

In [31]:
# Remove the hand-picked bad bigrams while keeping the ranking order of
# `bigrams`. Going through a set difference would return the same pairs, but in
# an order that can change from one kernel session to the next.
excluded_bigrams = set(bad_bigrams)
final_bigrams = [
    bigram for bigram in dict.fromkeys(bigrams)  # dict.fromkeys: de-duplicate, keep order
    if bigram not in excluded_bigrams
]
len(final_bigrams)

168

In [32]:
def detect_bigrams(text, bigrams_set):
    """Replace known adjacent token pairs with a single ``first_second`` token.

    Every adjacent pair ``(text[i], text[i + 1])`` found in ``bigrams_set`` is
    emitted as ``"first_second"``. Overlapping pairs are all emitted, so
    ``a b c`` with both ``(a, b)`` and ``(b, c)`` known yields ``a_b, b_c``.
    A token is emitted on its own only when it neither starts a known pair
    nor was the second half of the pair emitted just before it.

    Args:
        text: Sequence of token strings.
        bigrams_set: Container of ``(first, second)`` tuples supporting ``in``.

    Returns:
        A new list of tokens with the known bigrams merged.
    """
    merged = []
    last_index = len(text) - 1
    # True while the current token has already been used as the second half
    # of the bigram emitted in the previous iteration.
    consumed_by_previous = False

    for position, token in enumerate(text):
        if position < last_index and (token, text[position + 1]) in bigrams_set:
            merged.append(f"{token}_{text[position + 1]}")
            consumed_by_previous = True
        elif consumed_by_previous:
            # Already represented inside the previous bigram: emit nothing.
            consumed_by_previous = False
        else:
            merged.append(token)

    return merged

In [34]:
# Small hand-made example covering overlapping bigrams and repeated tokens.
text_sample = ['unmanned', 'aerial', 'vehicle', 'neural', 'neural', 'network','network']

# Print the input and the merged output one after the other for comparison.
print(text_sample)
print(detect_bigrams(text_sample,final_bigrams))

['unmanned', 'aerial', 'vehicle', 'neural', 'neural', 'network', 'network']
['unmanned_aerial', 'aerial_vehicle', 'neural', 'neural_network', 'network']


In [35]:
# Convert the bigram list to a set once, so each membership test inside
# detect_bigrams is a hash lookup instead of a scan through a list.
bigrams_set = set(final_bigrams)
# Merge the known bigrams in every patent's token list. progress_apply is the
# progress-bar variant of Series.apply made available by tqdm.pandas().
df['BigramsTokens'] = df['LemmatizedTokens'].progress_apply(lambda x: detect_bigrams(x, bigrams_set))

  0%|          | 0/578401 [00:00<?, ?it/s]

In [36]:
# Write the processed table to disk without the index.
# NOTE(review): LemmatizedTokens and BigramsTokens hold Python lists; a CSV
# stores them as text, so a reader will presumably have to parse them back
# (e.g. with ast.literal_eval) — confirm downstream handling.
df.to_csv('data/Cleaned_Patents.csv',index=False)