In [1]:
# import standard libraries
import os
import numpy as np
# import third-party libraries
import gensim
from gensim import corpora
import pandas as pd
# import local libraries

# Topic extraction

In [2]:
# Read the gzip-compressed patents CSV into a DataFrame.
patents = pd.read_csv(
    'data/patents/clean/patents.csv.gz',
    compression='gzip',
)

In [3]:
# Tokenise each patent's text on single spaces, replacing the raw string in
# the 'text' column with a list of tokens.
# The isinstance guard makes this cell safe to re-run: because the column is
# overwritten in place, a second execution would otherwise call `.split` on
# the already-built lists and raise AttributeError. Non-string values
# (existing token lists, missing values) are passed through unchanged.
patents['text'] = patents['text'].apply(
    lambda text: text.split(' ') if isinstance(text, str) else text
)

In [6]:
# Collect the 'text' column into a plain Python list, one entry per patent.
all_text = patents['text'].tolist()

In [7]:
# Build the token <-> integer-id mapping from all patent texts, then encode
# every patent as a bag-of-words vector using that mapping.
dictionary = corpora.Dictionary(all_text)
corpus = list(map(dictionary.doc2bow, all_text))

In [8]:
# Train the LDA topic model, or reuse a previously saved one.
# Training makes 15 passes over the corpus, so the fitted model is cached on
# disk under a file name derived from the number of topics.
# NOTE(review): the cache key ignores the corpus/dictionary contents; if the
# patent data changes, delete the cached file so the model is retrained.
num_topics = 100
random_seed = 42  # fixed seed so a fresh training run is reproducible
filename = f'data/nlp/model{num_topics}.gensim'
if os.path.exists(filename):
    ldamodel = gensim.models.ldamodel.LdaModel.load(filename)
else:
    # Create the output directory *before* training so that the save step
    # cannot fail on a missing directory after the long fit has finished.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=random_seed,
    )
    ldamodel.save(filename)

In [31]:
# Show 20 of the model's topics, each summarised by its 3 highest-weighted
# terms, with a blank line after every topic.
topics = ldamodel.print_topics(num_words=3, num_topics=20)
for topic in topics:
    print(topic, end='\n\n')

(55, '0.203*"quality" + 0.065*"obesity" + 0.051*"precursor"')

(98, '0.131*"particle" + 0.083*"electrophoretic" + 0.081*"specificity"')

(54, '0.284*"receptor" + 0.097*"antagonist" + 0.088*"compound"')

(33, '0.192*"crystal" + 0.167*"liquid" + 0.152*"display"')

(65, '0.163*"double" + 0.094*"estrogen" + 0.085*"biocompatible"')

(0, '0.216*"form" + 0.062*"crystalline" + 0.054*"phenyl"')

(70, '0.074*"mold" + 0.070*"polymerization" + 0.056*"thus"')

(30, '0.256*"portion" + 0.193*"surface" + 0.063*"delivery"')

(76, '0.228*"drug" + 0.126*"property" + 0.065*"especially"')

(64, '0.149*"life" + 0.113*"hemoglobin" + 0.083*"activation"')

(69, '0.181*"contact" + 0.106*"silicone" + 0.086*"hydrogel"')

(13, '0.124*"chronic" + 0.099*"step" + 0.097*"phase"')

(96, '0.120*"method" + 0.066*"composition" + 0.058*"host"')

(66, '0.150*"capsule" + 0.074*"psa" + 0.064*"formulation"')

(40, '0.136*"carrier" + 0.123*"skin" + 0.083*"oxide"')

(50, '0.095*"composition" + 0.081*"vaccine" + 0.070*"immunogeni

In [10]:
# Index of the single patent to inspect; used below to look up that patent's
# tokens, title and abstract in `patents`.
i = 1234

In [25]:
# Infer the topic mixture of the selected patent (keeping only topics with
# probability >= 0.1) and print it, followed by the patent's title and
# abstract so the assignment can be checked by eye.
new_doc_bow = dictionary.doc2bow(patents['text'][i])
doc_topics = ldamodel.get_document_topics(new_doc_bow, minimum_probability=0.1)
print(doc_topics)
for column in ('patent_title', 'patent_abstract'):
    print(patents[column][i])

[(43, 0.12708373), (54, 0.1511013), (74, 0.462142), (91, 0.176979)]
High-affinity human antibodies to human protease-activated receptor-2
The present invention provides antibodies that bind to protease-activated receptor-2 (PAR-2) and methods of using same. According to certain embodiments of the invention, the antibodies are fully human antibodies that bind to human PAR-2. The antibodies of the invention are useful for the treatment of diseases and disorders associated with one or more PAR-2 biological activities, including the treatment of pain conditions, inflammatory conditions and gastrointestinal conditions.
