In [1]:
# import standard libraries
import os
import numpy as np
# import third-party libraries
import gensim
from gensim import corpora
import pandas as pd
# import local libraries

# Topic extraction

In [2]:
# read the cleaned patent table from its gzip-compressed CSV file
patents = pd.read_csv(
    'data/patents/clean/patents.csv.gz',
    compression='gzip',
)

In [3]:
# Split each patent text into tokens on single spaces.
# Rows that already hold a token list are passed through unchanged so the
# cell can be re-run safely; previously a second run raised AttributeError
# because list objects have no `.split`.
patents['text'] = patents['text'].apply(
    lambda x: x if isinstance(x, list) else x.split(' ')
)

In [4]:
# gather the per-patent token lists into one plain Python list
all_text = patents['text'].tolist()

In [5]:
# build the token <-> id mapping from the patent text, then encode every
# patent as a bag-of-words vector using that mapping
dictionary = corpora.Dictionary(all_text)
corpus = list(map(dictionary.doc2bow, all_text))

In [14]:
# Train an LDA topic model on the patent corpus, or reuse the copy cached on
# disk so that a full re-run does not repeat the training passes.
# NOTE: the cache file name depends on the topic count only. Delete the file
# after changing the corpus or any other training setting, otherwise the
# previously saved model is loaded instead.
num_topics = 10
filename = f'data/nlp/model{num_topics}.gensim'
if not os.path.exists(filename):
    # Fixed seed: LDA training is stochastic, so without it every fresh
    # training run would produce different topics.
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=42,
    )
    # make sure the target directory exists before writing the model
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    ldamodel.save(filename)
else:
    ldamodel = gensim.models.ldamodel.LdaModel.load(filename)

In [15]:
# show the three highest-weighted words of every topic, with a blank line
# after each entry
topics = ldamodel.print_topics(num_words=3, num_topics=num_topics)
for topic in topics:
    print(topic, end='\n\n')

(0, '0.118*"device" + 0.062*"electronic" + 0.054*"compound"')

(1, '0.037*"antibody" + 0.034*"method" + 0.031*"complement"')

(2, '0.046*"liquid" + 0.040*"medium" + 0.036*"display"')

(3, '0.041*"light" + 0.028*"eculizumab" + 0.020*"least"')

(4, '0.073*"compound" + 0.044*"composition" + 0.040*"receptor"')

(5, '0.039*"composition" + 0.035*"method" + 0.031*"solution"')

(6, '0.081*"antibody" + 0.035*"method" + 0.029*"cell"')

(7, '0.028*"device" + 0.021*"system" + 0.021*"position"')

(8, '0.072*"polymer" + 0.041*"human" + 0.034*"method"')

(9, '0.092*"compound" + 0.045*"formula" + 0.039*"inhibitor"')



In [16]:
# index of the single patent inspected below; used to look up its tokens,
# title and abstract in `patents`
# NOTE(review): assumes 1234 is a valid row label of `patents` — confirm
i = 1234

In [17]:
# infer the topic mixture of patent `i` (topics below 10 % are dropped) and
# print it together with the patent's title and abstract for a sanity check
new_doc_bow = dictionary.doc2bow(patents.loc[i, 'text'])
print(ldamodel.get_document_topics(new_doc_bow, minimum_probability=0.1))
for column in ('patent_title', 'patent_abstract'):
    print(patents.loc[i, column])

[(1, 0.45028755), (4, 0.11288472), (6, 0.34613568)]
High-affinity human antibodies to human protease-activated receptor-2
The present invention provides antibodies that bind to protease-activated receptor-2 (PAR-2) and methods of using same. According to certain embodiments of the invention, the antibodies are fully human antibodies that bind to human PAR-2. The antibodies of the invention are useful for the treatment of diseases and disorders associated with one or more PAR-2 biological activities, including the treatment of pain conditions, inflammatory conditions and gastrointestinal conditions.
