In [1]:
# import standard libraries
import os
import numpy as np
import pickle
# import third-party libraries
import gensim
from gensim import corpora
import pandas as pd
# import local libraries

# Topic extraction

In [2]:
# read the cleaned, gzip-compressed patent table into a DataFrame
patents = pd.read_csv(
    'data/patents/clean/patents.csv.gz',
    compression='gzip',
)

In [3]:
# Tokenize each patent text by splitting on single spaces.
# The column is overwritten in place, so only values that are still strings are
# split: re-running this cell leaves already-tokenized rows untouched instead of
# raising AttributeError ('list' object has no attribute 'split').
patents['text'] = patents['text'].apply(
    lambda text: text.split(' ') if isinstance(text, str) else text
)

In [4]:
# collect every patent's token list into one plain Python list
all_text = patents['text'].tolist()

In [5]:
# build the token <-> id dictionary from the tokenized patents, then convert
# each patent into its bag-of-words representation
dictionary = corpora.Dictionary(all_text)
corpus = list(map(dictionary.doc2bow, all_text))

In [6]:
# create lda model, or load the one already saved on disk
# The saved model is keyed only by the number of topics.
# NOTE(review): the cached file is not invalidated when the corpus or
# dictionary changes -- delete it to force a retrain.
num_topics = 200
filename = f'data/nlp/model{num_topics}.gensim'
if not os.path.exists(filename):
    # create the output directory *before* training, so the save at the end
    # cannot fail on a missing folder after the training run has finished
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=15,
        random_state=42,  # fixed seed so retraining on the same corpus is repeatable
    )
    ldamodel.save(filename)
else:
    ldamodel = gensim.models.ldamodel.LdaModel.load(filename)

In [7]:
# show the three highest-weighted words of every topic, one topic per
# paragraph (each entry is followed by a blank line)
topics = ldamodel.print_topics(num_topics=num_topics, num_words=3)
for topic in topics:
    print(topic, end='\n\n')

(0, '0.053*"patient" + 0.051*"cell" + 0.045*"method"')

(1, '0.149*"device" + 0.059*"electronic" + 0.050*"organic"')

(2, '0.106*"solution" + 0.097*"formulation" + 0.042*"high"')

(3, '0.125*"alia" + 0.092*"inter" + 0.059*"rheumatoid"')

(4, '0.100*"light" + 0.040*"pigment" + 0.038*"chain"')

(5, '0.127*"complement" + 0.073*"method" + 0.052*"disclosure"')

(6, '0.067*"syndrome" + 0.065*"cancer" + 0.064*"disease"')

(7, '0.078*"liquid" + 0.075*"medium" + 0.057*"display"')

(8, '0.090*"least" + 0.073*"group" + 0.036*"space"')

(9, '0.082*"form" + 0.035*"composition" + 0.033*"method"')

(10, '0.058*"human" + 0.041*"non" + 0.037*"animal"')

(11, '0.202*"antibody" + 0.056*"human" + 0.050*"method"')

(12, '0.064*"protein" + 0.052*"composition" + 0.042*"method"')

(13, '0.073*"derivative" + 0.061*"alkyl" + 0.022*"capsule"')

(14, '0.150*"compound" + 0.054*"formula" + 0.051*"inhibitor"')

(15, '0.082*"compound" + 0.039*"formula" + 0.034*"composition"')

(16, '0.069*"system" + 0.028*"portion" +

In [8]:
# index label of the patent that the next cell looks up and prints
i = 1234

In [9]:
# sanity check: infer the topic mixture of patent `i` (keeping only topics with
# probability >= 0.1) and print it next to the patent's title and abstract
new_doc_bow = dictionary.doc2bow(patents.loc[i, 'text'])
doc_topics = ldamodel.get_document_topics(new_doc_bow, minimum_probability=0.1)
print(doc_topics)
print(patents.loc[i, 'patent_title'])
print(patents.loc[i, 'patent_abstract'])

[(11, 0.97432303)]
High-affinity human antibodies to human protease-activated receptor-2
The present invention provides antibodies that bind to protease-activated receptor-2 (PAR-2) and methods of using same. According to certain embodiments of the invention, the antibodies are fully human antibodies that bind to human PAR-2. The antibodies of the invention are useful for the treatment of diseases and disorders associated with one or more PAR-2 biological activities, including the treatment of pain conditions, inflammatory conditions and gastrointestinal conditions.


In [14]:
# persist the dictionary with pickle, but only when no pickle exists yet
# (an existing file is never overwritten)
filename = 'data/nlp/dictionary.pkl'
dictionary_already_saved = os.path.exists(filename)
if not dictionary_already_saved:
    with open(filename, mode='wb') as handle:
        pickle.dump(dictionary, handle)