# 1. Input Parameters 

In [None]:
# Filter out words that appear in more than x% of all documents
extremes_no_above=0.70 # i.e. Filter out terms appearing in more than 70% of documents. Higher means fewer words removed.

# Filter out words that appear in fewer than n documents (this is a number, not %!)
extremes_no_below=10 # i.e. Filter out terms appearing in fewer than 10 documents. Lower means fewer words removed

# TF-IDF low value words removal threshold.
tfidf_low_value = 0.020 # Set higher to remove more words. You can see what words are removed below!

In [None]:
# Training parameters handed to the LDA model in section 5.
# Perplexity evaluation is switched off (None) to save time while training.
eval_every = None
# Number of sweeps over the whole corpus (the library default is a single pass).
passes = 40
iterations = 500
chunksize = 10000

In [None]:
## Set number seed for reproducibility
seed_number = 7

In [None]:
# For monitoring convergence
# https://stackoverflow.com/questions/37570696/how-to-monitor-convergence-of-gensim-lda-model
import logging
# Write INFO-level log records to the file gensim.log, each prefixed with timestamp and level.
# NOTE(review): basicConfig does nothing when the root logger already has handlers,
# which can be the case inside a running kernel — confirm that gensim.log is actually written.
logging.basicConfig(filename='gensim.log', format="%(asctime)s:%(levelname)s:%(message)s", level=logging.INFO)
logging.root.level = logging.INFO  # ipython sometimes messes up the logging setup; restore

# 2. Load libraries and data

In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup 
from gensim import corpora, models
from nltk.stem import WordNetLemmatizer
import glob
import nltk
import gensim
import string
from unicodedata import category

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

np.random.seed(seed_number)

In [None]:
df = pd.read_csv("full_lens_data.csv")

In [None]:
# Keep only the non-missing abstracts, as strings. dropna() keeps the original row labels,
# so the index of X may have gaps and must not be addressed with 0..n-1 positions.
X = df['Abstract'].dropna().astype(str)

## 2.1 Pre-processing

In [None]:
# Start with tokenization: every abstract becomes a list of NLTK word tokens
# NOTE(review): presumably requires the NLTK tokenizer data to be downloaded beforehand — confirm.
X_tokenized = X.apply(nltk.word_tokenize)

# Take a look at top tokens by frequency (all documents flattened into a single Series)
pd.Series(np.concatenate(X_tokenized.values)).value_counts().head(20)

In [None]:
# Find out the number of distinct tokens
len(pd.Series(np.concatenate(X_tokenized.values)).value_counts())

In [None]:
# Take a look at the default common English stopwords (used by the library)
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
sw = nltk.corpus.stopwords.words('english')
# sw

In [None]:
# Normalise the tokens of every document in a single pass:
#   1. convert to lower case,
#   2. drop single-character tokens,
#   3. drop English stopwords,
#   4. drop tokens that consist solely of punctuation characters,
#   5. drop a few other unneeded tokens (custom_sw).
# The earlier punctuation test (`word not in string.punctuation`) was a substring check
# against the punctuation string, so multi-character tokens such as '...' or '--' survived.
sw_set = set(sw)  # set lookup instead of scanning the stopword list for every token
punctuation_chars = set(string.punctuation)

# Remove other unneeded stuff
custom_sw = [u'\'\'', u'``', 'r']


def _clean_tokens(row):
    """Return the lower-cased tokens of `row` that pass all filters listed above."""
    cleaned = []
    for word in row:
        word = word.lower()
        if len(word) <= 1:
            continue
        if word in sw_set or word in custom_sw:
            continue
        if all(ch in punctuation_chars for ch in word):
            continue
        cleaned.append(word)
    return cleaned


X_tokenized = X_tokenized.apply(_clean_tokens)

In [None]:
# Take a look at top n words
pd.Series(np.concatenate(X_tokenized.values)).value_counts().head(30)

## 2.2 Lemmatization

In [None]:
# Lemmatize dataset: replace every token with its WordNet lemma
# (lemmatize is called without a part-of-speech argument)
wordnet_lemmatizer = WordNetLemmatizer()
X_tokenized = X_tokenized.apply(lambda x: [wordnet_lemmatizer.lemmatize(y) for y in x])

# Take a look at top 20 words
pd.Series(np.concatenate(X_tokenized.values)).value_counts().head(20)

# 3. Find bigrams and trigrams

In [None]:
# Compute bigrams and trigrams

from gensim.models.phrases import Phrases

# Train the phrase detectors (only phrases that appear extremes_no_below times or more are considered).
# The trigram model is trained on the bigram-transformed documents, so it can join an
# already detected bigram with a neighbouring token.
# NOTE(review): delimiter is passed as bytes (b'_'); newer gensim releases presumably expect a str — confirm against the installed version.
bigram = Phrases(X_tokenized, min_count=extremes_no_below, delimiter=b'_', threshold=6)
trigram = Phrases(bigram[X_tokenized.tolist()], min_count=extremes_no_below, delimiter=b'_', threshold=6)

In [None]:
# Uncomment to see what tokens are generated in the second document, index #1
# X_tokenized[1]

In [None]:
# Uncomment to see what tokens are generated in the second document, index #1
# bigram[X_tokenized[1]]

In [None]:
# Uncomment to see what tokens are generated in the second document, index #1
# trigram[bigram[X_tokenized[1]]]

In [None]:
# Collect, for every document, the bigrams and trigrams detected by the phrase models.
# X_tokenized_bitrigrams[i] belongs to the i-th document of X_tokenized (by position).
#
# Documents are iterated by value rather than looked up as X_tokenized[idx]: the Series
# keeps the row labels of the original dataframe, which have gaps once rows with a
# missing abstract were dropped, so a running counter is not a valid label.
X_tokenized_bitrigrams = []

for doc_tokens in X_tokenized:
    # Apply the bigram model once and reuse its output for the trigram model.
    bigram_tokens = bigram[doc_tokens]
    trigram_tokens = trigram[bigram_tokens]

    # One '_' marks a bigram, two mark a trigram.
    doc_phrases = [token for token in bigram_tokens if token.count('_') == 1]
    doc_phrases.extend(token for token in trigram_tokens if token.count('_') == 2)
    X_tokenized_bitrigrams.append(doc_phrases)

In [None]:
# Look at all the bigrams and trigrams (row = document #, columns = bigrams trigrams in that particular document)
pd.DataFrame(X_tokenized_bitrigrams).head(20)

In [None]:
# Append the bigrams and trigrams to the training set.
# Documents are paired by position: X_tokenized keeps the row labels of the original
# dataframe, which are not guaranteed to be 0..n-1, so a label lookup with a running
# counter could fail or hit the wrong document.
# NOTE: the token lists are extended in place; re-running this cell appends the phrases again.
for doc_tokens, doc_phrases in zip(X_tokenized, X_tokenized_bitrigrams):
    doc_tokens.extend(doc_phrases)

In [None]:
# Take a look at top n words
pd.Series(np.concatenate(X_tokenized.values)).value_counts().head(50)

In [None]:
# How many tokens do we have?
len(pd.Series(np.concatenate(X_tokenized.values)).value_counts())

# 4. Manual and extreme word filtering

In [None]:
# Generate dictionary before other manual and extremes filtering
dictionary = corpora.Dictionary(X_tokenized)
len(dictionary)

## 4.1 Manual filtering

In [None]:
# Placeholder value; it is overwritten by the assignment right below.
stoplist = set('')
# Words to be removed manually. The string is split on whitespace, so the line
# continuations and indentation inside it do not end up in the tokens.
stoplist = set("tissue invention engineering method methods preparation used present relates solution one said herein embodiment\
                adding comprises tissue_engineering  application present_invention also preparation_method provided first second \
                bone cornea corneal skin cartilage stent lung vascular heart cancer immune tissue-engineered muscle organ described_herein \
                connective dermal hair liver head spine cord producing end include derivative tissue_construct present_invention_relates \
                seed_cell bone_tissue_engineering present_invention_provides e.g. cardiac within prepare easy example sample patient utility_model \
                mean based surgical selected".split())
# Translate the stopwords that actually occur in the dictionary into ids and drop them.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)

In [None]:
# Take a look at the remaining words after manual filtering.
# The dictionary maps id -> token, so each item is an (id, token) pair. The loop variables
# are named accordingly and no longer overwrite the builtin `id` for the rest of the notebook.
for token_id, token in dictionary.items():
    print(token_id, token)

In [None]:
# generate corpus
corpus = [dictionary.doc2bow(document) for document in X_tokenized]

## 4.2 Filtering using TF-IDF model

In [None]:
# Filter out words with low TF-IDF values on a per-document level
tfidf = models.TfidfModel(corpus, id2word = dictionary)

# Weights below this threshold mark a term as low-value for the document at hand.
low_value = tfidf_low_value
# filtered_words[i] holds the ids of the low-value terms found in document i.
filtered_words = []

for i, bow in enumerate(corpus):
    # Score the document once. Terms that are missing from the result
    # (the words with TF-IDF score 0) are treated as removed as well.
    doc_weights = dict(tfidf[bow])
    low_value_words = [term_id for term_id, weight in doc_weights.items() if weight < low_value]
    low_value_lookup = set(low_value_words)  # constant-time membership test below

    # Keep a term only if it received a weight and that weight reaches the threshold.
    new_bow = [entry for entry in bow
               if entry[0] in doc_weights and entry[0] not in low_value_lookup]
    filtered_words.append(low_value_words)

    # reassign
    corpus[i] = new_bow

In [None]:
# Translate the filtered term ids of every document back into tokens.
# dictionary[term_id] is used instead of dictionary.id2token[term_id] because gensim fills
# id2token lazily, so the raw attribute is only complete after something else has
# triggered a lookup through the dictionary.
low_value_words = [[dictionary[term_id] for term_id in doc] for doc in filtered_words]
low_value_df = pd.Series(low_value_words)

In [None]:
# Show words removed by tf_idf 
# low_value_df.loc[low_value_df.str.len() > 0]

In [None]:
# Unique number of tokens to be removed
low_value_words_list = [item for sublist in low_value_words for item in sublist]
len(set(low_value_words_list))

In [None]:
set(low_value_words_list)

In [None]:
# Remove low value words in our dictionary.
# low_value_words_list is the flattened list over all documents, so a word flagged as
# low-value in any single document is removed from the dictionary as a whole.
low_value_ids = [dictionary.token2id[word] for word in low_value_words_list]
dictionary.filter_tokens(bad_ids=set(low_value_ids))

In [None]:
# Number of words remaining
len(dictionary)

## 4.3 Extremes filtering 

In [None]:
# Remove words appearing in fewer than n documents or in more than x% of the documents
# (n = extremes_no_below is an absolute count, x = extremes_no_above is a fraction).
# NOTE(review): filter_extremes also has a keep_n limit that is left at its default here — confirm that is intended.
dictionary.filter_extremes(no_below=extremes_no_below, no_above=extremes_no_above)

In [None]:
# Words remaining after all filtering.
# The dictionary maps id -> token, so each item is an (id, token) pair. The loop variables
# are named accordingly and no longer overwrite the builtin `id` for the rest of the notebook.
for token_id, token in dictionary.items():
    print(token_id, token)

In [None]:
# Recompute corpus with the extremes and low valued words filtered out.
# The bag-of-words is rebuilt from the token lists against the reduced dictionary, which
# replaces the per-document corpus that was edited in place during the TF-IDF step.
corpus = [dictionary.doc2bow(doc) for doc in X_tokenized]

In [None]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

# 5. Coherence scores 

In [None]:
# https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in range(start, limit, step).
    The training settings (seed_number, chunksize, iterations, passes, eval_every)
    are read from the notebook-level variables of the same names.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : Smallest number of topics to evaluate
    step : Increment between two successive topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    # The docstring has to be the first statement of the function; the progress
    # message therefore comes after it.
    print('Please wait...')
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        print('Evaluating # topics: ', num_topics)
        model = gensim.models.LdaMulticore(corpus=corpus, random_state=seed_number , id2word=dictionary, chunksize=chunksize, \
                               iterations=iterations, num_topics=num_topics, \
                               passes=passes, eval_every=eval_every)        
        model_list.append(model)
        coherencemodel = gensim.models.coherencemodel.CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Candidate topic counts are range(start, limit, step): 2, 3 and 4 topics, i.e. three models.
start = 2
limit = 5
step = 1
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=X_tokenized,
    start=start,
    limit=limit,
    step=step,
)

In [None]:
%%capture out_x
# Show graph
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
x = range(start, limit, step)
fig, ax = plt.subplots()
ax.plot(x, coherence_values)
ax.set_xlabel("Num Topics")
ax.set_ylabel("Coherence score")
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
plt.grid(b=True, which='both', color='0.85', linestyle='--')
plt.show()

In [None]:
import warnings
warnings.filterwarnings('ignore')
out_x.show()

# 6. LDA model  

In [None]:
# number of topics to be chosen based on the coherence value
# model_list is ordered like range(start, limit, step), so index 2 is the model trained
# with start + 2*step topics (4 with the values set above). Adjust the index whenever
# start, limit or step change.
ldamodel = model_list[2]

In [None]:
%%capture topics_bubble
# Build the interactive pyLDAvis view of the chosen model; the output is captured here
# and shown by the next cell.
# NOTE(review): newer pyLDAvis releases presumably expose this module under a different name — confirm against the installed version.
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=True, R=30)
pyLDAvis.display(lda_display)

In [None]:
topics_bubble.show()

# 7. Post Processing 

## 7.1 Topic distribution

In [None]:
# Topic distribution of a single document (position 4 in the corpus), as {topic id: probability}
dict(ldamodel.get_document_topics(corpus[4]))

In [None]:
# One {topic id: probability} dict per document, in corpus order
probabilities = [dict(ldamodel.get_document_topics(bow)) for bow in corpus]

In [None]:
df_prop = pd.DataFrame(probabilities)

In [None]:
# Initialize dataframe and rename the columns.
# A frame built from dicts orders its columns by the first appearance of each key, and a
# document's dict may lack some topic ids, so the columns are not guaranteed to be in
# topic-id order. Sort them first and derive every label from the actual topic id
# instead of the column position.
df_prop = pd.DataFrame(probabilities)
df_prop = df_prop.reindex(columns=sorted(df_prop.columns))
df_prop.columns = ['Topic {}'.format(topic_id + 1) for topic_id in df_prop.columns]

In [None]:
df_prop

In [None]:
df_subset = df[['Title', 'Applicants', 'Inventors', 'URL', 'Abstract']]

In [None]:
# df_prop has one row per document of X_tokenized, numbered 0..n-1, while df_subset keeps
# the row labels of the original dataframe. Give df_prop the labels of the documents it
# was computed from, so every row of probabilities is joined to its own abstract even
# when rows without an abstract were dropped earlier.
df_subset = df_subset.join(df_prop.set_index(X_tokenized.index))

In [None]:
df_subset.to_csv('Lens_Data_With_Topics.csv')

In [None]:
from IPython.display import FileLink

In [None]:
df_subset

In [None]:
FileLink('Lens_Data_With_Topics.csv')

## 7.2 Organ distribution

In [None]:
# List the organ-specific keywords and count number of patents focusing on each organ
organs = ['bone' ,
          'cartilage' ,
          'dental' ,
          'heart' ,
          'vascular' ,
          'cardiovascular' , 
          'adipose' ,
          'tendon' ,
          'ligament' ,
          'connective' ,
          'skin' ,
          'hair' ,
          'immune' ,
          'soft' ,
          'kidney' ,
          'bladder' ,
          'spine' ,
          'spinal_cord' ,
          'nerve' ,
          'lung' ,
          'liver' ,
          'pancreas' ,
          'reproductive' ,
          'muscles' ,
          'organoid' , 
          'ear' ,
          'eye' ,
          'cancer']

In [None]:
# Work on an independent copy: the per-organ columns added afterwards would otherwise be
# written into a slice of df, which triggers pandas' SettingWithCopyWarning.
df_organs = df[['Title', 'Applicants', 'Inventors', 'URL', 'Abstract']].copy()

In [None]:
# Flag, for every patent, whether its abstract mentions each organ keyword.
abstracts_lower = df_organs['Abstract'].str.lower()  # lower-case once, not once per keyword
for organ in organs:
    # Keywords may be written with '_' (e.g. 'spinal_cord') whereas the raw abstracts use
    # spaces: search for the space-separated form but keep the keyword as column name.
    search_term = organ.replace('_', ' ')
    # regex=False: plain substring search; na=False: a missing abstract counts as "no mention".
    df_organs[organ] = abstracts_lower.str.contains(search_term, regex=False, na=False)
# NOTE(review): substring matching also fires inside longer words (e.g. 'ear' in 'heart');
# consider word-boundary matching if exact keyword hits are required.

In [None]:
df_organs.to_csv('Lens_Data_With_Organs.csv')

In [None]:
FileLink('Lens_Data_With_Organs.csv')