In [1]:
!pip install contextualized-topic-models==2.2.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import General Utility Libraries 

In [2]:
import re
import urllib
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm

Where to store the data file. If you want, you can adjust the path.

In [3]:
path_before_1990 = '/content/drive/My Drive/titles_before_1990.txt'
path_from_1990_to_2009 = '/content/drive/My Drive/titles_from_1990_to_2009.txt'
path_from_2010 = '/content/drive/My Drive/titles_from_2010.txt'

Execute the following cell only once to download the data and write it as a file to your google drive. Afterwards, skip this cell or comment it out.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# to download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# num_titles = 500000  # the (max)number of titles to load 


def load_gzip_file(url):
    """Download Gzip-file."""
    response = urllib.request.urlopen(url)
    compressed_file = io.BytesIO(response.read())
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    return decompressed_file

def extract_titles(input_file, max_num=40000):
    """Extract title and publication year of dblp papers, given as input file.
    
    Divide the papers into 3 time periods. 
    
    Collect max max_num papers per time period.
    """
    pairs_before_1990 = []
    count_before_1990 = 0
    pairs_from_1990_to_2009 = []
    count_from_1990_to_2009 = 0
    pairs_from_2010 = []
    count_from_2010 = 0
    got_title = False
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if got_title: 
            # we have a title and check for the corresponding year
            year_result = re.search(r'<year>(.*)</year>', line_str)
            if year_result:
                # we also have the year and thus save the title-year pair
                year = int(year_result.group(1))
                if year < 1990:
                    pairs_before_1990.append((title, year))
                    count_before_1990 += 1
                elif year < 2010:
                    pairs_from_1990_to_2009.append((title, year))
                    count_from_1990_to_2009 += 1
                else:
                    pairs_from_2010.append((title, year))
                    count_from_2010 += 1
                got_title = False
        else:
            # we have no title and search for title
            result = re.search(r'<title>(.*)</title>', line_str)
            if result:
                title = result.group(1)
                if len(title.split(' ')) < 3:  
                    # only include titles with at least four words
                    continue
                got_title = True
        
        if count_before_1990 >= max_num and count_from_1990_to_2009 >= max_num and count_from_2010 >= max_num:
            return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010
    
    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    with open(file_path, 'w') as fout:
        writer = csv.writer(fout)
        for pair in pairs:
            writer.writerow(pair)

in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
save_data(pairs_before_1990, path_before_1990)
save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
save_data(pairs_from_2010, path_from_2010)

Mounted at /content/drive


14922037it [00:33, 441349.44it/s]


Mount your google drive (in case it is not yet mounted) so that the newly created files are available.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# LDA

In [5]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

num_lda_topics = 5

### Before the 1990s:

In [None]:
with open(path_before_1990) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

Let's perform some simple preprocessing:

In [None]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

prepro_titles = [preprocess_text(title) for title in titles]

In [None]:
prepro_titles[:10]

['object model capabilities for distributed object management',
 'distributed object management technology',
 'muffin a distributed database machine',
 'algebraical optimization of ftaexpressions',
 'wissensrepraumlsentation und maschinelles lernen',
 'an algebraic characterization of stuf',
 'zur systemarchitektur von lilog',
 'mengenorientierte auswertung von anfragen in der logikprogrammiersprache prolog',
 'definite resolution over constraint languages',
 'dokumentation der syntax der liloggrammatik']

Now we turn the documents (or titles in this case) into a matrix feature representation.

In [None]:
num_features = 10000
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [None]:
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)

In [None]:
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: design using digital circuits applications der number implementation von software und class
Topic 1: algorithm problem method sequential recognition time machines dynamic pattern computing solution use
Topic 2: data networks theory systems approach distributed programming graphs model language chemical structure
Topic 3: note logic functions network algorithms application memory sets models languages machine development
Topic 4: systems computer control analysis information linear new problems parallel optimal finite performance


Topics:
0. Graph/networks algorithms (seems to be mostly about algorithms that (maybe) operate on graphs/networks)
1. pattern recognition (and maybe robotics)
2. ...

### From 1990 to 2009:

Add your code for topic modelling the period from 1990 to 2009 here...

In [None]:
with open(path_from_1990_to_2009) as fin:
    reader = csv.reader(fin)
    titles1 = [row[0] for row in reader]

Preprocessing

In [None]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

prepro_titles1 = [preprocess_text(title) for title in titles1]

In [None]:
prepro_titles1[:10]

['an evaluation of objectoriented dbms developments  edition',
 'darwin on the incremental migration of legacy information systems',
 'integrating heterogeneous autonomous distributed applications using the dom prototype',
 'integrating objectoriented applications and middleware with relational databases',
 'towards a transaction management system for dom',
 'a risc object model for object system interoperation concepts and applications',
 'metaobject protocol concepts for a risc object model',
 'object data language facilities for multimedia data types',
 'object data model facilities for multimedia data types',
 'experiments with dispatching in a distributed object system']

Turn titles into a matrix feature representation

In [None]:
num_features = 10000
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
tf1 = tf_vectorizer.fit_transform(prepro_titles1)
tf_feature_names1 = tf_vectorizer.get_feature_names_out()

In [None]:
lda1 = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf1)

In [None]:
for topic_idx1, topic1 in enumerate(lda1.components_):
    print(f'Topic {topic_idx1}:', end=' ')
    print(' '.join([tf_feature_names1[i] for i in topic1.argsort()[:-12 - 1:-1]]))

Topic 0: systems control model nonlinear models using methods equations approach learning time parallel
Topic 1: using algorithms dynamic modeling graphs fuzzy functions framework communication images image state
Topic 2: design networks systems application neural using management detection mobile distributed software development
Topic 3: analysis data adaptive study problem estimation networks stability wireless performance identification web
Topic 4: method based algorithm information linear new network robust efficient evaluation problems multiple


### From 2010 onwards:

Add your code for topic modelling the period from 2010 onwards here...

In [6]:
with open(path_from_2010) as fin:
    reader = csv.reader(fin)
    titles2 = [row[0] for row in reader]

Preprocessing

In [7]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

prepro_titles2 = [preprocess_text(title) for title in titles2]

In [8]:
prepro_titles2[:10]

['spectre attacks exploiting speculative execution',
 'computer science curricula ',
 'differences in productivity and impact across the different computer science subareas',
 'klaus tschira stiftung gemeinnuumltzige gmbh kts',
 'catchment classification by runoff behaviour with selforganizing maps som',
 'analysis of projected hydrological behavior of catchments based on signature indices',
 'ear shape for biometric identification',
 'multithreaded implementation for cryptography and cryptanalysis',
 'privacypreserving authentication in wireless access networks',
 'private key cryptosystem']

Turn titles into a matrix feature representation

In [9]:
num_features = 10000
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
tf2 = tf_vectorizer.fit_transform(prepro_titles2)
tf_feature_names2 = tf_vectorizer.get_feature_names_out()

In [10]:
lda2 = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf2)

In [11]:
for topic_idx2, topic2 in enumerate(lda2.components_):
    print(f'Topic {topic_idx2}:', end=' ')
    print(' '.join([tf_feature_names2[i] for i in topic2.argsort()[:-12 - 1:-1]]))

Topic 0: based adaptive optimization analysis image information detection application mobile power applications framework
Topic 1: using model network systems nonlinear estimation linear study neural performance equations deep
Topic 2: systems algorithm design based models distributed time problems energy efficient computing equation
Topic 3: data analysis dynamic sensor novel social dynamics communication selection approach graphs functions
Topic 4: control networks method learning wireless optimal data stability online tracking machine human


# Combined Topic Models

New method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/). 

[A 6min presentation of the paper by one of the authors.](https://underline.io/lecture/25716-pre-training-is-a-hot-topic-contextualized-document-embeddings-improve-topic-coherence)

Code: [https://github.com/MilaNLProc/contextualized-topic-models](https://github.com/MilaNLProc/contextualized-topic-models)

Tutorial: [https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing)

Again, perform topic modelling for the three time periods - this time using the combined topic models (CTMs). 

You can use and adapt the code from the tutorial linked above.

Use the available GPU for faster running times.

In [None]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
import nltk
num_ctm_topics = 5  # you can also choose a higher number of topics

### Before the 1990s:

In [None]:
with open(path_before_1990) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

Preprocessing

In [None]:
from nltk.corpus import stopwords as stop_words

nltk.download('stopwords')

#documents = [line.strip() for line in open(titles, encoding="utf-8").readlines()]

stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessing(titles, stopwords_language="english",vocabulary_size=2000)

preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
print(preprocessed_documents[:2])

['object model capabilities distributed object management', 'distributed object management technology']


In [None]:
print(unpreprocessed_corpus[:2])

['Object Model Capabilities For Distributed Object Management.', 'Distributed Object Management Technology.']


In [None]:
print(vocab[:10])

['computerized', 'combinatorial', 'tables', 'implicational', 'centers', 'definition', 'family', 'necessary', 'survey', 'health']


In [None]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/198 [00:00<?, ?it/s]



In [None]:
tp.vocab[:10]

['aacute',
 'abelian',
 'absolute',
 'abstract',
 'abstraction',
 'abstracts',
 'academic',
 'acceptance',
 'access',
 'accuracy']

Training Combined TM

In [None]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=20, num_epochs=10)
ctm.fit(training_dataset) # run the model

Epoch: [10/10]	 Seen Samples: [395240/395240]	Train Loss: 39.51440046094422	Time: 0:00:04.827666: : 10it [00:46,  4.67s/it]


Topics

In [None]:
ctm.get_topic_lists(5)

[['computer', 'review', 'book', 'report', 'science'],
 ['algorithm', 'algorithms', 'parallel', 'efficient', 'fast'],
 ['recognition', 'pattern', 'image', 'processing', 'using'],
 ['relations', 'algebras', 'characterization', 'choice', 'cardinals'],
 ['functions', 'graphs', 'automata', 'binary', 'trees'],
 ['von', 'uuml', 'der', 'de', 'auml'],
 ['systems', 'decision', 'software', 'management', 'system'],
 ['systems', 'control', 'time', 'distributed', 'model'],
 ['circuits', 'digital', 'sequential', 'design', 'fault'],
 ['method', 'equations', 'using', 'linear', 'analysis'],
 ['optimal', 'time', 'control', 'stochastic', 'queue'],
 ['pascal', 'compiler', 'cipher', 'modula', 'list'],
 ['note', 'problem', 'technical', 'problems', 'solution'],
 ['ai', 'editor', 'operations', 'automation', 'letter'],
 ['networks', 'network', 'performance', 'local', 'communication'],
 ['information', 'chemical', 'data', 'structure', 'abstracts'],
 ['logic', 'logics', 'modal', 'symbolic', 'meeting'],
 ['theory'

Draw

In [None]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

Sampling: [10/10]: : 10it [00:27,  2.72s/it]


In [None]:
%%capture
!pip install pyldavis

In [None]:
import pyLDAvis as vis

lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

  from collections import Iterable
  from collections import Mapping
Sampling: [10/10]: : 10it [00:29,  2.91s/it]
  default_term_info = default_term_info.sort_values(


Topic Predictions

In [None]:
topics_predictions = ctm.get_thetas(training_dataset, n_samples=5) # get all the topic predictions

Sampling: [5/5]: : 5it [00:14,  2.83s/it]


In [None]:
preprocessed_documents[0] # see the text of our preprocessed document

'object model capabilities distributed object management'

In [None]:
import numpy as np
topic_number = np.argmax(topics_predictions[0]) # get the topic id of the first document

In [None]:
topic_number

19

In [None]:
ctm.get_topic_lists(5)[15]

['information', 'chemical', 'data', 'structure', 'abstracts']

In [None]:
ctm.get_topic_lists(5)[topic_number] #and the topic should be about natural location/places/related things

['system', 'data', 'based', 'design', 'knowledge']

### From 1990 to 2009

In [None]:
with open(path_from_1990_to_2009) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

Preprocessing

In [None]:
from nltk.corpus import stopwords as stop_words

nltk.download('stopwords')

#documents = [line.strip() for line in open(titles, encoding="utf-8").readlines()]

stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessing(titles, stopwords_language="english",vocabulary_size=2000)

preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
preprocessed_documents[:2]

['evaluation object oriented developments',
 'incremental migration information systems']

In [None]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/1634 [00:00<?, ?it/s]

In [None]:
tp.vocab[:10]

['aacute',
 'ab',
 'absolute',
 'abstract',
 'abstraction',
 'ac',
 'academic',
 'acceptance',
 'access',
 'accuracy']

Train CTM

In [None]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=20, num_epochs=10)
ctm.fit(training_dataset) # run the model

Epoch: [10/10]	 Seen Samples: [3267180/3267180]	Train Loss: 45.15349863518338	Time: 0:00:34.519010: : 10it [05:51, 35.18s/it]


Topics

In [None]:
ctm.get_topic_lists(5)

[['control', 'feedback', 'robust', 'robot', 'adaptive'],
 ['problem', 'optimization', 'algorithms', 'scheduling', 'genetic'],
 ['information', 'case', 'use', 'knowledge', 'management'],
 ['sup', 'sub', 'time', 'linear', 'state'],
 ['special', 'review', 'introduction', 'issue', 'editorial'],
 ['data', 'analysis', 'mining', 'gene', 'classification'],
 ['order', 'method', 'solution', 'equations', 'differential'],
 ['fast', 'reconstruction', 'transform', 'dimensional', 'image'],
 ['observations', 'measurements', 'characteristics', 'ground', 'ocean'],
 ['automata', 'theory', 'languages', 'logic', 'semantics'],
 ['graphs', 'number', 'random', 'trees', 'graph'],
 ['networks', 'sensor', 'wireless', 'routing', 'protocol'],
 ['using', 'recognition', 'detection', 'neural', 'face'],
 ['power', 'high', 'low', 'circuit', 'current'],
 ['channel', 'estimation', 'frequency', 'channels', 'performance'],
 ['web', 'service', 'services', 'multimedia', 'internet'],
 ['oriented', 'real', 'distributed', 'obje

Draw

In [None]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

Sampling: [10/10]: : 10it [03:06, 18.66s/it]


In [None]:
import pyLDAvis as vis

lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

Sampling: [10/10]: : 10it [03:07, 18.74s/it]
  default_term_info = default_term_info.sort_values(


Topics predictions

In [None]:
topics_predictions = ctm.get_thetas(training_dataset, n_samples=5) # get all the topic predictions

Sampling: [5/5]: : 5it [01:34, 18.84s/it]


In [None]:
preprocessed_documents[0] # see the text of our preprocessed document

'evaluation object oriented developments'

In [None]:
import numpy as np
topic_number = np.argmax(topics_predictions[0]) # get the topic id of the first document

In [None]:
topic_number

16

In [None]:
ctm.get_topic_lists(5)[15]

['web', 'service', 'services', 'multimedia', 'internet']

In [None]:
ctm.get_topic_lists(5)[topic_number] #and the topic should be about natural location/places/related things

['oriented', 'real', 'distributed', 'object', 'system']

### From 2010 onwards

In [None]:
with open(path_from_2010) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

Preprocessing

In [None]:
from nltk.corpus import stopwords as stop_words

nltk.download('stopwords')

#documents = [line.strip() for line in open(titles, encoding="utf-8").readlines()]

stopwords = list(stop_words.words("english"))

sp = WhiteSpacePreprocessing(titles, stopwords_language="english",vocabulary_size=2000)

preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
preprocessed_documents[:2]

['attacks exploiting execution', 'computer science']

In [None]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/4104 [00:00<?, ?it/s]

In [None]:
tp.vocab[:10]

['aacute',
 'abstract',
 'ac',
 'academic',
 'accelerated',
 'accelerating',
 'acceleration',
 'acceptance',
 'access',
 'accuracy']

Train CTM

In [None]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=20, num_epochs=10)
ctm.fit(training_dataset) # run the model

Epoch: [10/10]	 Seen Samples: [8206920/8206920]	Train Loss: 54.05124197405827	Time: 0:01:27.026698: : 10it [14:31, 87.17s/it]


Topics

In [None]:
ctm.get_topic_lists(5)

[['tracking', 'control', 'autonomous', 'robot', 'adaptive'],
 ['computing', 'cloud', 'internet', 'smart', 'things'],
 ['sup', 'estimation', 'array', 'frequency', 'low'],
 ['power', 'current', 'circuit', 'permanent', 'synchronous'],
 ['learning', 'machine', 'deep', 'reinforcement', 'vector'],
 ['functional', 'brain', 'molecular', 'connectivity', 'dynamics'],
 ['drone', 'cascade', 'adjustment', 'overview', 'employing'],
 ['neural', 'network', 'real', 'artificial', 'convolutional'],
 ['optimization', 'problem', 'objective', 'multiobjective', 'evolutionary'],
 ['segmentation', 'representation', 'local', 'via', 'face'],
 ['satellite', 'forest', 'land', 'mapping', 'sentinel'],
 ['decision', 'fuzzy', 'making', 'group', 'model'],
 ['development', 'software', 'case', 'chain', 'engineering'],
 ['differential', 'order', 'dimensional', 'solutions', 'boundary'],
 ['online', 'social', 'media', 'perspective', 'use'],
 ['time', 'discrete', 'stability', 'linear', 'delay'],
 ['number', 'graphs', 'degree

Draw

In [None]:
lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

Sampling: [10/10]: : 10it [07:46, 46.65s/it]


In [None]:
import pyLDAvis as vis

lda_vis_data = ctm.get_ldavis_data_format(tp.vocab, training_dataset, n_samples=10)

ctm_pd = vis.prepare(**lda_vis_data)
vis.display(ctm_pd)

Sampling: [10/10]: : 10it [07:48, 46.82s/it]
  default_term_info = default_term_info.sort_values(


Topics predictions

In [None]:
topics_predictions = ctm.get_thetas(training_dataset, n_samples=5) # get all the topic predictions

In [None]:
preprocessed_documents[0] # see the text of our preprocessed document

'attacks exploiting execution'

In [None]:
import numpy as np
topic_number = np.argmax(topics_predictions[0]) # get the topic id of the first document

In [None]:
topic_number

6

In [None]:
ctm.get_topic_lists(5)[15]

['time', 'discrete', 'stability', 'linear', 'delay']

In [None]:
ctm.get_topic_lists(5)[topic_number] #and the topic should be about natural location/places/related things

['drone', 'cascade', 'adjustment', 'overview', 'employing']