In [1]:
!pip install contextualized-topic-models==2.5.0



## Import General Utility Libraries

In [2]:
import re
import urllib
import gzip
import io
import csv
import random
from collections import defaultdict
from tqdm import tqdm

Where to store the data file. If you want, you can adjust the path.

In [3]:
path_before_1990 = '/content/drive/My Drive/titles_before_1990.txt'
path_from_1990_to_2009 = '/content/drive/My Drive/titles_from_1990_to_2009.txt'
path_from_2010 = '/content/drive/My Drive/titles_from_2010.txt'

Execute the following cell only once to download the data and write it as a file to your google drive. Afterwards, skip this cell or comment it out.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

# to download the data manually or get more information, go to: https://dblp.org/faq/How+can+I+download+the+whole+dblp+dataset.html
url = 'https://dblp.uni-trier.de/xml/dblp.xml.gz'
# num_titles = 500000  # the (max)number of titles to load


def load_gzip_file(url):
    """Download Gzip-file."""
    response = urllib.request.urlopen(url)
    compressed_file = io.BytesIO(response.read())
    decompressed_file = gzip.GzipFile(fileobj=compressed_file)
    return decompressed_file

def extract_titles(input_file, max_num=40000):
    """Extract title and publication year of dblp papers, given as input file.

    Divide the papers into 3 time periods.

    Collect max max_num papers per time period.
    """
    pairs_before_1990 = []
    count_before_1990 = 0
    pairs_from_1990_to_2009 = []
    count_from_1990_to_2009 = 0
    pairs_from_2010 = []
    count_from_2010 = 0
    got_title = False
    for line in tqdm(input_file):
        line_str = line.decode('utf-8')
        if got_title:
            # we have a title and check for the corresponding year
            year_result = re.search(r'<year>(.*)</year>', line_str)
            if year_result:
                # we also have the year and thus save the title-year pair
                year = int(year_result.group(1))
                if year < 1990:
                    pairs_before_1990.append((title, year))
                    count_before_1990 += 1
                elif year < 2010:
                    pairs_from_1990_to_2009.append((title, year))
                    count_from_1990_to_2009 += 1
                else:
                    pairs_from_2010.append((title, year))
                    count_from_2010 += 1
                got_title = False
        else:
            # we have no title and search for title
            result = re.search(r'<title>(.*)</title>', line_str)
            if result:
                title = result.group(1)
                if len(title.split(' ')) < 3:
                    # only include titles with at least four words
                    continue
                got_title = True

        if count_before_1990 >= max_num and count_from_1990_to_2009 >= max_num and count_from_2010 >= max_num:
            return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

    return pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010

def save_data(pairs, file_path):
    with open(file_path, 'w') as fout:
        writer = csv.writer(fout)
        for pair in pairs:
            writer.writerow(pair)

in_file = load_gzip_file(url)
pairs_before_1990, pairs_from_1990_to_2009, pairs_from_2010 = extract_titles(in_file)
save_data(pairs_before_1990, path_before_1990)
save_data(pairs_from_1990_to_2009, path_from_1990_to_2009)
save_data(pairs_from_2010, path_from_2010)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


16432737it [00:36, 455925.51it/s]


Mount your google drive (in case it is not yet mounted) so that the newly created files are available.

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# LDA

In [6]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

num_lda_topics = 5

### Before the 1990s:

In [7]:
with open(path_before_1990) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

Let's perform some simple preprocessing:

In [16]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = text.lower()
    return text

prepro_titles = [preprocess_text(title) for title in titles]

In [9]:
prepro_titles[:10]

['object model capabilities for distributed object management',
 'distributed object management technology',
 'muffin a distributed database machine',
 'algebraical optimization of ftaexpressions',
 'wissensrepraumlsentation und maschinelles lernen',
 'an algebraic characterization of stuf',
 'zur systemarchitektur von lilog',
 'mengenorientierte auswertung von anfragen in der logikprogrammiersprache prolog',
 'definite resolution over constraint languages',
 'dokumentation der syntax der liloggrammatik']

Now we turn the documents (or titles in this case) into a matrix feature representation.

In [10]:
num_features = 10000
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=num_features, stop_words='english')
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [11]:
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)

In [12]:
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: systems design control linear using model recognition distributed memory digital network time
Topic 1: analysis algorithms parallel models data sets der von solution detection und review
Topic 2: computer logic theory programming software simulation computing structure based nonlinear complexity modal
Topic 3: algorithm networks note method problem application functions circuits languages set techniques machine
Topic 4: information new problems sequential optimal graphs machines finite binary number chemical structures


Topics:
0. Graph/networks algorithms (seems to be mostly about algorithms that (maybe) operate on graphs/networks)
1. pattern recognition (and maybe robotics)
2. ...

### From 1990 to 2009:

Add your code for topic modelling the period from 1990 to 2009 here...

In [13]:
with open(path_from_1990_to_2009) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

In [14]:
prepro_titles = [preprocess_text(title) for title in titles]

In [15]:
prepro_titles[:10]

['an evaluation of objectoriented dbms developments  edition',
 'darwin on the incremental migration of legacy information systems',
 'integrating heterogeneous autonomous distributed applications using the dom prototype',
 'integrating objectoriented applications and middleware with relational databases',
 'towards a transaction management system for dom',
 'a risc object model for object system interoperation concepts and applications',
 'metaobject protocol concepts for a risc object model',
 'object data language facilities for multimedia data types',
 'object data model facilities for multimedia data types',
 'experiments with dispatching in a distributed object system']

In [16]:
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [17]:
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)

In [18]:
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: based information new network systems model estimation modeling time approach fuzzy image
Topic 1: design method theory computing identification structure case digital application sets implementation search
Topic 2: using systems control analysis networks linear nonlinear algorithm adaptive models problem optimal
Topic 3: data model neural detection software learning development knowledge power codes prediction set
Topic 4: study dynamic graphs management scheme systems programming logic realtime space tracking properties




1.   Information Theory (Words like information, networks, modeling)
2.   Algorithm Design (Words like theory, computing, implementation)
3.   Linear and non-linear modelling of data
4.   Neural Networks/Deep Learning (Words like prediction, learning, neural)
5.   Dynamic Programming (Words like dynamic, raeltime, tracking)



### From 2010 onwards:

Add your code for topic modelling the period from 2010 onwards here...

In [19]:
with open(path_from_2010) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

In [20]:
prepro_titles = [preprocess_text(title) for title in titles]

In [21]:
prepro_titles[:10]

['spectre attacks exploiting speculative execution',
 ' jahre studiengang informatik an der rwth',
 'computer science curricula ',
 'differences in productivity and impact across the different computer science subareas',
 'schloss dagstuhl  jahresbericht  annual report ',
 'schloss dagstuhl  jahresbericht  annual report ',
 'schloss dagstuhl  jahresbericht  annual report ',
 'schloss dagstuhl  jahresbericht  annual report ',
 'schloss dagstuhl  jahresbericht  annual report ',
 'schloss dagstuhl  jahresbericht  annual report ']

In [22]:
tf = tf_vectorizer.fit_transform(prepro_titles)
tf_feature_names = tf_vectorizer.get_feature_names_out()

In [23]:
lda = LatentDirichletAllocation(n_components=num_lda_topics, max_iter=5, learning_method='online', random_state=42).fit(tf)

In [24]:
for topic_idx, topic in enumerate(lda.components_):
    print(f'Topic {topic_idx}:', end=' ')
    print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-12 - 1:-1]]))

Topic 0: using networks model systems network algorithm detection neural efficient performance wireless time
Topic 1: optimization image application equations applications methods hybrid new identification smart digital sensing
Topic 2: based study information deep framework mobile classification prediction problem multiple management approach
Topic 3: control learning nonlinear estimation linear design distributed robust optimal power problems approach
Topic 4: analysis data method systems adaptive dynamic energy recognition finite selection images graphs




1.   Neural Networks and Algorithms
2.   Image Classification and Analysis
3.   Deep Learning and Classification
4.   Algorithm Design
5.   Incoherent



# Combined Topic Models

Method developed by [Bianchi et al. 2021](https://aclanthology.org/2021.acl-short.96/).

[A 6min presentation of the paper by one of the authors.](https://underline.io/lecture/25716-pre-training-is-a-hot-topic-contextualized-document-embeddings-improve-topic-coherence)

Code: [https://github.com/MilaNLProc/contextualized-topic-models](https://github.com/MilaNLProc/contextualized-topic-models)

Tutorial: [https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing](https://colab.research.google.com/drive/1fXJjr_rwqvpp1IdNQ4dxqN4Dp88cxO97?usp=sharing)

Again, perform topic modelling for the three time periods - this time using the combined topic models (CTMs).

You can use and adapt the code from the tutorial linked above.

Use the available GPU for faster running times.

In [6]:
from contextualized_topic_models.models.ctm import CombinedTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords

num_ctm_topics = 5  # you can also choose a higher number of topics

In [7]:
import nltk
from nltk.corpus import stopwords as stop_words

nltk.download('stopwords')

stopwords = list(stop_words.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Before the 1990s:

In [6]:
with open(path_before_1990) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

In [7]:
titles[0:5]

['Object Model Capabilities For Distributed Object Management.',
 'Distributed Object Management Technology.',
 'Muffin: A Distributed Database Machine',
 'Algebraical Optimization of FTA-Expressions',
 'Wissensrepr&auml;sentation und Maschinelles Lernen']

In [12]:
sp = WhiteSpacePreprocessingStopwords(titles, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

In [13]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/198 [00:00<?, ?it/s]

In [14]:
tp.vocab[:10]

array(['aacute', 'abelian', 'absolute', 'abstract', 'abstraction',
       'abstracts', 'academic', 'acceptance', 'access', 'accuracy'],
      dtype=object)

In [15]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=20, num_epochs=10)
ctm.fit(training_dataset) # run the model

Epoch: [10/10]	 Seen Samples: [394880/395280]	Train Loss: 39.711509172881634	Time: 0:00:11.214372: : 10it [02:03, 12.36s/it]
100%|██████████| 618/618 [00:10<00:00, 59.46it/s]


In [17]:
ctm.get_topic_lists(5)[0:5]

[['digital', 'fault', 'analysis', 'design', 'error'],
 ['note', 'problems', 'technical', 'problem', 'linear'],
 ['network', 'communications', 'memory', 'digital', 'communication'],
 ['systems', 'model', 'decision', 'distributed', 'control'],
 ['code', 'probability', 'random', 'surface', 'generator']]

1. Digital Design
2. Incoherent (Since, we could not make any conclusive topic out of the output words)
3. Communication Networks
4. Control Systems
5. Incoherent (Since, we could not make any conclusive topic out of the output words)

### From 1990 to 2009

In [8]:
with open(path_from_1990_to_2009) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

In [9]:
titles[0:5]

['An Evaluation of Object-Oriented DBMS Developments: 1994 Edition.',
 'DARWIN: On the Incremental Migration of Legacy Information Systems',
 'Integrating Heterogeneous, Autonomous, Distributed Applications Using the DOM Prototype.',
 'Integrating Object-Oriented Applications and Middleware with Relational Databases.',
 'Towards a Transaction Management System for DOM.']

In [13]:
sp = WhiteSpacePreprocessingStopwords(titles, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

In [14]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Batches:   0%|          | 0/1638 [00:00<?, ?it/s]

In [15]:
tp.vocab[:10]

array(['aacute', 'ab', 'absolute', 'abstract', 'abstraction', 'ac',
       'academic', 'acceptance', 'access', 'accuracy'], dtype=object)

In [16]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=20, num_epochs=10)
ctm.fit(training_dataset) # run the model

Epoch: [10/10]	 Seen Samples: [3274240/3274720]	Train Loss: 45.242344110621616	Time: 0:01:39.183410: : 10it [16:34, 99.49s/it]
100%|██████████| 5117/5117 [01:26<00:00, 59.26it/s]


In [17]:
ctm.get_topic_lists(5)[0:5]

[['power', 'low', 'high', 'phase', 'circuit'],
 ['models', 'model', 'markov', 'distribution', 'estimation'],
 ['uuml', 'der', 'und', 'de', 'von'],
 ['service', 'web', 'services', 'environments', 'management'],
 ['image', 'recognition', 'images', 'detection', 'segmentation']]

1. Power Modeling
2. Markov Models
3. Incoherent (Since, we could not make any conclusive topic out of the output words)
4. Web Services and Management
5. Image Recognition and Analysis

### From 2010 onwards

In [8]:
with open(path_from_2010) as fin:
    reader = csv.reader(fin)
    titles = [row[0] for row in reader]

In [9]:
titles[0:5]

['Spectre Attacks: Exploiting Speculative Execution.',
 '50 Jahre Studiengang Informatik an der RWTH',
 'Computer Science Curricula 2013',
 'Differences in productivity and impact across the different computer science subareas.',
 'Schloss Dagstuhl - Jahresbericht / Annual Report 2013']

In [10]:
sp = WhiteSpacePreprocessingStopwords(titles, stopwords_list=stopwords)
preprocessed_documents, unpreprocessed_corpus, vocab, retained_indices = sp.preprocess()

In [11]:
tp = TopicModelDataPreparation("all-mpnet-base-v2")

training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)



Batches:   0%|          | 0/4648 [00:00<?, ?it/s]

In [12]:
tp.vocab[:10]

  and should_run_async(code)


array(['aacute', 'abstract', 'ac', 'academic', 'accelerated',
       'accelerating', 'acceleration', 'acceptance', 'access', 'accuracy'],
      dtype=object)

In [None]:
ctm = CombinedTM(bow_size=len(tp.vocab), contextual_size=768, n_components=20, num_epochs=1)
ctm.fit(training_dataset) # run the model

  and should_run_async(code)
Epoch: [1/1]	 Seen Samples: [929472/929481]	Train Loss: 55.991646483311726	Time: 0:04:50.696124: : 1it [04:50, 290.70s/it]

  0%|          | 0/14524 [00:00<?, ?it/s][A
  0%|          | 1/14524 [00:00<1:25:38,  2.83it/s][A
  0%|          | 11/14524 [00:00<08:10, 29.59it/s] [A
  0%|          | 18/14524 [00:00<05:58, 40.43it/s][A
  0%|          | 25/14524 [00:00<05:07, 47.11it/s][A
  0%|          | 32/14524 [00:00<04:44, 50.87it/s][A
  0%|          | 40/14524 [00:00<04:17, 56.22it/s][A
  0%|          | 47/14524 [00:01<04:06, 58.64it/s][A
  0%|          | 55/14524 [00:01<04:00, 60.27it/s][A
  0%|          | 62/14524 [00:01<04:01, 59.83it/s][A
  0%|          | 69/14524 [00:01<03:54, 61.61it/s][A
  1%|          | 76/14524 [00:01<03:49, 62.99it/s][A
  1%|          | 83/14524 [00:01<03:55, 61.36it/s][A
  1%|          | 90/14524 [00:01<03:48, 63.03it/s][A
  1%|          | 97/14524 [00:01<03:53, 61.82it/s][A
  1%|          | 105/14524 [00:01<03:45, 63.

In [None]:
ctm.get_topic_lists(5)[0:5]