# Import

We start by importing the necessary libraries.

In [None]:
import os
import time

import numpy as np
import pandas as pd
from gensim import corpora, utils
from gensim.models.wrappers import DtmModel

# Loading the data/models
We first load the corpus from the preprocessing step.

In [None]:
# working directory that holds the corpus files; note the trailing slash -
# later cells build file paths from it by string concatenation
wd = 'Corpora/Deterrence/'

In [None]:
# load the corpus produced by the preprocessing step
# (os.path.join is used because wd already ends with '/', so wd + '/Topic modeling/...'
# produced a doubled slash and disagreed with how the export path is built later on)
corpus_path = os.path.join(wd, 'Topic modeling', '200324_deter_evp_preprocessed.json')
data = pd.read_json(corpus_path, orient = 'records', convert_dates = False)
# parse the rough publication dates; values that cannot be parsed become NaT
data['date_rough'] = pd.to_datetime(data['date_rough'], errors = 'coerce', infer_datetime_format=True)
# preview the first records
data.head(3)

Unnamed: 0,author,database,date,doi,filename,fulltext,place,pubtitle,title,url,date_rough
0,V. OLEVSKII,UDB_MIL,January 2010,,,"[интерес, решение, дать, задача, определить, ф...","Moscow,\n ...",Zarubezhnoe voennoe obozrenie,RAZRABOTKA NOVOI STRATEGIChESKOI KONTsEPTsII NATO,https://dlib.eastview.com/browse/doc/21371182,2010-01-01 00:00:00+00:00
1,V. OLEVSKII,UDB_MIL,January 2010,,,"[время, применение, войско, сила, рассматриват...","Moscow,\n ...",Zarubezhnoe voennoe obozrenie,RAZRABOTKA NOVOI STRATEGIChESKOI KONTsEPTsII NATO,https://dlib.eastview.com/browse/doc/21371182,2010-01-01 00:00:00+00:00
2,VLADIMIR BABKIN,UDB_EDU,2010,,,"[итог, получаться, главный, особенность, инфор...","Moscow,\n ...",Svobodnaia mysl',Nauka dlia zhizni ili zhizn' dlia nauki?,https://dlib.eastview.com/browse/doc/21891872,2010-01-01 00:00:00+00:00


In [None]:
# count the gaps in the two columns the later steps rely on
print(data['date_rough'].isna().sum())  # records without a parsable date
print(data['database'].isna().sum())  # records without a source-database marker

352
253


In [None]:
# order the records by source database (EDU vs MIL), records without a marker first -
# we might need it later; then renumber the rows from 0 (we'll do this after every sort)
data = data.sort_values('database', na_position='first').reset_index(drop=True)

If we need to analyse deterrence without breaking the corpus into time slices, we'll use the LDA model. 

In the cell below, we are loading three pre-fitted LDA models.

In [None]:
# LDA models
# load the three pre-fitted LDA models from disk

from gensim.test.utils import datapath
from gensim.models import LdaMulticore

# one saved model per training run
model_files = (
    '/Projects/Deterrence/Topic modeling/models/200406_1/lda.model',
    '/Projects/Deterrence/Topic modeling/models/200406_2/lda.model',
    '/Projects/Deterrence/Topic modeling/models/200406_3/lda.model',
)
fname, fname2, fname3 = map(datapath, model_files)

# mmap='r' is passed through to the loader unchanged for every model
lda, lda2, lda3 = [LdaMulticore.load(path, mmap = 'r') for path in (fname, fname2, fname3)]

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# DTM model
# Location of a previously fitted dynamic topic model - this is the file the
# "save the model" cell writes; point it elsewhere to load a different model.
dtm_model_path = 'Projects/Deterrence/Topic modeling/200410 dtmodel/dtm.model'

if os.path.exists(dtm_model_path):
  model = DtmModel.load(dtm_model_path)
else:
  # nothing to load yet: `model` is then created in the "Fit the model" section
  print(f'No saved DTM model found at {dtm_model_path!r} - fit one in the "Fit the model" section')

# Prepare data

## Sequential dates
It is important for the dynamic topic model that all records are sorted sequentially and no dates are missing. It is also important to sort the dataframe before creating a bag of words so that sequence of token ids remains correct.

In [None]:
#let's see how many rows we have for each year
#drop records missing a date FIRST: their year would be NaN, and casting NaN to int raises
data.dropna(subset=['date_rough'], inplace = True)
#transform date to just year (integer)
data['year'] = data['date_rough'].dt.year.astype(int)
#how many non-empty fields do we have in each year?
data.groupby(['year']).count()

Unnamed: 0_level_0,author,database,date,doi,filename,fulltext,place,pubtitle,title,url,date_rough
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010,1036,1255,1275,0,0,1275,1275,1275,1275,1275,1275
2011,534,657,667,0,40,667,627,667,667,627,667
2012,274,309,321,0,54,321,267,321,321,267,321
2013,163,169,182,0,20,182,162,182,182,162,182
2014,1139,1249,1259,0,42,1259,1217,1259,1259,1217,1259
2015,1316,1426,1440,0,111,1440,1329,1440,1440,1329,1440
2016,1297,1402,1417,34,216,1417,1167,1416,1417,1167,1417
2017,1487,1599,1642,73,412,1642,1157,1624,1641,1157,1642
2018,1535,1722,1780,273,592,1780,915,1758,1780,915,1780
2019,1093,1208,1244,253,365,1244,626,1240,1244,626,1244


In [None]:
# put the records in chronological order and renumber the rows from 0
data = data.sort_values('year', ascending = True).reset_index(drop = True)

# number of full texts per year, listed chronologically
dates_cnt = (
    data['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='excerpt_count')
    .sort_values('year', ascending = True)
)
dates_cnt #how it looks

Unnamed: 0,year,excerpt_count
4,2010,1275
7,2011,667
8,2012,321
9,2013,182
5,2014,1259
2,2015,1440
3,2016,1417
1,2017,1642
0,2018,1780
6,2019,1244


In [None]:
# number of text excerpts per year, in chronological order - these are the time slices
# passed to the dynamic topic model; e.g. the first entry (1275) counts the excerpts from 2010
time_seq = dates_cnt['excerpt_count'].tolist()
time_seq

[1275, 667, 321, 182, 1259, 1440, 1417, 1642, 1780, 1244]

## Dictionary and id2word

We'll create two dfs

*   with full corpus
*   with two subcorpora (military and academia)

### Full corpus

Our data is already tokenized into ngrams, so there's no need to do any preprocessing. Let's create a dictionary - the object a topic model uses to reference words.

In [None]:
# build the dictionary from the tokenized excerpts ("corpora" is the gensim module imported at the top)
dictionary = corpora.Dictionary(data['fulltext'].values)
vocab_size = len(dictionary)
print(f'{vocab_size} tokens overall') # size of the vocabulary before filtering

34195 tokens overall


In [None]:
# prune the vocabulary: no_below / no_above set the rarity and frequency cut-offs,
# keep_n caps the number of tokens that survive
dictionary.filter_extremes(keep_n=200000, no_above = 0.99, no_below = 2)
# renumber the remaining token ids so that they are sequential
dictionary.compactify()

In [None]:
# turn every document (a list of tokens) into its bag-of-words representation
data['bows'] = data['fulltext'].map(dictionary.doc2bow)

### Two subcorpora

In [None]:
# military publications: every record that comes from the military database
is_military = data['database'].eq('UDB_MIL')
military_subcorp = data.loc[is_military]

In [None]:
# academic publications: every record that comes from the academic database
is_academic = data['database'].eq('UDB_EDU')
acad_subcorp = data.loc[is_academic]

# Dynamic topic model

## Fit the model
If you're already using one of the models we've created (loaded in the 'Loading the data/models' section), please skip this step and go on to the next ones. 

If, however, you want to fit your own model - this section might help.

[DTM](https://radimrehurek.com/gensim/models/wrappers/dtmmodel.html) is Gensim's wrapper for the [dynamic topic model by Blei et al](https://dl.acm.org/doi/pdf/10.1145/1143844.1143859). It is a little bit trickier to run (requires some additional components to compile a binary file before training a model) compared to LDAseqModel written in native Python; however, it may also have a better performance - Colab does not seem to swallow LDAseqModel very well (typically throws an error during training or just runs it until the runtime disconnects). So DTM might be a way round this issue.

Some good simple tutorials:


*   https://markroxor.github.io/gensim/static/notebooks/dtm_example.html
*   https://markroxor.github.io/gensim/static/notebooks/ldaseqmodel.html



In [None]:
from gensim.models.wrappers import DtmModel

In [None]:
# compiling the required C++ code
# 1) fetch the DTM sources
! git clone https://github.com/blei-lab/dtm.git
# 2) install the GSL development package
#    (presumably a build dependency of dtm - confirm against the dtm README)
! sudo apt-get install libgsl0-dev
# 3) build inside the cloned repository; cd and make are chained in one command
! cd dtm/dtm && make

Cloning into 'dtm'...
remote: Enumerating objects: 715, done.[K
remote: Total 715 (delta 0), reused 0 (delta 0), pack-reused 715[K
Receiving objects: 100% (715/715), 4.48 MiB | 8.58 MiB/s, done.
Resolving deltas: 100% (261/261), done.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
Note, selecting 'libgsl-dev' instead of 'libgsl0-dev'
The following additional packages will be installed:
  libgsl23 libgslcblas0
Suggested packages:
  gsl-ref-psdoc | gsl-doc-pdf | gsl-doc-info | gsl-ref-html
The following NEW packages will be installed:
  libgsl-dev libgsl23 libgslcblas0
0 upgraded, 3 newly installed, 0 to remove and 25 not upgraded.
Need to get 1,926 kB of archives.
After this operation, 9,474 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libgslcblas0 amd64 2.4+dfsg-6 [79.7 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libgsl23 amd64 2.4+dfsg-6 [823 kB]
Get:3 http

In [None]:
# path to our binary file; it is handed to DtmModel as `dtm_path` when the model is fitted
# NOTE(review): absolute '/content/...' path - presumably the output of the `make` step
# when the repo is cloned into /content (Colab); adjust on other hosts
dtm_path = "/content/dtm/dtm/main"

In [None]:
# hyper-parameters of the dynamic topic model
dtm_settings = dict(
    num_topics=16,
    id2word=dictionary,
    initialize_lda=True,
    top_chain_var=0.05,  # higher than default to capture changes better
)

# fit the model and report how long the training took
start = time.time()
model = DtmModel(dtm_path = dtm_path, corpus = data['bows'].values, time_slices = time_seq,
                 **dtm_settings)
finish = time.time()

print(f'elapsed time: {finish - start}')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


elapsed time: 1531.3196246623993


In [None]:
# let's create a folder and save the model
# os.makedirs also creates missing parent folders, and exist_ok=True keeps the cell
# re-runnable once the folder exists (a plain `mkdir` fails in both situations)
dtm_model_dir = 'Projects/Deterrence/Topic modeling/200410 dtmodel'
os.makedirs(dtm_model_dir, exist_ok = True)
model.save(dtm_model_dir + '/dtm.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Create data for topic weights over time

We'll make use of the output produced above and tie it to our original data.

First we'll create a document-topic matrix. The challenge here is that we have 10 timeslots (2010-2019) with varying document-term densities for each. So we need to unpack them carefully, so that each timeslot in the model output is connected to the respective timeslot in the corpus.

In [None]:
#"unpack" data for time slots consecutively

first_year = 2010 # year of the first time slice; the years serve as dictionary keys
topics = {} # year -> doc-topic matrix returned for that time slice

for time_slice in range(len(time_seq)):
  # dtm_vis returns five objects per time slice; only the doc-topic matrix is stored here
  doc_topic, topic_term, doc_lengths, term_frequency, vocab = model.dtm_vis(corpus=data['bows'],
                                                                            time=time_slice)
  topics[first_year + time_slice] = doc_topic

You might remember that in the data preparation section we sorted our data by date in ascending order and created 'time slices' - i.e. the number of documents in each year. We'll now use this variable to cut the unpacked doc-topic matrices into the correct sequences.

In [None]:
# Cut every year's doc-topic matrix down to the rows of the documents published in that year.
# Documents are sorted by year, so each year owns one contiguous block of row positions.
yr = 2010 #our first year; each year will again serve as a dictionary key
first_doc_index = 0 #our count of docs starts with 0
for year_slice in time_seq: #iterate over our time sequence variable
  last_doc_index = first_doc_index+year_slice #position right after the last doc of the current slice
  #a matrix that already holds exactly this slice's number of rows has been cut before;
  #leaving it alone makes the cell safe to re-run (slicing it again would empty later years)
  if len(topics[yr]) != year_slice:
    topics[yr] = topics[yr][first_doc_index:last_doc_index]
  #the next slice starts where this one ended;
  #e.g. the first doc of slice 2 sits at position 1275 overall
  first_doc_index = last_doc_index
  yr += 1 #go to the next time slice

1275
1942
2263
2445
3704
5144
6561
8203
9983
11227


In [None]:
# example - how many documents (rows of the doc-topic matrix) do we have in year 2010?
len(topics[2010])

1275

Finally, we can convert our matrices to a single dataframe. They all have 16 columns (= number of topics), so they shouldn't be hard to stack.

In [None]:
# build one dataframe per year and stack them with a single concat call
# (concatenating inside the loop re-copies all accumulated rows on every iteration)
yearly_frames = []
for year_key, matrix in topics.items(): #iterate over the years and their doc-topic matrices
  frame = pd.DataFrame.from_records(matrix) #one row per document, one column per topic
  frame['year_pub'] = year_key #remember which year the rows belong to
  yearly_frames.append(frame)
#pd.concat refuses an empty list, so fall back to an empty frame as before
doc_topic_matrix = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()
#let's see how it looks
doc_topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,year_pub
0,0.404726,0.000397,0.097220,0.000397,0.349912,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.143373,0.000397,2010
1,0.000169,0.000169,0.000169,0.000169,0.000169,0.921690,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.075943,2010
2,0.000188,0.000188,0.000188,0.000188,0.000188,0.000188,0.122398,0.000188,0.000188,0.201186,0.000188,0.189662,0.000188,0.139284,0.345401,0.000188,2010
3,0.000368,0.000368,0.000368,0.000368,0.000368,0.994477,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,2010
4,0.583410,0.000208,0.000208,0.000208,0.052212,0.000208,0.181070,0.000208,0.000208,0.000208,0.000208,0.137844,0.000208,0.000208,0.043178,0.000208,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1239,0.000102,0.000102,0.142326,0.116791,0.000102,0.000102,0.000102,0.117025,0.000102,0.000102,0.158926,0.041673,0.422241,0.000102,0.000102,0.000102,2019
1240,0.000262,0.000262,0.000262,0.291522,0.000262,0.000262,0.000262,0.574149,0.000262,0.000262,0.000262,0.000262,0.000262,0.000262,0.130922,0.000262,2019
1241,0.000984,0.000984,0.000984,0.000984,0.000984,0.985236,0.000984,0.000984,0.000984,0.000984,0.000984,0.000984,0.000984,0.000984,0.000984,0.000984,2019
1242,0.113319,0.000133,0.000133,0.000133,0.539384,0.000133,0.000133,0.000133,0.052567,0.000133,0.000133,0.103216,0.190051,0.000133,0.000133,0.000133,2019


In [None]:
# give the topic columns readable names: 0 -> 'topic_0', ..., 15 -> 'topic_15'
topic_labels = {topic_id: f'topic_{topic_id}' for topic_id in range(16)}
doc_topic_matrix = doc_topic_matrix.rename(columns = topic_labels)
# sanity check - the 'matrix' must have as many rows as our dataframe
print(doc_topic_matrix.shape[0])
print(data.shape[0])
doc_topic_matrix.head(4)

11227
11227


Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15,year_pub
0,0.404726,0.000397,0.09722,0.000397,0.349912,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.143373,0.000397,2010
1,0.000169,0.000169,0.000169,0.000169,0.000169,0.92169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.075943,2010
2,0.000188,0.000188,0.000188,0.000188,0.000188,0.000188,0.122398,0.000188,0.000188,0.201186,0.000188,0.189662,0.000188,0.139284,0.345401,0.000188,2010
3,0.000368,0.000368,0.000368,0.000368,0.000368,0.994477,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,2010


In [None]:
# put the publication data and the topic weights side by side (matched by row position),
# drop the columns we no longer need and turn the year into a datetime
topics_over_time = (
    pd.concat([data, doc_topic_matrix.reset_index(drop = True)], axis = 1)
    .drop(columns = ['date', # unparsed dates - not needed anymore
                     'filename', 'year_pub'])
    .assign(year = lambda frame: pd.to_datetime(frame['year'], format = '%Y'))
)
topics_over_time.head()

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,date_rough,year,bows,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14,topic_15
0,Aleksei Khazbiev,,,"[похоже, американец, уверить, существенный, ра...","Moscow,\n ...",,Amerikanskii kapkan,https://dlib.eastview.com/browse/doc/21183436,2010-01-18 00:00:00+00:00,2010-01-01,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",0.404726,0.000397,0.09722,0.000397,0.349912,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.000397,0.143373,0.000397
1,Viktor Litovkin,UDB_MIL,,"[нужный, доктрина, президент, напомнить, прису...","Moscow,\n ...",Nezavisimoe voennoe obozrenie,"ARMEISKAIa REFORMA NA ""TROEChKU""",https://dlib.eastview.com/browse/doc/21476524,2010-03-12 00:00:00+00:00,2010-01-01,"[(14, 3), (15, 1), (24, 1), (25, 1), (26, 1), ...",0.000169,0.000169,0.000169,0.000169,0.000169,0.92169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.000169,0.075943
2,Dmitrii RIuRIKOV,UDB_MIL,,"[быстрый_глобальный_удар, проект, продумать, в...","Moscow,\n ...",VPK. Voenno-promyshlennyi kur'er,PLIuS BYSTRYI GLOBAL'NYI UDAR,https://dlib.eastview.com/browse/doc/21504758,2010-03-17 00:00:00+00:00,2010-01-01,"[(2, 1), (23, 1), (24, 1), (41, 1), (47, 1), (...",0.000188,0.000188,0.000188,0.000188,0.000188,0.000188,0.122398,0.000188,0.000188,0.201186,0.000188,0.189662,0.000188,0.139284,0.345401,0.000188
3,Andrei GAVRILENKO,UDB_MIL,,"[сообщить, главное, штаб, вмф, подводный_лодка...","Moscow,\n ...",Krasnaia zvezda,OBESPEChIVAIa NATsIONAL'NUIu BEZOPASNOST',https://dlib.eastview.com/browse/doc/21541724,2010-03-20 00:00:00+00:00,2010-01-01,"[(2, 1), (24, 1), (61, 1), (64, 1), (68, 1), (...",0.000368,0.000368,0.000368,0.000368,0.000368,0.994477,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368,0.000368
4,Vadim MARKUShIN,UDB_MIL,,"[самый_дело, мочь, стратегически_важный, глыба...","Moscow,\n ...",Krasnaia zvezda,POSLEDNII ShAG - ON SAMYI TRUDNYI,https://dlib.eastview.com/browse/doc/21490896,2010-03-16 00:00:00+00:00,2010-01-01,"[(64, 1), (94, 2), (142, 1), (143, 1), (144, 1...",0.58341,0.000208,0.000208,0.000208,0.052212,0.000208,0.18107,0.000208,0.000208,0.000208,0.000208,0.137844,0.000208,0.000208,0.043178,0.000208


In [None]:
data.columns # columns of the source dataframe at this point, used as id columns when reshaping below

Index(['author', 'database', 'date', 'doi', 'filename', 'fulltext', 'place',
       'pubtitle', 'title', 'url', 'date_rough', 'year', 'bows'],
      dtype='object')

In [None]:
# reshape to long format: one row per (document, topic) pair
id_columns = ['author', 'database', 'doi', 'fulltext', 'place', 'pubtitle', 'title', 'url',
              'date_rough', 'year', 'bows'] # document attributes repeated on every row
weight_columns = [f'topic_{topic_id}' for topic_id in range(16)] # 'topic_0' ... 'topic_15'
topics_over_time = topics_over_time.melt(id_vars = id_columns,
                                         value_vars = weight_columns,
                                         var_name = 'topic_num',
                                         value_name = 'topic_weight')
topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,date_rough,year,bows,topic_num,topic_weight
0,Aleksei Khazbiev,,,"[похоже, американец, уверить, существенный, ра...","Moscow,\n ...",,Amerikanskii kapkan,https://dlib.eastview.com/browse/doc/21183436,2010-01-18 00:00:00+00:00,2010-01-01,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,0.404726
1,Viktor Litovkin,UDB_MIL,,"[нужный, доктрина, президент, напомнить, прису...","Moscow,\n ...",Nezavisimoe voennoe obozrenie,"ARMEISKAIa REFORMA NA ""TROEChKU""",https://dlib.eastview.com/browse/doc/21476524,2010-03-12 00:00:00+00:00,2010-01-01,"[(14, 3), (15, 1), (24, 1), (25, 1), (26, 1), ...",topic_0,0.000169
2,Dmitrii RIuRIKOV,UDB_MIL,,"[быстрый_глобальный_удар, проект, продумать, в...","Moscow,\n ...",VPK. Voenno-promyshlennyi kur'er,PLIuS BYSTRYI GLOBAL'NYI UDAR,https://dlib.eastview.com/browse/doc/21504758,2010-03-17 00:00:00+00:00,2010-01-01,"[(2, 1), (23, 1), (24, 1), (41, 1), (47, 1), (...",topic_0,0.000188


We've now matched topic weights to the text excerpts. However, we still need to add terms to our df to be able to read the topics.

In [None]:
# For every year, collect the 15 most salient terms of each topic.
year = 2010 # the first year in the corpus
term_topics_by_time = {} # year -> list with one [(weight, term), ...] list per topic

for t in range(len(time_seq)): #iterate over all units (years) in our time sequence
  # a dedicated name is used here instead of `topics`, so the doc-topic matrices
  # stored under that name earlier are not overwritten
  year_topics = [
      [(np.around(weight, 3), term) #weights rounded to 3 decimals
       for weight, term in model.show_topic(topicid = n, #topic n
                                            time = t, #in timeslice t
                                            topn = 15)] #15 most salient terms
      for n in range(model.num_topics) #go through all topics of the model
  ]
  term_topics_by_time[year] = year_topics #populate the current timeslice with its topics
  year += 1 #go to the next year

#let's check how it looks
print(term_topics_by_time[2011])

[[(0.051, 'ядерный'), (0.028, 'сдерживание'), (0.021, 'ядерный_оружие'), (0.016, 'сша'), (0.009, 'мочь'), (0.008, 'страна'), (0.008, 'мир'), (0.007, 'союзник'), (0.007, 'свой'), (0.007, 'взаимный'), (0.006, 'государство'), (0.006, 'нападение'), (0.006, 'средство'), (0.006, 'стратегия'), (0.006, 'война')], [(0.017, 'кндр'), (0.012, 'сша'), (0.01, 'сторона'), (0.009, 'ядерный'), (0.009, 'заявить'), (0.009, 'сдерживание'), (0.008, 'пхеньян'), (0.008, 'страна'), (0.008, 'свой'), (0.007, 'подчеркнуть'), (0.007, 'южный_корея'), (0.007, 'военный'), (0.007, 'северный_корея'), (0.006, 'программа'), (0.006, 'переговоры')], [(0.03, 'ядерный'), (0.018, 'сдерживание'), (0.018, 'ракета'), (0.012, 'сша'), (0.01, 'вооружение'), (0.009, 'стратегический'), (0.008, 'ядерный_оружие'), (0.008, 'мочь'), (0.007, 'сила'), (0.007, 'удар'), (0.007, 'оружие'), (0.007, 'великобритания'), (0.006, 'средство'), (0.006, 'стратегия'), (0.006, 'страна')], [(0.008, 'сша'), (0.007, 'вопрос'), (0.007, 'американский'), (0.

In [None]:
# one column per year, one row per topic; each cell holds that topic's (weight, term) list
topic_term_matrix = pd.DataFrame.from_dict(term_topics_by_time)
topic_term_matrix.head(5)

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,"[(0.056, ядерный), (0.036, сдерживание), (0.02...","[(0.051, ядерный), (0.028, сдерживание), (0.02...","[(0.037, ядерный), (0.025, ядерный_оружие), (0...","[(0.029, ядерный), (0.025, ядерный_оружие), (0...","[(0.025, ядерный), (0.024, сдерживание), (0.01...","[(0.025, ядерный), (0.023, сдерживание), (0.01...","[(0.025, ядерный), (0.022, сдерживание), (0.01...","[(0.031, ядерный), (0.023, сдерживание), (0.01...","[(0.037, ядерный), (0.024, сша), (0.02, сдержи...","[(0.026, ядерный), (0.021, сша), (0.018, сдерж..."
1,"[(0.02, кндр), (0.012, ядерный), (0.01, свой),...","[(0.017, кндр), (0.012, сша), (0.01, сторона),...","[(0.016, кндр), (0.013, сша), (0.01, сторона),...","[(0.019, кндр), (0.013, сша), (0.011, заявить)...","[(0.021, кндр), (0.012, заявить), (0.011, сша)...","[(0.02, кндр), (0.012, заявить), (0.01, програ...","[(0.021, кндр), (0.012, заявить), (0.01, пхень...","[(0.026, кндр), (0.011, северный_корея), (0.01...","[(0.024, россия), (0.015, кндр), (0.008, вопро...","[(0.023, россия), (0.011, страна), (0.01, заяв..."
2,"[(0.039, ядерный), (0.018, сдерживание), (0.01...","[(0.03, ядерный), (0.018, сдерживание), (0.018...","[(0.029, ядерный), (0.02, ракета), (0.018, сде...","[(0.034, ядерный), (0.019, сдерживание), (0.01...","[(0.039, ядерный), (0.023, сдерживание), (0.01...","[(0.039, ядерный), (0.029, сдерживание), (0.02...","[(0.041, ядерный), (0.024, сдерживание), (0.01...","[(0.044, ядерный), (0.023, сдерживание), (0.01...","[(0.043, ядерный), (0.023, сдерживание), (0.01...","[(0.028, ядерный), (0.018, сдерживание), (0.01..."
3,"[(0.008, ссср), (0.007, мнение), (0.007, амери...","[(0.008, сша), (0.007, вопрос), (0.007, америк...","[(0.011, сша), (0.008, американский), (0.008, ...","[(0.012, сша), (0.01, американский), (0.008, в...","[(0.012, сша), (0.01, сдерживание), (0.01, аме...","[(0.012, сдерживание), (0.012, сша), (0.011, а...","[(0.014, американский), (0.013, сша), (0.013, ...","[(0.016, сша), (0.013, американский), (0.011, ...","[(0.011, сша), (0.011, американский), (0.011, ...","[(0.015, президент), (0.011, стать), (0.01, ам..."
4,"[(0.025, сша), (0.018, китай), (0.018, военный...","[(0.029, сша), (0.017, китай), (0.015, страна)...","[(0.024, сша), (0.021, китай), (0.015, страна)...","[(0.025, китай), (0.024, сша), (0.013, страна)...","[(0.029, китай), (0.021, сша), (0.011, сдержив...","[(0.026, китай), (0.023, сша), (0.013, сдержив...","[(0.024, сша), (0.023, китай), (0.015, сдержив...","[(0.023, сша), (0.023, китай), (0.015, сдержив...","[(0.033, китай), (0.019, сша), (0.013, страна)...","[(0.031, китай), (0.016, сша), (0.015, страна)..."


In [None]:
# flip the matrix so that years become rows, copy the year labels from the index
# into a regular integer column and renumber the rows from 0
topic_term_table = (
    topic_term_matrix
    .transpose()
    .assign(year = lambda frame: frame.index.astype(int))
    .reset_index(drop = True)
)
topic_term_table.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,year
0,"[(0.056, ядерный), (0.036, сдерживание), (0.02...","[(0.02, кндр), (0.012, ядерный), (0.01, свой),...","[(0.039, ядерный), (0.018, сдерживание), (0.01...","[(0.008, ссср), (0.007, мнение), (0.007, амери...","[(0.025, сша), (0.018, китай), (0.018, военный...","[(0.043, сдерживание), (0.034, ядерный), (0.03...","[(0.041, российский_федерация), (0.03, сдержив...","[(0.034, нато), (0.023, россия), (0.012, альян...","[(0.021, война), (0.012, советский), (0.01, ус...","[(0.035, ядерный), (0.028, сдерживание), (0.02...","[(0.025, военный), (0.015, сила), (0.015, росс...","[(0.01, мир), (0.01, свой), (0.009, мочь), (0....","[(0.026, сдерживание), (0.016, система), (0.01...","[(0.018, иран), (0.016, устрашение), (0.015, и...","[(0.034, ядерный), (0.03, сша), (0.029, сдержи...","[(0.007, свой), (0.007, наш), (0.007, говорить...",2010
1,"[(0.051, ядерный), (0.028, сдерживание), (0.02...","[(0.017, кндр), (0.012, сша), (0.01, сторона),...","[(0.03, ядерный), (0.018, сдерживание), (0.018...","[(0.008, сша), (0.007, вопрос), (0.007, америк...","[(0.029, сша), (0.017, китай), (0.015, страна)...","[(0.038, сдерживание), (0.027, стратегический)...","[(0.022, российский_федерация), (0.017, морско...","[(0.035, нато), (0.024, россия), (0.011, альян...","[(0.014, война), (0.013, советский), (0.011, у...","[(0.034, ядерный), (0.029, сила), (0.022, стра...","[(0.021, военный), (0.018, сила), (0.012, опер...","[(0.01, свой), (0.01, мир), (0.01, мочь), (0.0...","[(0.023, сдерживание), (0.012, военный), (0.01...","[(0.016, устрашение), (0.014, терроризм), (0.0...","[(0.034, ядерный), (0.032, сдерживание), (0.02...","[(0.009, человек), (0.008, свой), (0.006, гово...",2011
2,"[(0.037, ядерный), (0.025, ядерный_оружие), (0...","[(0.016, кндр), (0.013, сша), (0.01, сторона),...","[(0.029, ядерный), (0.02, ракета), (0.018, сде...","[(0.011, сша), (0.008, американский), (0.008, ...","[(0.024, сша), (0.021, китай), (0.015, страна)...","[(0.035, сдерживание), (0.034, стратегический)...","[(0.016, морской), (0.015, российский_федераци...","[(0.028, нато), (0.026, россия), (0.011, сша),...","[(0.016, советский), (0.014, война), (0.01, ус...","[(0.031, ядерный), (0.028, сила), (0.024, стра...","[(0.02, сила), (0.017, военный), (0.011, опера...","[(0.01, мир), (0.009, свой), (0.009, мочь), (0...","[(0.021, сдерживание), (0.012, военный), (0.01...","[(0.014, терроризм), (0.012, устрашение), (0.0...","[(0.047, ядерный), (0.034, сдерживание), (0.02...","[(0.011, человек), (0.007, свой), (0.006, гово...",2012


In [None]:
# readable topic column names: 0 -> 'topic_0', ..., 15 -> 'topic_15'
topic_labels = {topic_id: f'topic_{topic_id}' for topic_id in range(16)}

topic_term_table = (
    topic_term_table
    .rename(columns = topic_labels)
    # wide -> long: one row per (year, topic) with that topic's term list
    .melt(id_vars = 'year', value_vars = list(topic_labels.values()),
          var_name = 'topic_num', value_name = 'terms')
    .reset_index(drop = True)
    # integer years -> datetime (1 January of that year)
    .assign(year = lambda frame: pd.to_datetime(frame['year'], format = '%Y'))
)

topic_term_table.head(3)

Unnamed: 0,year,topic_num,terms
0,2010-01-01,topic_0,"[(0.056, ядерный), (0.036, сдерживание), (0.02..."
1,2011-01-01,topic_0,"[(0.051, ядерный), (0.028, сдерживание), (0.02..."
2,2012-01-01,topic_0,"[(0.037, ядерный), (0.025, ядерный_оружие), (0..."


In [None]:
# attach each topic's terms for the given year to every (document, topic) row;
# a left join keeps all rows of the topics-over-docs table
topics_over_time = pd.merge(topics_over_time, topic_term_table,
                            on = ['year', 'topic_num'], how = 'left')

topics_over_time.head(3)

Unnamed: 0,author,database,doi,fulltext,place,pubtitle,title,url,date_rough,year,bows,topic_num,topic_weight,terms
0,Aleksei Khazbiev,,,"[похоже, американец, уверить, существенный, ра...","Moscow,\n ...",,Amerikanskii kapkan,https://dlib.eastview.com/browse/doc/21183436,2010-01-18 00:00:00+00:00,2010-01-01,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...",topic_0,0.404726,"[(0.056, ядерный), (0.036, сдерживание), (0.02..."
1,Viktor Litovkin,UDB_MIL,,"[нужный, доктрина, президент, напомнить, прису...","Moscow,\n ...",Nezavisimoe voennoe obozrenie,"ARMEISKAIa REFORMA NA ""TROEChKU""",https://dlib.eastview.com/browse/doc/21476524,2010-03-12 00:00:00+00:00,2010-01-01,"[(14, 3), (15, 1), (24, 1), (25, 1), (26, 1), ...",topic_0,0.000169,"[(0.056, ядерный), (0.036, сдерживание), (0.02..."
2,Dmitrii RIuRIKOV,UDB_MIL,,"[быстрый_глобальный_удар, проект, продумать, в...","Moscow,\n ...",VPK. Voenno-promyshlennyi kur'er,PLIuS BYSTRYI GLOBAL'NYI UDAR,https://dlib.eastview.com/browse/doc/21504758,2010-03-17 00:00:00+00:00,2010-01-01,"[(2, 1), (23, 1), (24, 1), (41, 1), (47, 1), (...",topic_0,0.000188,"[(0.056, ядерный), (0.036, сдерживание), (0.02..."


In [None]:
# these columns are not part of the export
topics_over_time = topics_over_time.drop(columns = ['bows', 'doi', 'date_rough'])

In [None]:
def simplify_list(lst):
  """Turn a sequence of pairs into a list of 'first second' strings.

  The first two items of every element are converted with str() and joined
  by a single space, e.g. (0.056, 'term') becomes '0.056 term'.
  """
  flattened = []
  for pair in lst:
    flattened.append(' '.join((str(pair[0]), str(pair[1]))))
  return flattened

In [None]:
# Build the export table as a NEW dataframe whose 'terms' column holds plain
# 'weight term' strings. A bare `topics_over_time_simplified = topics_over_time` only
# aliases the same object, so the original table was modified as well and running the
# cell a second time re-simplified the already simplified strings.
topics_over_time_simplified = topics_over_time.assign(
    terms = topics_over_time['terms'].apply(simplify_list)
)

In [None]:
# finally - export as line-delimited JSON (one record per line)
dtm_export_path = wd+'Topic modeling/200410_1105_rudeter_dtm.json'
topics_over_time_simplified.to_json(dtm_export_path,
                                    orient = 'records', lines = True,
                                    force_ascii=False, # write non-ASCII text as-is
                                    date_format = 'iso')

## Create data for term weights over time

Now let's tweak it for the streamgraph and other diachronic visualizations.
First, a simple task - export the topic-term table ordered by year.

In [None]:
# flat list of [weight, term, year, topic number] records,
# covering the 15 top terms of each of the 16 topics in every time slice
topics = []
for time_slice in range(len(time_seq)):
  year = 2010 + time_slice # 2010 is the first year in the corpus
  for topic_id in range(16):
    one_topic = model.show_topic(topicid = topic_id, time = time_slice, topn = 15)
    # tag every (weight, term) pair with its year and topic number
    topics.extend(list(term) + [year, topic_id] for term in one_topic)

print(topics[0])

[0.02434929661075315, 'иран', 2010, 0]


In [None]:
# number of collected records: one per time slice x 16 topics x 15 terms
len(topics)

2400

In [None]:
# tabulate the term/topic weight records, one record per row
term_columns = ['weight', 'term', 'year', 'topic n']
terms_by_time = pd.DataFrame(data = topics, columns = term_columns)
terms_by_time.head()

Unnamed: 0,weight,term,year,topic n
0,0.024349,иран,2010,0
1,0.015444,российский_федерация,2010,0
2,0.013948,санкция,2010,0
3,0.010349,кндр,2010,0
4,0.009477,территория,2010,0


In [None]:
#write the table
# NOTE(review): this path does not use `wd` like the JSON export does - confirm it is intended.
terms_by_time.to_csv(
    path_or_buf='Projects/x/Scripts and supporting files/200311_term_weights.csv'
)

# LDA

## Academic and military together

## Fit the model
If you're already using one of the models we've created (i.e. you loaded it in the 'Loading the output' section), please skip this step and go on to the next ones. 

If, however, you want to fit your own model - this section might help.

In [None]:
# How many CPUs does our host have?
# Runs the shell command `nproc --all` and shows its output in the cell.
! nproc --all

2


In [None]:
from gensim.models import LdaMulticore

In [None]:
# Fit LDA on the whole corpus (the 'bows' column of `data`).
# random_state seeds gensim's random number generator; without it every run
# starts from a different random initialisation and the topics below cannot be
# reproduced. (With several workers small run-to-run differences may remain.)
lda = LdaMulticore(corpus=data['bows'].values, id2word=dictionary,
                   num_topics=20,  # let's try more this time
                   passes=30,
                   alpha=0.5, eta=0.01,
                   workers=2,
                   random_state=42)

  diff = np.log(self.expElogbeta)


In [None]:
# Let's also look into what our topics are made of.
# top_topics is called on the same corpus the model was fitted on (it could be
# another one); each item it returns is unpacked as (terms, score) and every
# term entry as (weight, word).
# The throw-away names avoid rebinding `_`, which would stop IPython from
# updating its last-output variable, and avoid reusing `words` for two types.
for i, (topic_terms, _score) in enumerate(lda.top_topics(data['bows'].values)):
    print(i, ' '.join(word for _weight, word in topic_terms))

0 китай страна отношение сша кнр китайский свой россия экономический пекин вопрос регион американский политика япония сторона стать влияние позиция вашингтон
1 военный безопасность регион страна государство международный сфера россия цель борьба региональный действие сотрудничество сдерживание область глобальный укрепление интерес участие развитие
2 сша россия союзник сдерживание американский свой соединить_штат европа договор угроза вашингтон нато сторона российский политика новый москва америка территория наш
3 сила стратегический сдерживание военный задача противник средство применение обеспечение действие агрессия ядерный операция государство должный вооружённый_сила нападение защита оборона угроза
4 сдерживание ядерный система потенциал стратегический средство возможность угроза оружие применение условие неядерный стратегия военный ядерный_оружие стратегический_стабильность сша цель уровень эффективный
5 россия нато сдерживание политика альянс стратегия отношение подход европейски

Let's also try to play with the hyperparameters to create 'alternative' models. More explanations on [Rizzoma](https://rizzoma.com/topic/194a9c875127f4bac9a94bdbbdec9b78/0_b_aurd_akd8b/)

In [None]:
# Settings shared by both alternative models.
# random_state seeds the random number generator so the fits can be reproduced.
# NOTE(review): `bows` is not defined in this section, while the main model is
# fitted on data['bows'].values - confirm both refer to the same corpus.
alt_lda_kwargs = dict(corpus=bows, id2word=dictionary,
                      num_topics=20,  # same number of topics as the main model
                      passes=30,
                      workers=2,
                      random_state=42)

# all default: alpha and eta are left at gensim's defaults
lda2 = LdaMulticore(**alt_lda_kwargs)
# lower alpha (0.2 instead of the main model's 0.5), same eta
lda3 = LdaMulticore(alpha=0.2, eta=0.01, **alt_lda_kwargs)

## Visualize

First we need to install and import our main visualization library - pyLDAvis

In [None]:
! pip install pyLDAvis
import pyLDAvis.gensim as gensimvis
import pyLDAvis
pyLDAvis.enable_notebook() #enable plotting in the notebook

### The whole corpus

In [None]:
# Prepare the pyLDAvis data for the main model on the whole corpus
# (n_jobs=-1 means 'use all CPU cores available') and show it in the notebook.
vis_data = gensimvis.prepare(
    topic_model=lda,
    corpus=data['bows'].values,
    dictionary=dictionary,
    n_jobs=-1,
)
pyLDAvis.display(vis_data)

In [None]:
# write the pyLDAvis output for the main model to an HTML file
pyLDAvis.save_html(vis_data, 'Projects/Deterrence/Topic modeling/200408_lda_1.html')

Let's also try visualizing our 'alternative' models.

In [None]:
# Prepare the pyLDAvis data for both alternative models on the same corpus;
# the generator yields the results in the order (lda2, lda3).
vis_data2, vis_data3 = (
    gensimvis.prepare(topic_model=alt_model, corpus=bows.values, dictionary=dictionary)
    for alt_model in (lda2, lda3)
)

In [None]:
# write the pyLDAvis output of both alternative models to HTML files
for prepared, html_path in (
    (vis_data2, 'Projects/Deterrence/Topic modeling/200406_lda_2.html'),
    (vis_data3, 'Projects/Deterrence/Topic modeling/200406_lda_3.html'),
):
    pyLDAvis.save_html(prepared, html_path)

In [None]:
# model 2 (lda2, fitted without explicit alpha/eta): show its visualization in the notebook
pyLDAvis.display(vis_data2)

In [None]:
# model 3 (lda3, fitted with alpha = 0.2): show its visualization in the notebook
pyLDAvis.display(vis_data3)

Apart from the visualizations, let's also save the models so we can reapply them later.

In [None]:
import os

# Save each fitted model into its own directory.
# os.makedirs(..., exist_ok=True) replaces the `! mkdir` calls: it also creates
# missing parent directories and does not complain when the directory already
# exists, so the cell can be re-run safely.
models_root = 'Projects/Deterrence/Topic modeling/models/'
for run_name, fitted_model in (('200406_1', lda),
                               ('200406_2', lda2),
                               ('200406_3', lda3)):
    run_dir = models_root + run_name + '/'
    os.makedirs(run_dir, exist_ok=True)
    fitted_model.save(run_dir + 'lda.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Academic and military separately
Let's also use the model we've trained on the whole corpus to see how two subcorpora (military and academic) look separately and (hopefully) compare them.

In [None]:
# Prepare two visualizations with the model trained on the whole corpus:
# first for the military subcorpus, then for the academic one.
# n_jobs=-1 means 'use all CPU cores available'.
mil_vis_data, acad_vis_data = (
    gensimvis.prepare(topic_model=lda,
                      corpus=subcorpus['bows'].values,
                      dictionary=dictionary,
                      n_jobs=-1)
    for subcorpus in (military_subcorp, acad_subcorp)
)

In [None]:
# docs from the military database: show the visualization prepared from military_subcorp
pyLDAvis.display(mil_vis_data)

In [None]:
# docs from the academic database: show the visualization prepared from acad_subcorp
pyLDAvis.display(acad_vis_data)

In [None]:
# write the pyLDAvis output of both subcorpora to HTML files
for prepared, html_path in (
    (mil_vis_data, 'Projects/Deterrence/Topic modeling/200408_lda_mil.html'),
    (acad_vis_data, 'Projects/Deterrence/Topic modeling/200408_lda_acad.html'),
):
    pyLDAvis.save_html(prepared, html_path)