# Topic modelling med gensim og dhlab

In [2]:
import dhlab as dh
import pandas as pd
import gensim
import pyLDAvis
from pprint import pprint
import pyLDAvis.gensim_models as genvis

In [3]:
# sprit urn URN:NBN:no-nb_digibok_2020090207537

Finn en bok

In [4]:
# Build a dhlab Corpus restricted to doctype "digibok", capped at one hit
# (limit=1) and matched on the title "Norgeshistorie".
bok = dh.Corpus(doctype="digibok", limit=1, title="Norgeshistorie")

In [5]:
bok

Unnamed: 0,dhlabid,urn,title,authors,oaiid,sesamid,isbn10,city,timestamp,year,publisher,langs,subjects,ddc,genres,literaryform,doctype,ocr_creator,ocr_timestamp
0,100027602,URN:NBN:no-nb_digibok_2010121306072,Norge. 2 : Norgeshistorie etter 1850,"Emblem , Terje",oai:nb.bibsys.no:999716948264702202,d33916ecf7219435b23a30c0d7415fee,8202163757,,19970101,1997,Cappelen,nob,History / Norway / Historie / Norge-historie /...,,,Faglitteratur,digibok,nb,20060101


In [6]:
# Take the URN from the first row of the corpus frame; it identifies the
# book used in the following cells.
urn = bok.frame["urn"].tolist()[0]
urn

'URN:NBN:no-nb_digibok_2010121306072'

## Chunking

In [7]:
# Fetch per-chunk token counts for the book. chunks=1000 is presumably the
# chunk size in tokens — TODO confirm against the dhlab Chunks documentation.
res = dh.Chunks(chunks=1000, urn=urn)

In [8]:
# Number of chunks returned for the book.
len(res.chunks)

103

In [9]:
def chunks_to_corpus(chunks_list):
    """Turn per-chunk token counts into one space-separated string per chunk.

    Each element of ``chunks_list`` is iterated for its tokens and indexed by
    token to get an integer count. Every token is written out ``count`` times,
    each occurrence followed by a single space, so a non-empty result ends
    with a trailing space.

    Returns a list of strings, one per chunk, in input order.
    """
    return [
        "".join((token + " ") * chunk[token] for token in chunk)
        for chunk in chunks_list
    ]

In [10]:
# One repeated-token string per chunk, built from the raw chunk counts.
# NOTE(review): `texts` is not referenced again in the cells shown here.
texts = chunks_to_corpus(res.chunks)

## Find delta TFIDF

In [11]:
# Token-by-chunk count matrix: after the transpose, rows are tokens and
# columns are chunk numbers; a token missing from a chunk gets 0 instead of NaN.
df = pd.DataFrame(res.chunks).transpose().fillna(0)

In [12]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,100,101,102
(,2.0,4.0,0.0,9.0,2.0,2.0,4.0,2.0,2.0,3.0,...,1.0,6.0,3.0,3.0,1.0,2.0,0.0,2.0,0.0,0.0
),2.0,4.0,0.0,9.0,2.0,3.0,5.0,2.0,2.0,3.0,...,1.0,6.0,4.0,3.0,1.0,6.0,1.0,2.0,0.0,0.0
",",176.0,27.0,30.0,50.0,23.0,42.0,49.0,26.0,31.0,38.0,...,24.0,39.0,29.0,32.0,44.0,31.0,38.0,49.0,45.0,50.0
-,1.0,7.0,3.0,4.0,5.0,21.0,10.0,8.0,14.0,5.0,...,4.0,7.0,4.0,0.0,9.0,17.0,4.0,12.0,7.0,13.0
.,3.0,49.0,59.0,61.0,50.0,53.0,40.0,35.0,40.0,53.0,...,51.0,48.0,47.0,54.0,56.0,48.0,55.0,55.0,57.0,46.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vjmli,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
xft,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
yji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
yngfe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Reference counts: the 50000 most frequent tokens in the NB collection.
tot = dh.totals(50000)

In [14]:
# Divide corpus freq count by tot
res = df.sum(axis=1) /  tot.freq

In [15]:
# Get top 1000 more frequent tokens
target_tokens = res.sort_values(ascending=False).iloc[:1000].dropna().index

In [16]:
# Per-chunk counts for the selected target tokens only.
df.loc[target_tokens]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,93,94,95,96,97,98,99,100,101,102
mellomkrigstiden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
UJ,1.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,4.0,0.0,1.0,1.0
unionen,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,10.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
Nevn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nasjonalistiske,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sysselsatt,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
opposisjonen,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Nasjonalbiblioteket,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
beid,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
# Rebuild one pseudo-document per chunk (one per column of df), containing
# only the target tokens, each repeated as many times as it occurs there.
# The row selection is loop-invariant, so it is done once up front instead of
# being re-sliced for every column.
target_counts = df.loc[target_tokens]

outer_lst = []
for col in target_counts.columns:
    # Counts are floats after fillna(0), hence the int() before repeating.
    outer_lst.append(
        "".join(
            (token + " ") * int(count)
            for token, count in target_counts[col].items()
            if count > 0
        )
    )


In [18]:
# One string per column of df, so this should equal the number of chunks.
len(outer_lst)

103

## Prep for LDA

In [19]:
# Split each pseudo-document on whitespace: one token list per chunk.
data = [doc.split() for doc in outer_lst]

In [20]:
# Build the gensim Dictionary (the token vocabulary) from the token lists.
id2word = gensim.corpora.Dictionary(data)

In [21]:
# Bag-of-words encoding of every chunk, using the vocabulary in id2word.
corpus = [id2word.doc2bow(chunk) for chunk in data]

In [22]:
# NOTE(review): this displays the whole nested token list; a slice such as
# data[0][:20] would keep the saved output short.
data

[['UJ',
  'Nygaardsvold',
  'Terboven',
  'Hurtigruta',
  'Vidkun',
  'Sverdrup',
  'nasjonalisme',
  'stemmerett',
  'NKP',
  'Quisling',
  'Riksarkivet',
  'prostitusjon',
  'Hallesby',
  'prevensjon',
  'Folkeparti',
  'Folkeparti',
  'Stortinget',
  'samer',
  'Sosialistisk',
  'Sosialistisk',
  'kontinentalsokkelen',
  'alderstrygd',
  'abort',
  'Korvald',
  'Sametinget',
  'Venstre',
  'NATO',
  'middelskolen',
  'Lahnstein',
  '279',
  '279',
  'Linge',
  'Oscarsborg',
  'Michelsen',
  'Nasjonal',
  'Hitler',
  'Kristelig',
  'Samling',
  'N.A',
  'nasjonalbudsjett',
  'studerende',
  'Småbrukarlag',
  '263',
  '263',
  '263',
  '263',
  '263',
  '263',
  'Nansen',
  'Lyng',
  'realskolen',
  'Kommunistiske',
  '248',
  '248',
  'Fridtjof',
  'festning',
  'Selmer',
  'Venstreparti',
  'vekkelse',
  'Bonde-',
  'Hansteen',
  'Jernverk',
  'Rikskonsertene',
  'Odelstinget',
  'Næringslivets',
  'Postsparebanken',
  '208',
  '208',
  'asylsøkere',
  'sosialhjelp',
  'Kittelsen',


## Make model

In [23]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# random_state pins the RNG seed so that repeated runs start from the same
# initialisation. With several worker processes small run-to-run differences
# may remain — NOTE(review): confirm reproducibility on your setup.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=42,
)

In [24]:
# Show the weighted top words of each topic.
pprint(lda_model.print_topics())

[(0,
  '0.036*"årene" + 0.024*"Stortinget" + 0.019*"regjeringen" + 0.019*"z" + '
  '0.013*"UJ" + 0.013*"Z" + 0.012*"Regjeringen" + 0.011*"Arbeiderpartiet" + '
  '0.011*"tallet" + 0.011*"førte"'),
 (1,
  '0.034*"årene" + 0.021*"1970" + 0.019*"Stortinget" + 0.015*"\x84" + '
  '0.014*"Venstre" + 0.013*"z" + 0.011*"Arbeiderpartiet" + 0.010*"1960" + '
  '0.010*"tallet" + 0.010*"verdenskrig"'),
 (2,
  '0.052*"årene" + 0.032*"Stortinget" + 0.019*"Z" + 0.013*"tallet" + 0.012*"z" '
  '+ 0.012*"regjeringen" + 0.010*"1970" + 0.010*"kongen" + 0.009*"økte" + '
  '0.009*"Sverdrup"'),
 (3,
  '0.020*"regjeringen" + 0.018*"tyskerne" + 0.017*"årene" + 0.013*"Stortinget" '
  '+ 0.012*"z" + 0.011*"1940" + 0.011*"tallet" + 0.011*"kilder" + 0.011*"1970" '
  '+ 0.010*"problemstilling"'),
 (4,
  '0.022*"årene" + 0.021*"Stortinget" + 0.015*"Venstre" + '
  '0.015*"Arbeiderpartiet" + 0.015*"z" + 0.013*"partiet" + 0.011*"Høyre" + '
  '0.011*"vedtok" + 0.010*"utlendinger" + 0.010*"seinere"'),
 (5,
  '0.035*"årene"

In [25]:
# Prepare the pyLDAvis visualisation data from the model, corpus and dictionary.
prep = genvis.prepare(lda_model, corpus, id2word)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [26]:
# Write the visualisation to result.html (a relative path, so it lands in the
# current working directory).
pyLDAvis.save_html(prep, "result.html")

In [27]:
# Turn on pyLDAvis notebook mode before displaying the result in the next cell.
pyLDAvis.enable_notebook()

In [28]:
# Show the prepared topic visualisation as this cell's output.
pyLDAvis.display(prep)