In [None]:
# Paste your API key between the quotes.
# Any spaces picked up at the beginning or end while copying the key are
# stripped automatically, so they cannot end up in the API requests.
# NOTE: avoid sharing or committing a copy of this notebook that contains a real key
# (the print below also writes the key into the cell output).
API_KEY = '8cJHqhhoDveFe-Ej77kb'.strip()
print('Your API key is: {}'.format(API_KEY))

Your API key is: 8cJHqhhoDveFe-Ej77kb


In [None]:
# This cell just sets up some stuff that we'll need later

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import pandas as pd
from tqdm.auto import tqdm
import time
import re
from slugify import slugify
from time import strftime
from IPython.display import display, FileLink
from pathlib import Path

# Shared HTTP session that retries transient gateway errors (502/503/504)
# up to 5 times with an increasing back-off between attempts.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[ 502, 503, 504 ])
retry_adapter = HTTPAdapter(max_retries=retries)
# API_URL below is plain http, so the adapter has to be mounted for both
# schemes; mounting it for https only would leave requests made through `s`
# to API_URL without any retry policy.
s.mount('http://', retry_adapter)
s.mount('https://', retry_adapter)

API_URL = 'http://api.digitalnz.org/v3/records.json'

In [None]:
def process_articles(results):
    '''
    Flatten raw API result records into simple article dictionaries.

    For each record the title has its final parenthesised group (and anything
    after it) removed, the first publisher is used as the newspaper name, and
    the first date is cut down to its first 10 characters.

    If you're harvesting something other than Papers Past, you'd probably
    want to change the way results are processed.
    '''
    # A parenthesised group plus everything after it, as long as no further '(' follows
    trailing_parens = re.compile(r'(\([^)]*\))[^(]*$')

    def to_article(record):
        clean_title = trailing_parens.sub('', record['title']).strip()
        return {
            'id': record['id'],
            'title': clean_title,
            'newspaper': record['publisher'][0],
            'date': record['date'][0][:10],
            'text': record['fulltext'],
            'paperspast_url': record['landing_url'],
            'source_url': record['source_url']
        }

    return [to_article(record) for record in results]

def get_total(params):
    '''
    Return the number of results the API reports for this search.

    The request is made with a copy of params in which per_page is set to 0,
    so the caller's dictionary is not modified.
    '''
    count_params = {**params, 'per_page': 0}
    return get_records(count_params)['search']['result_count']
    
def get_records(params):
    '''
    Request one page of results from the API and return the decoded JSON.

    params is sent as the query string of a GET request to API_URL.
    Raises requests.HTTPError if the API answers with a 4xx/5xx status.
    '''
    # Go through the shared session `s` (rather than a one-off requests.get) so
    # that connections are reused and any retry adapter mounted on the session
    # for this URL's scheme is applied.
    response = s.get(API_URL, params=params)
    # Fail here with a clear HTTP error instead of a confusing JSON/KeyError later on
    response.raise_for_status()
    return response.json()

def harvest(params):
    '''
    Page through all the results for a search and return the processed articles.

    Sets params['page'] to 1 and increments it in place after each page, so the
    caller's dictionary is modified. Stops at the first page with no results.
    '''
    collected = []
    params['page'] = 1
    expected = get_total(params)
    with tqdm(total=expected) as progress:
        while True:
            page_results = get_records(params)['search']['results']
            if not page_results:
                # An empty page means we've gone past the last page of results
                break
            collected.extend(process_articles(page_results))
            progress.update(len(page_results))
            params['page'] += 1
            # Short pause between requests
            time.sleep(0.2)
    return collected

def start_harvest(query=None, start_year=None, end_year=None, **kwargs):
    '''
    Initiates a harvest.

    query is the search text sent to the API; if it is left as None a single
    space is sent instead, which is what this function always sent before.
    Any extra keyword arguments are added to the request as and[<name>][] filters.
    If you've specified start and end years it'll loop over them getting results for each.
    Returns a list of article dictionaries.
    '''
    params = {
        # Previously the query argument was ignored and " " was always sent
        'text': " " if query is None else query,
        'and[primary_collection][]': 'Papers Past',
        'per_page': '100',
        'api_key': API_KEY
    }
    for key, value in kwargs.items():
        params[f'and[{key}][]'] = value
    if start_year and end_year:
        articles = []
        for year in tqdm(range(start_year, end_year+1), desc='Years'):
            # The same params dict is reused for each year; harvest() resets the page number itself
            params['and[year][]'] = year
            articles += harvest(params)
    else:
        articles = harvest(params)
    return articles

def save_as_csv(articles, query_name):
    '''
    Write the articles to a CSV file in the data directory.
    The directory is created if it doesn't already exist.
    The filename combines a slugified query_name with the current date/time.
    A download link to the file is displayed once it has been written.
    '''
    output_dir = Path('data')
    output_dir.mkdir(exist_ok=True)
    slug = slugify(query_name)
    timestamp = strftime("%Y%m%d%H%M%S")
    csv_path = output_dir / f'{slug}-{timestamp}.csv'
    pd.DataFrame(articles).to_csv(csv_path, index=False)
    display(FileLink(csv_path))

In [None]:
# Harvest year by year from 1830 to 1845 (inclusive); no query text is supplied
articles = start_harvest(start_year=1830, end_year=1845)

Years:   0%|          | 0/16 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/988 [00:00<?, ?it/s]

  0%|          | 0/1277 [00:00<?, ?it/s]

  0%|          | 0/2574 [00:00<?, ?it/s]

  0%|          | 0/3493 [00:00<?, ?it/s]

  0%|          | 0/1915 [00:00<?, ?it/s]

  0%|          | 0/2318 [00:00<?, ?it/s]

In [None]:
# Load the harvested articles into a DataFrame (one row per article)
df = pd.DataFrame(articles)
# Length of the first article's text
len(df.text[0])

4476

In [None]:
# Show the last few rows of the harvested articles
df.tail()

Unnamed: 0,id,title,newspaper,date,text,paperspast_url,source_url
12592,26823325,POST SCRIPT,Wellington Independent,1845-04-05,post script the draft of a militia bill laid b...,https://paperspast.natlib.govt.nz/newspapers/W...,http://api.digitalnz.org/records/26823325/source
12593,26823327,STAVES.,Wellington Independent,1845-04-05,staves owing to the cessation of immigration t...,https://paperspast.natlib.govt.nz/newspapers/W...,http://api.digitalnz.org/records/26823327/source
12594,2676578,"New Zealand spectator, AND COOK'S STRAITS GU...",New Zealand Spectator and Cook's Strait Guardian,1845-10-11,new zealand spectator and cook straits guardia...,https://paperspast.natlib.govt.nz/newspapers/N...,http://api.digitalnz.org/records/2676578/source
12595,2646226,Narrative of Events at the Bay of Islands.,New Zealander,1845-11-19,narrative of events at the bay of islands on t...,https://paperspast.natlib.govt.nz/newspapers/N...,http://api.digitalnz.org/records/2646226/source
12596,1898425,WHALE-FISHERY.,Daily Southern Cross,1845-02-22,whale-fishery at the southern settlements the ...,https://paperspast.natlib.govt.nz/newspapers/D...,http://api.digitalnz.org/records/1898425/source


In [None]:
# Print the text column (pandas abbreviates the middle of long output)
print(df["text"])

0        agency rphe subscribers haying formed j a co-p...
1        settlers to new zealand are respectfully infor...
2        c fltartin and co beg to inform emi jltjl gran...
3        new zealand published this day in bvo price 3s...
4        improved family paper for in formation on the ...
                               ...                        
12592    post script the draft of a militia bill laid b...
12593    staves owing to the cessation of immigration t...
12594    new zealand spectator and cook straits guardia...
12595    narrative of events at the bay of islands on t...
12596    whale-fishery at the southern settlements the ...
Name: text, Length: 12597, dtype: object


In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install pyLDAvis

Collecting spacy
  Using cached spacy-3.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
Collecting thinc<8.1.0,>=8.0.9
  Using cached thinc-8.0.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (623 kB)
Collecting pathy>=0.3.5
  Using cached pathy-0.6.0-py3-none-any.whl (42 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Using cached pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
Collecting catalogue<2.1.0,>=2.0.6
  Using cached catalogue-2.0.6-py3-none-any.whl (17 kB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Using cached spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting typer<0.5.0,>=0.3.0
  Using cached typer-0.4.0-py3-none-any.whl (27 kB)
Collecting srsly<3.0.0,>=2.4.1
  Using cached srsly-2.4.1-cp37-cp37m-manylinux2014_x86_64.whl (456 kB)
Installing collected packages: catalogue, typer, srsly, pydantic, thinc, spacy-legacy, pathy, spacy
  Attempting uninstall: catalogue
    Found existing installation: catalogue 1.0.0
  

In [None]:
# Download NLTK's stop word lists (used by stopwords.words('english') later in the notebook)
import nltk; nltk.download('stopwords')


# Download spaCy's small English model (the leading `!` runs this line as a shell command)
!python -m spacy download en_core_web_sm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
     |████████████████████████████████| 13.6 MB 66 kB/s              
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: en-core-web-sm 2.2.5
    Uninstalling en-core-web-sm-2.2.5:
      Successfully uninstalled en-core-web-sm-2.2.5
Successfully installed en-core-web-sm-3.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
#!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

  from collections import Iterable
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [None]:
# NLTK Stop words
from nltk.corpus import stopwords
# Start from NLTK's English stop word list...
stop_words = stopwords.words('english')
# ...and add a few extra words to be filtered out as well
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [None]:
# Convert to list, making sure every item is a string
data2 = [str(item) for item in df.text.values.tolist()]

# Each cleaning step below works on the output of the previous step, so the
# effects accumulate in `data`; `data2` keeps the uncleaned strings.
# (Previously every step re-read `data2`, so only the last step had any effect.)

# Remove Emails
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data2]

# Collapse runs of whitespace (including new line characters) into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub(r"'", "", sent) for sent in data]

# Preview the first cleaned document
pprint(data[:1])

['agency rphe subscribers haying formed j a co-partnership purpose proceeding '
 'with the first colony on the 25th of august to niw zealand where they will '
 'establish themselves they take the present opportunity to offer their '
 'services to their friends and others as agents for the management of landed '
 'property the pur chase and sale of merchandise and the superin tendence of '
 'shipping and other agency business their cotrespondents in london are messrs '
 'buckle bagster and buckle daniell and riddiford london august 19 1839 colony '
 'of new zealand agency a gentleman of active business habits possessing a '
 'practical knowledge of the art of surveying and being well acquainted with '
 'the value of land is about to establish him self in the above colony and '
 'would be willing to undertake the local management of ai estate for any '
 'gentleman who has invested capital in the purchase of land there and who may '
 'require s confidential agent to superintend the choice

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each document into a list of word tokens.

    Every item is converted with str() and passed to gensim's
    simple_preprocess; deacc=True strips accent marks from the tokens.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

# Tokenize `data` (the output of the cleaning cell) rather than `data2`,
# which holds the strings from before any cleaning was applied
data_words = list(sent_to_words(data))

print(data_words[:1])

[['agency', 'rphe', 'subscribers', 'haying', 'formed', 'co', 'partnership', 'purpose', 'proceeding', 'with', 'the', 'first', 'colony', 'on', 'the', 'th', 'of', 'august', 'to', 'niw', 'zealand', 'where', 'they', 'will', 'establish', 'themselves', 'they', 'take', 'the', 'present', 'opportunity', 'to', 'offer', 'their', 'services', 'to', 'their', 'friends', 'and', 'others', 'as', 'agents', 'for', 'the', 'management', 'of', 'landed', 'property', 'the', 'pur', 'chase', 'and', 'sale', 'of', 'merchandise', 'and', 'the', 'superin', 'tendence', 'of', 'shipping', 'and', 'other', 'agency', 'business', 'their', 'cotrespondents', 'in', 'london', 'are', 'messrs', 'buckle', 'bagster', 'and', 'buckle', 'daniell', 'and', 'riddiford', 'london', 'august', 'colony', 'of', 'new', 'zealand', 'agency', 'gentleman', 'of', 'active', 'business', 'habits', 'possessing', 'practical', 'knowledge', 'of', 'the', 'art', 'of', 'surveying', 'and', 'being', 'well', 'acquainted', 'with', 'the', 'value', 'of', 'land', 'is

In [None]:
# Build the bigram and trigram models
# The bigram model is trained on the tokenized documents; the trigram model is
# trained on those documents after the bigram model has been applied to them.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example (first document, bigram model applied first, then trigram model)
print(trigram_mod[bigram_mod[data_words[0]]])



['agency', 'rphe', 'subscribers', 'haying', 'formed', 'co', 'partnership', 'purpose', 'proceeding', 'with', 'the', 'first', 'colony', 'on', 'the', 'th', 'of', 'august', 'to', 'niw', 'zealand', 'where', 'they', 'will', 'establish', 'themselves', 'they', 'take', 'the', 'present', 'opportunity', 'to', 'offer', 'their', 'services', 'to', 'their', 'friends', 'and', 'others', 'as', 'agents', 'for', 'the', 'management', 'of', 'landed', 'property', 'the', 'pur_chase', 'and', 'sale', 'of', 'merchandise', 'and', 'the', 'superin_tendence', 'of', 'shipping', 'and', 'other', 'agency', 'business', 'their', 'cotrespondents', 'in', 'london', 'are', 'messrs', 'buckle', 'bagster', 'and', 'buckle', 'daniell', 'and', 'riddiford', 'london', 'august', 'colony', 'of', 'new', 'zealand', 'agency', 'gentleman', 'of', 'active', 'business', 'habits', 'possessing', 'practical', 'knowledge', 'of', 'the', 'art', 'of', 'surveying', 'and', 'being', 'well', 'acquainted', 'with', 'the', 'value', 'of', 'land', 'is', 'abo

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return each document as a list of tokens with the stop words removed.

    Each document is converted with str() and re-tokenized by simple_preprocess
    before filtering against the module-level stop_words list.
    """
    # Build a set once per call: testing membership against the stop_words list
    # itself would be a linear scan for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the bigram phrase model to every document and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to every document."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only the allowed parts of speech.

    texts is an iterable of token lists. Each one is joined with spaces and run
    through the module-level spaCy pipeline `nlp`, which must be loaded before
    this function is called. Returns one list of lemmas per document, holding
    only tokens whose pos_ is in allowed_postags.

    See https://spacy.io/api/annotation for the part-of-speech tags.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe processes the texts as a batched stream, which is faster than
    # calling nlp() separately for every document
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [None]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
     |████████████████████████████████| 13.6 MB 67 kB/s              
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams (apply the bigram phrase model to the stop-word-free documents)
data_words_bigrams = make_bigrams(data_words_nostops)

# Load spaCy's small English model with the parser and NER components disabled;
# lemmatization() only reads each token's lemma_ and pos_
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Preview the first lemmatized document
print(data_lemmatized[:1])

[['agency', 'rphe', 'subscriber', 'hay', 'form', 'co', 'partnership', 'purpose', 'proceeding', 'first', 'establish', 'take', 'present', 'opportunity', 'offer', 'service', 'friend', 'other', 'agent', 'management', 'land', 'property', 'pur_chase', 'sale', 'merchandise', 'superin_tendence', 'shipping', 'agency', 'business', 'cotrespondent', 'gentleman', 'active', 'business', 'habit', 'possess', 'practical', 'knowledge', 'art', 'survey', 'well', 'acquaint', 'value', 'land', 'establish', 'self', 'colony', 'willing', 'undertake', 'local', 'management', 'ai', 'estate', 'gentleman', 'invest', 'capital', 'purchase', 'land', 'require', 'confidential', 'agent', 'superintend', 'choice', 'location', 'see', 'allotment', 'time', 'time', 'dispose', 'advantageous', 'manner', 'generally', 'preserve', 'right', 'interest', 'proprietor', 'unexceptionable', 'reference', 'give', 'require', 'address', 'letter', 'post', 'pay', 'none', 'principal', 'need', 'apply', 'british', 'colonial', 'export', 'company', 'l

In [None]:
# Create Dictionary (maps each distinct token to an integer id)
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: each document becomes a list of (token id, count) pairs
corpus = [id2word.doc2bow(text) for text in texts]

# View the first document's bag-of-words
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 3), (12, 3), (13, 2), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 4), (28, 1), (29, 3), (30, 1), (31, 2), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 3), (40, 1), (41, 7), (42, 1), (43, 1), (44, 5), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 2), (69, 1), (70, 3), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 2), (77, 1), (78, 1), (79, 1), (80, 1), (81, 2), (82, 2), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 4), (91, 1), (92, 1), (93, 4), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 2), (102, 1), (103, 1), (104, 2), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 2)

In [None]:
# Human-readable view of the first document: (word, count) pairs instead of (id, count)
[[(id2word[token_id], count) for token_id, count in doc_bow] for doc_bow in corpus[:1]]

[[('absence', 1),
  ('account', 1),
  ('acquaint', 1),
  ('active', 1),
  ('adapt', 1),
  ('addition', 1),
  ('address', 1),
  ('advantageous', 1),
  ('advocate', 1),
  ('affix', 1),
  ('afford', 2),
  ('agency', 3),
  ('agent', 3),
  ('agricultural', 2),
  ('ai', 1),
  ('ail', 1),
  ('allotment', 1),
  ('appli', 1),
  ('application', 1),
  ('apply', 1),
  ('art', 1),
  ('article', 1),
  ('assortment', 1),
  ('attention', 1),
  ('australia', 1),
  ('become', 1),
  ('beg', 1),
  ('british', 4),
  ('busi_ness', 1),
  ('business', 3),
  ('call', 1),
  ('capital', 2),
  ('carefully', 1),
  ('catalogue', 1),
  ('cation', 1),
  ('cause', 1),
  ('choice', 1),
  ('climate', 1),
  ('co', 2),
  ('colonial', 3),
  ('colonization', 1),
  ('colony', 7),
  ('commercial', 1),
  ('commission', 1),
  ('company', 5),
  ('confidential', 1),
  ('contract', 1),
  ('cotrespondent', 1),
  ('cottage', 1),
  ('country', 1),
  ('crown', 1),
  ('day', 1),
  ('department', 1),
  ('description', 1),
  ('dispose', 

In [None]:
# Train a 20-topic LDA model on the bag-of-words corpus.
# random_state is fixed so that repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the top keywords of the topics (the model was built with num_topics=20)
pprint(lda_model.print_topics())
# Apply the trained model to the corpus
doc_lda = lda_model[corpus]

[(0,
  '0.012*"gun" + 0.011*"heavy" + 0.010*"colonial_secretary" + 0.009*"news" + '
  '0.009*"urge" + 0.008*"tax" + 0.008*"thank" + 0.007*"mail" + 0.007*"ore" + '
  '0.007*"night"'),
 (1,
  '0.040*"bill" + 0.034*"payment" + 0.031*"say" + 0.026*"issue" + 0.016*"sum" '
  '+ 0.016*"notice" + 0.015*"pay" + 0.015*"deed" + 0.015*"amount" + '
  '0.014*"person"'),
 (2,
  '0.024*"distance" + 0.022*"iron" + 0.013*"firm" + 0.012*"dress" + '
  '0.010*"black" + 0.010*"mill" + 0.009*"cloth" + 0.008*"box" + 0.007*"wool" + '
  '0.007*"bar"'),
 (3,
  '0.011*"vessel" + 0.008*"day" + 0.008*"take" + 0.008*"place" + 0.008*"leave" '
  '+ 0.007*"fire" + 0.007*"ship" + 0.007*"native" + 0.007*"water" + '
  '0.007*"captain"'),
 (4,
  '0.089*"land" + 0.064*"acre" + 0.048*"company" + 0.035*"town" + 0.021*"sale" '
  '+ 0.021*"road" + 0.020*"country" + 0.019*"plan" + 0.018*"district" + '
  '0.017*"section"'),
 (5,
  '0.011*"year" + 0.010*"make" + 0.008*"receive" + 0.007*"take" + 0.007*"last" '
  '+ 0.007*"state" + 

In [None]:
# Compute Perplexity
# NOTE: log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself. For the bound, higher (closer to 0) is better; perplexity is 2**(-bound),
# and for that value lower is better.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # per-word likelihood bound: higher (closer to 0) is better

# Compute Coherence Score (c_v measure, using the lemmatized documents)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -9.100933267757853

Coherence Score:  0.4910006363718223


In [None]:
# Install a JDK and point JAVA_HOME at it.
# NOTE(review): this installs OpenJDK 8 and sets JAVA_HOME to the Java 8 path, but the
# recorded output of `java -version` reports OpenJDK 11 — confirm which JVM is actually used.
import os       #importing os to set environment variable
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

openjdk version "11.0.11" 2021-04-20
OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.18.04)
OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.18.04, mixed mode, sharing)


In [None]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

--2021-10-14 09:22:43--  http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
Resolving mallet.cs.umass.edu (mallet.cs.umass.edu)... 128.119.246.70
Connecting to mallet.cs.umass.edu (mallet.cs.umass.edu)|128.119.246.70|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16184794 (15M) [application/zip]
Saving to: ‘mallet-2.0.8.zip’


2021-10-14 09:22:45 (9.15 MB/s) - ‘mallet-2.0.8.zip’ saved [16184794/16184794]

Archive:  mallet-2.0.8.zip
   creating: mallet-2.0.8/
   creating: mallet-2.0.8/bin/
  inflating: mallet-2.0.8/bin/classifier2info  
  inflating: mallet-2.0.8/bin/csv2classify  
  inflating: mallet-2.0.8/bin/csv2vectors  
  inflating: mallet-2.0.8/bin/mallet  
  inflating: mallet-2.0.8/bin/mallet.bat  
  inflating: mallet-2.0.8/bin/mallethon  
  inflating: mallet-2.0.8/bin/prepend-license.sh  
  inflating: mallet-2.0.8/bin/svmlight2vectors  
  inflating: mallet-2.0.8/bin/text2classify  
  inflating: mallet-2.0.8/bin/text2vectors  
  inflating: mallet-2.0.8/bin/

In [None]:
# Location of the MALLET launcher script; this assumes the mallet-2.0.8 archive
# was unzipped in /content
mallet_path = '/content/mallet-2.0.8/bin/mallet' # update this path
# Train a 20-topic model through gensim's MALLET wrapper.
# NOTE(review): gensim.models.wrappers was removed in gensim 4.0, so this call
# needs a gensim 3.x install — confirm the version in the environment.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [None]:
# Show Topics (formatted=False gives (word, weight) pairs instead of a formatted string)
pprint(ldamallet.show_topics(formatted=False))


# Compute Coherence Score (same c_v measure and inputs as used for the gensim LDA model)
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(18,
  [('lie', 0.011776222970832974),
   ('tin', 0.009184878062825717),
   ('light', 0.008249114623823097),
   ('week', 0.0077164492816216065),
   ('tor', 0.005960093288416689),
   ('effect', 0.005312257061414874),
   ('ill', 0.005038726210014109),
   ('deponent', 0.005024329849414068),
   ('follow', 0.004923555325213786),
   ('case', 0.004837177161613544)]),
 (10,
  [('native', 0.01934341212696502),
   ('man', 0.015552288565276781),
   ('fire', 0.010001620632972935),
   ('captain', 0.009145000347278495),
   ('leave', 0.008774569953464682),
   ('place', 0.008010557266223694),
   ('officer', 0.00790637371796356),
   ('arrive', 0.007622762947699859),
   ('chief', 0.007518579399439724),
   ('make', 0.0072754844534994095)]),
 (17,
  [('paper', 0.05755001370238422),
   ('company', 0.025563167991230474),
   ('office', 0.021682652781583996),
   ('book', 0.017506166072896685),
   ('meat', 0.01360372704850644),
   ('sale', 0.013165250753631131),
   ('receive', 0.010150726226363387),
   ('orde