# Implementing LDA in Python

## Introduction

## Importing the Required Libraries

In [1]:
import pandas as pd
from nltk.corpus import stopwords
import string
import gensim.corpora as corpora
from gensim.models import LdaMulticore

In [2]:
df = pd.read_csv("../data/trc.csv")
df = df[["Last", "First", "Description"]]
df

Unnamed: 0,Last,First,Description
0,AARON,Thabo Simon,An ANCYL member who was shot and severely inju...
1,ABBOTT,Montaigne,A member of the SADF who was severely injured ...
2,ABRAHAM,Nzaliseko Christopher,A COSAS supporter who was kicked and beaten wi...
3,ABRAHAMS,Achmat Fardiel,Was shot and blinded in one eye by members of ...
4,ABRAHAMS,Annalene Mildred,Was shot and injured by members of the SAP in ...
...,...,...,...
20829,XUZA,Mandla,Was severely injured when he was stoned by a f...
20830,YAKA,Mbangomuni,An IFP supporter and acting induna who was sho...
20831,YALI,Khayalethu,"Was shot by members of the SAP in Lingelihle, ..."
20832,YALO,Bikiwe,An IFP supporter whose house and possessions w...


In [3]:
docs = df.Description.tolist()
docs[:1]

["An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor."]

In [4]:
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
def clean_doc(doc):
    no_punct = ''
    for c in doc:
        if c not in string.punctuation:
            no_punct = no_punct+c
    # with list comprehension
    # no_punct = ''.join([c for c in doc if c not in string.punctuation])
    
    words = no_punct.lower().split()
    
    final_words = []
    for word in words:
        if word not in stop_words:
            final_words.append(word)
    
    # with list comprehension
    # final_words = [word for word in words if word not in stop_words]

    return final_words
cleaned = clean_doc(docs[0])
print(docs[0])
print(cleaned)

An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor.
['ancyl', 'member', 'shot', 'severely', 'injured', 'sap', 'members', 'lephoi', 'bethulie', 'orange', 'free', 'state', 'ofs', '17', 'april', '1991', 'police', 'opened', 'fire', 'gathering', 'anc', 'supporters', 'house', 'following', 'dispute', 'two', 'neighbours', 'one', 'linked', 'anc', 'sap', 'councillor']


In [6]:
cleaned_docs = [clean_doc(doc) for doc in docs]
print(cleaned_docs[:3])

[['ancyl', 'member', 'shot', 'severely', 'injured', 'sap', 'members', 'lephoi', 'bethulie', 'orange', 'free', 'state', 'ofs', '17', 'april', '1991', 'police', 'opened', 'fire', 'gathering', 'anc', 'supporters', 'house', 'following', 'dispute', 'two', 'neighbours', 'one', 'linked', 'anc', 'sap', 'councillor'], ['member', 'sadf', 'severely', 'injured', 'landmine', 'explosion', 'messina', 'transvaal', '5', 'may', '1987'], ['cosas', 'supporter', 'kicked', 'beaten', 'batons', 'riflebutts', 'members', 'ciskei', 'police', 'protests', 'ciskei', 'government', 'zwelitsha', 'mdantsane', 'ciskei', 'september', '1985']]


## Create ID-Word Index

In [7]:
id2word = corpora.Dictionary(cleaned_docs)

In [8]:
id2word[250]

'bmw'

In [9]:
id_docs = [id2word.doc2bow(cleaned_doc) for cleaned_doc in cleaned_docs]

In [22]:
print(id_docs[0])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]


In [11]:
cleaned_docs[0]

['ancyl',
 'member',
 'shot',
 'severely',
 'injured',
 'sap',
 'members',
 'lephoi',
 'bethulie',
 'orange',
 'free',
 'state',
 'ofs',
 '17',
 'april',
 '1991',
 'police',
 'opened',
 'fire',
 'gathering',
 'anc',
 'supporters',
 'house',
 'following',
 'dispute',
 'two',
 'neighbours',
 'one',
 'linked',
 'anc',
 'sap',
 'councillor']

In [24]:
for num in id_docs[0]:
    num = num[0]
    print(f"{num}\t{id2word[num]}")

0	17
1	1991
2	anc
3	ancyl
4	april
5	bethulie
6	councillor
7	dispute
8	fire
9	following
10	free
11	gathering
12	house
13	injured
14	lephoi
15	linked
16	member
17	members
18	neighbours
19	ofs
20	one
21	opened
22	orange
23	police
24	sap
25	severely
26	shot
27	state
28	supporters
29	two


## Creating LDA Topic Model

In [13]:
topic_nums = 100
lda_model = LdaMulticore(corpus=id_docs, id2word=id2word, num_topics=topic_nums)

In [14]:
lda_model.print_topics(1)

[(35,
  '0.032*"supporters" + 0.029*"anc" + 0.025*"ifp" + 0.019*"supporter" + 0.019*"shot" + 0.017*"near" + 0.016*"natal" + 0.016*"burnt" + 0.015*"conflict" + 0.014*"home"')]

## Analyze a Document

In [15]:
topics = lda_model.get_document_topics(id_docs)
topics[0]

[(27, 0.5007427), (97, 0.46954644)]

In [16]:
print(docs[0])

An ANCYL member who was shot and severely injured by SAP members at Lephoi, Bethulie, Orange Free State (OFS) on 17 April 1991. Police opened fire on a gathering at an ANC supporter's house following a dispute between two neighbours, one of whom was linked to the ANC and the other to the SAP and a councillor.


In [17]:
for topic in topics[0]:
    terms = lda_model.get_topic_terms(topic[0], 10)
    print(topic)
    for num in terms:
        num = num[0]
        print(num, id2word[num])
    print()

(27, 0.595477)
17 members
26 shot
2 anc
136 amnesty
49 supporter
28 supporters
13 injured
94 attack
24 sap
23 police

(56, 0.277554)
26 shot
2 anc
17 members
23 police
28 supporters
24 sap
347 ifp
4 april
100 killed
121 near

(65, 0.09756228)
2 anc
17 members
145 mk
13 injured
94 attack
28 supporters
136 amnesty
49 supporter
148 operatives
143 granted



## Analyze the Topic Model

In [18]:
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

  from imp import reload


In [19]:
vis = pyLDAvis.gensim_models.prepare(lda_model, id_docs, id2word, mds="mmds", R=30)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [20]:
# vis