# Topic modeling—LDA

In [2]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
from gensim.models.ldamodel import LdaModel
import gensim.corpora as corpora
from pprint import pprint
from gensim.corpora import MmCorpus

In [7]:
# NLTK's English stop-word list plus the extra token "said".
stop_words = stopwords.words('english') + ["said"]

# Read the BBC text dataset and preview its first rows.
bbc_df = pd.read_csv("../data/bbc-text.csv")
bbc_df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [8]:
def clean_text(input_string):
    """Turn one raw document string into a list of cleaned tokens.

    Every character that is neither a word character nor whitespace is
    replaced by a space, digits are deleted, the result is tokenized with
    gensim's ``simple_preprocess``, and tokens present in the module-level
    ``stop_words`` collection are dropped.

    Args:
        input_string: Raw text of a single document.

    Returns:
        The surviving tokens as a list, in their original order.
    """
    without_punct = re.sub(r'[^\w\s]', ' ', input_string)  # line 1
    without_digits = re.sub(r'\d', '', without_punct)  # line 2
    kept_tokens = []
    for token in simple_preprocess(without_digits):
        if token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens
    

In [None]:
# Simplified apply compared to the book.
# Tokenize every document with clean_text. Rows that already hold a token
# list are passed through unchanged, so re-running this cell does not fail:
# clean_text feeds its argument to re.sub, which only accepts strings.
bbc_df['text'] = bbc_df['text'].apply(
    lambda doc: doc if isinstance(doc, list) else clean_text(doc)
)
bbc_df.head()

Unnamed: 0,category,text
0,tech,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,"[worldcom, boss, left, books, alone, former, w..."
2,sport,"[tigers, wary, farrell, gamble, leicester, say..."
3,sport,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,"[ocean, twelve, raids, box, office, ocean, twe..."


In [None]:
# Build a gensim Dictionary (word -> integer ID) from the tokenized documents,
# then convert each document to its bag-of-words representation.
texts = bbc_df['text'].values  # the text column as a numpy array of token lists
id_dict = corpora.Dictionary(texts)
corpus = list(map(id_dict.doc2bow, texts))
corpus[0][:5]  # first document: first five (word ID, count) pairs

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 2)]

In [17]:
# Train the LDA topic model on the bag-of-words corpus.
num_topics = 5
lda_model = LdaModel(
    corpus=corpus,
    id2word=id_dict,
    num_topics=num_topics,
    passes=20,        # full passes over the corpus during training
    chunksize=100,    # documents per training chunk
    random_state=42,  # fixed seed so repeated runs give the same topics
)

In [21]:
# Save the model
def save_model(lda, lda_path, id_dict, dict_path, corpus, corpus_path):
    """Persist the LDA model, its dictionary and the BOW corpus to disk.

    Missing parent directories of the three target paths are created first,
    so the call also works when the output folder does not exist yet.

    Args:
        lda: Trained model object exposing ``save(path)``.
        lda_path: Destination file for the model.
        id_dict: Dictionary object exposing ``save(path)``.
        dict_path: Destination file for the dictionary.
        corpus: Bag-of-words corpus to serialize.
        corpus_path: Destination file for the corpus, written via
            ``MmCorpus.serialize``.
    """
    import os  # local import keeps this cell self-contained

    for target in (lda_path, dict_path, corpus_path):
        parent = os.path.dirname(target)
        if parent:  # a bare filename has no directory to create
            os.makedirs(parent, exist_ok=True)

    lda.save(lda_path)
    id_dict.save(dict_path)
    MmCorpus.serialize(corpus_path, corpus)
    
# Output locations for the model, the dictionary and the serialized corpus.
model_path, dict_path, corpus_path = (
    "../models/bbc_gensim/lda.model",
    "../models/bbc_gensim/id2word.dict",
    "../models/bbc_gensim/corpus.mm",
)
save_model(
    lda=lda_model,
    lda_path=model_path,
    id_dict=id_dict,
    dict_path=dict_path,
    corpus=corpus,
    corpus_path=corpus_path,
)

In [22]:
# Restore the saved dictionary and model from disk (the corpus is not reloaded here).
id_dict = corpora.Dictionary.load(dict_path)
lda_model = LdaModel.load(model_path)

In [23]:
# Sample news text about a football match; the next cell cleans it with
# clean_text and queries the LDA model with it.
new_example = """Manchester United players slumped to the turf 
at full-time in Germany on Tuesday in acknowledgement of what their 
latest pedestrian first-half display had cost them. The 3-2 loss at 
RB Leipzig means United will not be one of the 16 teams in the draw 
for the knockout stages of the Champions League. And this is not the 
only price for failure. The damage will be felt in the accounts, in 
the dealings they have with current and potentially future players 
and in the faith the fans have placed in manager Ole Gunnar Solskjaer. 
With Paul Pogba's agent angling for a move for his client and ex-United 
defender Phil Neville speaking of a "witchhunt" against his former team-mate 
Solskjaer, BBC Sport looks at the ramifications and reaction to a big loss for United."""

In [25]:
# Apply the same cleaning used on the training documents, map the tokens to
# bag-of-words with the loaded dictionary, then ask the model for the topic mix.
input_list = clean_text(new_example)
bow = id_dict.doc2bow(input_list)
topics = lda_model[bow]  # list of (topic ID, probability) pairs
print(topics)

[(0, np.float32(0.08144318)), (2, np.float32(0.16211627)), (3, np.float32(0.7502739))]
