## Topic Modelling

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# Importing Gensim
import gensim
from gensim import corpora

In [2]:
articles = pd.read_csv("risk_articles.csv", index_col=0)

In [3]:
articles.head()

Unnamed: 0_level_0,Companies,Search Query,url,Text
Risk Terms,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Layoffs,Twitter,Layoffs Twitter,https://fortune.com/2022/11/18/twitter-former-...,"When Twitter’s new owner, Elon Musk, decided t..."
Layoffs,Meta,Layoffs Meta,https://about.fb.com/news/2022/11/mark-zuckerb...,Mark Zuckerberg just shared the following with...
Layoffs,Amazon,Layoffs Amazon,https://economictimes.indiatimes.com/news/inte...,The mass layoffs that began in Amazon 's corpo...
Sexual Harrassment,Twitter,Sexual Harrassment Twitter,https://twitter.com/hashtag/sexualharassment,JavaScript is not available.\n\nWe’ve detected...
Sexual Harrassment,Meta,Sexual Harrassment Meta,https://nypost.com/2022/05/27/women-are-being-...,Disturbing accounts of women being sexually as...


In [7]:
articles.dtypes

Companies       object
Search Query    object
url             object
Text            object
dtype: object

In [9]:
corpus = list(articles["Text"].astype(str))
############## Pre-proc
# stop loss words
stop = set(stopwords.words("english"))

# punctuation
exclude = set(string.punctuation)

# lemmatization
lemma = WordNetLemmatizer()

# One function for all the steps:
def clean(doc):

    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])

    # remove any stop words present
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)

    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized


# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]

dict_ = corpora.Dictionary(clean_corpus)
doc_term_matrix = [dict_.doc2bow(i) for i in clean_corpus]


In [10]:
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(
    doc_term_matrix,
    num_topics=10,
    id2word=dict_,
    passes=1,
    random_state=0,
    eval_every=None,
)
ldamodel.print_topics()
print(ldamodel.print_topics(num_topics=10, num_words=5))


[(0, '0.004*"people" + 0.003*"we’re" + 0.003*"meta" + 0.003*"employee" + 0.003*"business"'), (1, '0.009*"people" + 0.007*"company" + 0.007*"employee" + 0.007*"meta" + 0.006*"we’re"'), (2, '0.007*"meta" + 0.007*"company" + 0.007*"people" + 0.006*"another" + 0.005*"metaverse"'), (3, '0.008*"people" + 0.007*"we’re" + 0.006*"metaverse" + 0.005*"i’m" + 0.005*"meta"'), (4, '0.007*"people" + 0.006*"employee" + 0.005*"expense" + 0.005*"company" + 0.004*"work"'), (5, '0.012*"company" + 0.011*"people" + 0.011*"employee" + 0.010*"expense" + 0.006*"work"'), (6, '0.017*"company" + 0.011*"people" + 0.011*"employee" + 0.010*"expense" + 0.008*"said"'), (7, '0.006*"people" + 0.006*"employee" + 0.006*"meta" + 0.006*"said" + 0.006*"company"'), (8, '0.012*"fraud" + 0.010*"company" + 0.008*"year" + 0.008*"employee" + 0.007*"people"'), (9, '0.017*"javascript" + 0.017*"browser" + 0.012*"supported" + 0.008*"help" + 0.007*"we’ve"')]


In [11]:
count = 0
for i in ldamodel[doc_term_matrix]:
    print("doc : ", count, i)
    count += 1


doc :  0 [(5, 0.9977823)]
doc :  1 [(3, 0.97542244), (5, 0.023514185)]
doc :  2 [(6, 0.9954065)]
doc :  3 [(9, 0.9608687)]
doc :  4 [(2, 0.9985517)]
doc :  5 [(0, 0.050001524), (1, 0.050001524), (2, 0.050001524), (3, 0.050001524), (4, 0.050001524), (5, 0.050001524), (6, 0.050001524), (7, 0.050001524), (8, 0.5499863), (9, 0.050001524)]
doc :  6 [(9, 0.9608687)]
doc :  7 [(0, 0.05000152), (1, 0.05000152), (2, 0.05000152), (3, 0.05000152), (4, 0.05000152), (5, 0.05000152), (6, 0.05000152), (7, 0.05000152), (8, 0.5499863), (9, 0.05000152)]
doc :  8 [(8, 0.9799988)]
