In [1]:
import spacy
from spacy.cli.download import download
download(model="en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
from spacy.lang.en import English
parser = English()

In [4]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [5]:
!python3 -m spacy download en_core_web_sm

C:\Users\vaibh\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe: No module named spacy


In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [8]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [10]:
import random
text_data = []
with open('C:\\Users\\vaibh\\OneDrive\\Documents\\dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['anthropocentric', 'system']
['friend', 'close', 'incorporate', 'trust', 'social', 'network', 'base', 'sybil', 'defense']
['automate', 'semantic', 'services', 'orchestration', 'concept', 'covering']
['energy', 'function', 'base', 'neural', 'network', 'transient', 'stability', 'enhancement', 'network', 'preserve', 'power', 'system']
['bound', 'power', 'function', 'wireless', 'network']
['hyperqueries', 'dynamic', 'distribute', 'query', 'processing', 'internet']
['memory', 'efficient', 'layer', 'decoder', 'design', 'early', 'termination', 'code']
['single', 'miller', 'compensation', 'using', 'invert', 'current', 'buffer', 'multi', 'stage', 'amplifier']
['video', 'suggestion', 'discovery', 'youtube', 'taking', 'random', 'walk', 'graph']
['factor', 'formatting', 'perceptional', 'impression', 'cyber', 'space', 'cross', 'cultural', 'study', 'korean', 'american', 'user']
['tunable', 'aware', 'network', 'survivability']
['adaptive', 'neuron', 'circuit', 'signal', 'compression']
['step', 'hybr

In [11]:
from gensim import corpora

# Create a dictionary and corpus for topic modeling with Gensim
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(doc) for doc in text_data]

In [12]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
# ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.062*"network" + 0.038*"base" + 0.038*"system" + 0.026*"function"')
(1, '0.022*"preserve" + 0.022*"foam" + 0.022*"trouble" + 0.022*"circulation"')
(2, '0.034*"code" + 0.019*"multi" + 0.019*"compensation" + 0.019*"amplifier"')
(3, '0.035*"base" + 0.019*"social" + 0.019*"query" + 0.019*"close"')
(4, '0.008*"anthropocentric" + 0.008*"network" + 0.008*"synapse" + 0.008*"biomimetic"')


In [14]:
from gensim.models.ldamodel import LdaModel
from pprint import pprint

# Build the LDA (Latent Dirichlet Allocation) model
lda_model = LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15)

for i, doc in enumerate(text_data):
    bow = dictionary.doc2bow(doc)
    print(f"Document {i+1} belongs to topic: {lda_model.get_document_topics(bow)}")

Document 1 belongs to topic: [(0, 0.7694471), (1, 0.115524076), (2, 0.1150288)]
Document 2 belongs to topic: [(0, 0.03384763), (1, 0.9317049), (2, 0.034447525)]
Document 3 belongs to topic: [(0, 0.90417093), (1, 0.047929566), (2, 0.047899533)]
Document 4 belongs to topic: [(0, 0.027547581), (1, 0.030740645), (2, 0.9417118)]
Document 5 belongs to topic: [(0, 0.057945102), (1, 0.8821757), (2, 0.059879184)]
Document 6 belongs to topic: [(0, 0.049193848), (1, 0.9028901), (2, 0.04791604)]
Document 7 belongs to topic: [(0, 0.9249249), (1, 0.037276186), (2, 0.037798904)]
Document 8 belongs to topic: [(0, 0.030954663), (1, 0.9385659), (2, 0.03047947)]
Document 9 belongs to topic: [(0, 0.037336), (1, 0.03731975), (2, 0.9253443)]
Document 10 belongs to topic: [(0, 0.025836233), (1, 0.025826203), (2, 0.9483376)]
Document 11 belongs to topic: [(0, 0.06744495), (1, 0.8647494), (2, 0.06780567)]
Document 12 belongs to topic: [(0, 0.88819534), (1, 0.055919945), (2, 0.055884734)]
Document 13 belongs to