<a href="https://colab.research.google.com/github/thanusree02/Natural-Language-Processing/blob/main/NLP_LAB_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LDA WITH ASSIGNMENT EXAMPLE DATA

In [None]:
# Install necessary libraries
!pip install gensim
!pip install nltk



In [None]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
import numpy as np

# Download necessary NLTK data (run this once)
import nltk
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('corpora/omw-1.4')
except LookupError:
    nltk.download('omw-1.4')


# Assuming your data is in a DataFrame named 'df' and the text column is 'text_column'
# If your DataFrame or text column has a different name, please modify the line below.
# For example, if your file is 'data.csv' and the text column is 'description':
# df = pd.read_csv('data.csv')
# documents = df['description']

df = pd.read_csv('/content/data.csv') # Assuming data.csv is your file

# Explicitly set the text column to 'text'
text_column_name = 'text'

# Ensure the column exists and is of string type
if text_column_name not in df.columns:
    raise ValueError(f"Column '{text_column_name}' not found in the DataFrame. Please specify the correct text column.")

documents = df[text_column_name].astype(str)

print("First 5 documents after loading:")
for i, doc in enumerate(documents.head()):
    print(f"{i+1}. {doc}")


First 5 documents after loading:
1. virat scored century in match
2. BJP won in elections
3. bumara look 5 wickets in a match
4. congress from state government


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Text Preprocessing

We need to clean and prepare the text data for LDA. This usually involves:

1.  **Lemmatization**: Reducing words to their base form (e.g., 'running' -> 'run').
2.  **Tokenization**: Splitting text into individual words.
3.  **Stop word removal**: Removing common words that don't add much meaning (e.g., 'the', 'is', 'a').
4.  **Short word removal**: Removing words that are too short (e.g., 2 characters or less).



In [None]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) >= 3: # Changed from > 3 to >= 3
            result.append(lemmatize_stemming(token))
    return result

# Apply preprocessing to your documents
processed_docs = documents.apply(preprocess)

print("First 5 preprocessed documents:")
for i, doc in enumerate(processed_docs.head()):
    print(f"{i+1}. {doc}")


First 5 preprocessed documents:
1. ['virat', 'score', 'centuri', 'match']
2. ['bjp', 'win', 'elect']
3. ['bumara', 'look', 'wicket', 'match']
4. ['congress', 'state', 'govern']


### Creating a Dictionary and Corpus

Next, we'll create a dictionary mapping each word to an ID and then convert the preprocessed documents into a bag-of-words corpus.

In [None]:
dictionary = gensim.corpora.Dictionary(processed_docs)

# Filter out words that appear in less than 15 documents or more than 0.5 fraction of the corpus (minimum number of documents)
# This filtering is too aggressive for small datasets, resulting in an empty dictionary.
# For this small dataset, we will comment it out to ensure words are not discarded.
# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Create the Bag-of-Words corpus
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

print("Example of a processed document (bag-of-words representation):")
print(corpus[0])
print("\nMapping the first few word IDs to words:")
for i, (word_id, freq) in enumerate(corpus[0]):
    if i >= 5: # Print only first 5 for brevity
        break
    print(f"Word ID {word_id}: {dictionary[word_id]} (Frequency: {freq})")


Example of a processed document (bag-of-words representation):
[(0, 1), (1, 1), (2, 1), (3, 1)]

Mapping the first few word IDs to words:
Word ID 0: centuri (Frequency: 1)
Word ID 1: match (Frequency: 1)
Word ID 2: score (Frequency: 1)
Word ID 3: virat (Frequency: 1)


### Training the LDA Model

Now we can train the Latent Dirichlet Allocation (LDA) model using the `gensim` library. You can adjust the number of topics (`num_topics`).

In [None]:
# Set the number of topics you want to extract
num_topics = 10

# Train the LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=100,
    chunksize=100,
    passes=10,
    per_word_topics=True
)

print("LDA model training complete.")


LDA model training complete.


### Interpreting the LDA Results

Let's print the topics and their associated keywords to understand what each topic represents.

In [None]:
print("\nTopics and their keywords:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic: {idx} \nWords: {topic}\n")

# You can also get the dominant topic for each document
def format_topics_sentences(ldamodel, corpus, texts):
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = pd.concat([sent_topics_df, pd.DataFrame([{'Dominant_Topic': int(topic_num),
                                                         'Perc_Contribution': round(prop_topic,4),
                                                         'Topic_Keywords': topic_keywords}])], ignore_index=True)
            else:
                break
    return sent_topics_df

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=processed_docs)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords']

# Add original text to the dataframe
df_dominant_topic['Original_Text'] = documents

print("\nDominant topic for first 5 documents:")
display(df_dominant_topic.head())



Topics and their keywords:
Topic: 0 
Words: 0.077*"match" + 0.077*"win" + 0.077*"state" + 0.077*"elect" + 0.077*"govern" + 0.077*"congress" + 0.077*"score" + 0.077*"bumara" + 0.077*"bjp" + 0.077*"virat"

Topic: 1 
Words: 0.077*"govern" + 0.077*"match" + 0.077*"win" + 0.077*"bumara" + 0.077*"score" + 0.077*"elect" + 0.077*"congress" + 0.077*"centuri" + 0.077*"bjp" + 0.077*"wicket"

Topic: 2 
Words: 0.077*"match" + 0.077*"govern" + 0.077*"win" + 0.077*"elect" + 0.077*"congress" + 0.077*"bjp" + 0.077*"wicket" + 0.077*"score" + 0.077*"bumara" + 0.077*"virat"

Topic: 3 
Words: 0.208*"centuri" + 0.208*"match" + 0.208*"virat" + 0.208*"score" + 0.019*"govern" + 0.019*"win" + 0.019*"bumara" + 0.019*"elect" + 0.019*"state" + 0.019*"congress"

Topic: 4 
Words: 0.077*"match" + 0.077*"win" + 0.077*"state" + 0.077*"elect" + 0.077*"govern" + 0.077*"congress" + 0.077*"bumara" + 0.077*"bjp" + 0.077*"centuri" + 0.077*"wicket"

Topic: 5 
Words: 0.077*"win" + 0.077*"score" + 0.077*"elect" + 0.077*"match"

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Original_Text
0,0,3,0.82,"centuri, match, virat, score, govern, win, bum...",virat scored century in match
1,1,8,0.775,"bjp, elect, state, congress, win, govern, matc...",BJP won in elections
2,2,9,0.82,"look, match, wicket, bumara, govern, elect, wi...",bumara look 5 wickets in a match
3,3,8,0.775,"bjp, elect, state, congress, win, govern, matc...",congress from state government


NMF FOR LAB ASSIGNMENT WITH SAMPLE DATA SET

Import libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Load CSV (Prepare corpus)

In [None]:
df = pd.read_csv("/content/data.csv")

print(df.head())
print("Columns:", df.columns)


   id                              text
0   1     virat scored century in match
1   2              BJP won in elections
2   3  bumara look 5 wickets in a match
3   4    congress from state government
Columns: Index(['id', 'text'], dtype='object')


Select text column

In [None]:
text_col = df.columns[0]
documents = df[text_col].fillna("").astype(str).tolist()

print(documents[:3])


['1', '2', '3']


Text preprocessing

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(doc):
    doc = doc.lower()
    doc = re.sub(r'[^a-zA-Z ]', ' ', doc)
    tokens = doc.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

clean_docs = [clean_text(doc) for doc in documents]

print(clean_docs[:3])


['', '', '']


Reduce size (fast run)

In [None]:
clean_docs_small = clean_docs[:5000]


TF-IDF matrix (BoW alternative for NMF)

In [None]:
# Check documents before TF-IDF
print("Total docs:", len(clean_docs_small))

# Show first few docs
for i, doc in enumerate(clean_docs_small[:5]):
    print(i, "->", repr(doc))

# Remove empty/blank docs safely
clean_docs_small = [doc for doc in clean_docs_small if isinstance(doc, str) and doc.strip()]

print("Docs after cleaning:", len(clean_docs_small))

# If still empty → stop
if len(clean_docs_small) == 0:
    raise ValueError("No valid text found after cleaning!")

# Now run TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b')  # allow all words
X = vectorizer.fit_transform(clean_docs_small)

print("Matrix shape:", X.shape)



Total docs: 4
0 -> 'virat scored century match'
1 -> 'bjp election'
2 -> 'bumara look wicket match'
3 -> 'congress state government'
Docs after cleaning: 4
Matrix shape: (4, 12)


Apply NMF + Show Topics

In [None]:
from sklearn.decomposition import NMF

# Apply NMF
nmf = NMF(n_components=5, random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

print("NMF completed!")


NMF completed!


Show top words per topic

In [None]:
words = vectorizer.get_feature_names_out()

for i, topic in enumerate(H):
    print(f"\nTopic {i+1}:")
    print([words[j] for j in topic.argsort()[-10:]])



Topic 1:
['century', 'election', 'match', 'look', 'virat', 'scored', 'wicket', 'congress', 'government', 'state']

Topic 2:
['congress', 'election', 'government', 'scored', 'state', 'virat', 'bumara', 'wicket', 'look', 'match']

Topic 3:
['congress', 'government', 'look', 'wicket', 'state', 'scored', 'match', 'virat', 'election', 'bjp']

Topic 4:
['congress', 'election', 'look', 'government', 'wicket', 'state', 'match', 'virat', 'century', 'scored']

Topic 5:
['congress', 'election', 'government', 'virat', 'state', 'scored', 'match', 'look', 'wicket', 'bumara']


Assign dominant topic to each document

In [None]:
df_small = df.iloc[:len(clean_docs_small)].copy()
df_small['Dominant_Topic'] = W.argmax(axis=1)

df_small.head()


Unnamed: 0,titles,summaries,terms,Dominant_Topic
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']",3
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']",2
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']",4
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV'],0


LDA FOR LAB ASSIGNMENT WITH KAGGLE DATA SET

Import libraries

In [None]:
# Step 1: Import libraries
import pandas as pd
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 2: Load CSV file
df = pd.read_csv("/content/arxiv_data.csv")

# IMPORTANT: change column name if needed
documents = df.iloc[:,0].astype(str).tolist()   # takes first column as text

print("Sample documents:", documents[:5])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sample documents: ['Survey on Semantic Stereo Matching / Semantic Depth Estimation', 'FUTURE-AI: Guiding Principles and Consensus Recommendations for Trustworthy Artificial Intelligence in Future Medical Imaging', 'Enforcing Mutual Consistency of Hard Regions for Semi-supervised Medical Image Segmentation', 'Parameter Decoupling Strategy for Semi-supervised 3D Left Atrium Segmentation', 'Background-Foreground Segmentation for Interior Sensing in Automotive Industry']


Load CSV file

In [None]:
df = pd.read_csv("/content/arxiv_data.csv")

print(df.head())
print("Columns:", df.columns)


                                              titles  \
0  Survey on Semantic Stereo Matching / Semantic ...   
1  FUTURE-AI: Guiding Principles and Consensus Re...   
2  Enforcing Mutual Consistency of Hard Regions f...   
3  Parameter Decoupling Strategy for Semi-supervi...   
4  Background-Foreground Segmentation for Interio...   

                                           summaries  \
0  Stereo matching is one of the widely used tech...   
1  The recent advancements in artificial intellig...   
2  In this paper, we proposed a novel mutual cons...   
3  Consistency training has proven to be an advan...   
4  To ensure safety in automated driving, the cor...   

                         terms  
0           ['cs.CV', 'cs.LG']  
1  ['cs.CV', 'cs.AI', 'cs.LG']  
2           ['cs.CV', 'cs.AI']  
3                    ['cs.CV']  
4           ['cs.CV', 'cs.LG']  
Columns: Index(['titles', 'summaries', 'terms'], dtype='object')


Select text column

In [None]:
text_col = df.columns[0]
documents = df[text_col].fillna("").astype(str).tolist()

print(documents[:3])


['Survey on Semantic Stereo Matching / Semantic Depth Estimation', 'FUTURE-AI: Guiding Principles and Consensus Recommendations for Trustworthy Artificial Intelligence in Future Medical Imaging', 'Enforcing Mutual Consistency of Hard Regions for Semi-supervised Medical Image Segmentation']


Clean text

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(doc):
    doc = doc.lower()
    doc = re.sub(r'[^a-zA-Z ]', ' ', doc)
    tokens = doc.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

clean_docs = [clean_text(doc) for doc in documents]

print(clean_docs[:3])


['survey semantic stereo matching semantic depth estimation', 'future ai guiding principle consensus recommendation trustworthy artificial intelligence future medical imaging', 'enforcing mutual consistency hard region semi supervised medical image segmentation']


Bag of Words (BoW)

In [None]:
vectorizer = CountVectorizer(max_df=0.9, min_df=5)
X = vectorizer.fit_transform(clean_docs)

print("BoW shape:", X.shape)


BoW shape: (51774, 4384)


Apply LDA

In [None]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)


Show topic words

In [None]:
words = vectorizer.get_feature_names_out()

for i, topic in enumerate(lda.components_):
    print(f"\nTopic {i+1}:")
    print([words[j] for j in topic.argsort()[-10:]])



Topic 1:
['segmentation', 'self', 'graph', 'data', 'unsupervised', 'supervised', 'deep', 'representation', 'image', 'learning']

Topic 2:
['end', 'estimation', 'based', 'temporal', 'image', 'visual', 'transformer', 'recognition', 'attention', 'video']

Topic 3:
['graph', 'using', 'series', 'deep', 'model', 'time', 'generative', 'neural', 'adversarial', 'network']

Topic 4:
['optimization', 'super', 'policy', 'via', 'resolution', 'based', 'deep', 'multi', 'reinforcement', 'learning']

Topic 5:
['based', 'point', 'convolutional', 'graph', 'image', 'neural', 'segmentation', 'object', 'detection', 'network']


Assign topic to each document

In [None]:
topic_results = lda.transform(X)

df['Dominant_Topic'] = topic_results.argmax(axis=1)

df.head()


Unnamed: 0,titles,summaries,terms,Dominant_Topic
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']",4
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']",0
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']",0
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV'],0
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']",4


NMF FOR LAB ASSIGNMENT WITH KAGGLE DATA SET

# Task
Review the NMF topic modeling results from the 'arxiv_data.csv' dataset, including the identified topics and their assigned documents.

In [25]:
import pandas as pd

# df = pd.read_csv("/content/arxiv_data.csv") # Remove this line as df is already loaded

# Identify the text column, which is 'titles'
text_column_name = 'titles'

# Extract the text data, handle missing values, and convert to string type
documents = df[text_column_name].fillna("").astype(str).tolist()

print("First 5 documents after loading and extraction:")
for i, doc in enumerate(documents[:5]):
    print(f"{i+1}. {doc}")

First 5 documents after loading and extraction:
1. Survey on Semantic Stereo Matching / Semantic Depth Estimation
2. FUTURE-AI: Guiding Principles and Consensus Recommendations for Trustworthy Artificial Intelligence in Future Medical Imaging
3. Enforcing Mutual Consistency of Hard Regions for Semi-supervised Medical Image Segmentation
4. Parameter Decoupling Strategy for Semi-supervised 3D Left Atrium Segmentation
5. Background-Foreground Segmentation for Interior Sensing in Automotive Industry


## Preprocess Text Data

### Subtask:
Clean the text data by converting to lowercase, removing non-alphabetic characters, tokenizing, removing stopwords, and lemmatizing words. This prepares the text for feature extraction.


In [13]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(doc):
    # Convert to lowercase
    doc = doc.lower()
    # Remove non-alphabetic characters
    doc = re.sub(r'[^a-zA-Z ]', ' ', doc)
    # Tokenize
    tokens = doc.split()
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    # Join tokens back into a string
    return " ".join(tokens)

clean_docs = [clean_text(doc) for doc in documents]

print("First 5 cleaned documents:")
for i, doc in enumerate(clean_docs[:5]):
    print(f"{i+1}. {doc}")

First 5 cleaned documents:
1. survey semantic stereo matching semantic depth estimation
2. future ai guiding principle consensus recommendation trustworthy artificial intelligence future medical imaging
3. enforcing mutual consistency hard region semi supervised medical image segmentation
4. parameter decoupling strategy semi supervised left atrium segmentation
5. background foreground segmentation interior sensing automotive industry


In [14]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(doc):
    # Convert to lowercase
    doc = doc.lower()
    # Remove non-alphabetic characters
    doc = re.sub(r'[^a-zA-Z ]', ' ', doc)
    # Tokenize
    tokens = doc.split()
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    # Join tokens back into a string
    return " ".join(tokens)

clean_docs = [clean_text(doc) for doc in documents]

print("First 5 cleaned documents:")
for i, doc in enumerate(clean_docs[:5]):
    print(f"{i+1}. {doc}")

First 5 cleaned documents:
1. survey semantic stereo matching semantic depth estimation
2. future ai guiding principle consensus recommendation trustworthy artificial intelligence future medical imaging
3. enforcing mutual consistency hard region semi supervised medical image segmentation
4. parameter decoupling strategy semi supervised left atrium segmentation
5. background foreground segmentation interior sensing automotive industry


In [15]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(doc):
    # Convert to lowercase
    doc = doc.lower()
    # Remove non-alphabetic characters
    doc = re.sub(r'[^a-zA-Z ]', ' ', doc)
    # Tokenize
    tokens = doc.split()
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    # Join tokens back into a string
    return " ".join(tokens)

clean_docs = [clean_text(doc) for doc in documents]

print("First 5 cleaned documents:")
for i, doc in enumerate(clean_docs[:5]):
    print(f"{i+1}. {doc}")

First 5 cleaned documents:
1. survey semantic stereo matching semantic depth estimation
2. future ai guiding principle consensus recommendation trustworthy artificial intelligence future medical imaging
3. enforcing mutual consistency hard region semi supervised medical image segmentation
4. parameter decoupling strategy semi supervised left atrium segmentation
5. background foreground segmentation interior sensing automotive industry


## Create TF-IDF Matrix

### Subtask:
Transform the preprocessed text data into a TF-IDF (Term Frequency-Inverse Document Frequency) matrix, which is suitable for NMF. This step will convert the text into numerical features.


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the clean_docs to create the TF-IDF matrix
X = vectorizer.fit_transform(clean_docs)

print("TF-IDF matrix shape:", X.shape)

TF-IDF matrix shape: (2384, 3136)


## Apply NMF Model

### Subtask:
Apply Non-negative Matrix Factorization (NMF) to the TF-IDF matrix to discover latent topics within the documents.


In [17]:
from sklearn.decomposition import NMF

# Instantiate NMF model with 10 topics and a random state for reproducibility
n_components = 10
nmf = NMF(n_components=n_components, random_state=42)

# Fit the NMF model to the TF-IDF matrix and transform it
W = nmf.fit_transform(X) # Document-topic matrix
H = nmf.components_ # Topic-word matrix

print(f"NMF model applied with {n_components} topics.")
print("Shape of Document-Topic matrix (W):", W.shape)
print("Shape of Topic-Word matrix (H):", H.shape)

NMF model applied with 10 topics.
Shape of Document-Topic matrix (W): (2384, 10)
Shape of Topic-Word matrix (H): (10, 3136)


## Display Top Words per Topic

### Subtask:
Extract and display the most representative words for each identified NMF topic to understand the theme of each topic.


In [18]:
words = vectorizer.get_feature_names_out()

for i, topic in enumerate(H):
    # Get the indices of the top 10 words for the current topic
    top_words_indices = topic.argsort()[-10:]
    # Get the actual words using the indices
    top_words = [words[j] for j in top_words_indices]
    print(f"\nTopic {i+1}:")
    print(top_words)


Topic 1:
['semi', 'framework', 'active', 'net', 'interactive', 'using', 'based', 'medical', 'segmentation', 'image']

Topic 2:
['invariant', 'object', 'reinforcement', 'disentangled', 'via', 'video', 'unsupervised', 'visual', 'learning', 'representation']

Topic 3:
['via', 'attention', 'segmentation', 'using', 'net', 'recurrent', 'fully', 'convolutional', 'neural', 'network']

Topic 4:
['transformation', 'medical', 'video', 'co', 'training', 'weakly', 'learning', 'semi', 'self', 'supervised']

Topic 5:
['heterogeneous', 'network', 'via', 'node', 'cut', 'knowledge', 'adaptive', 'embedding', 'based', 'graph']

Topic 6:
['structured', 'method', 'model', 'based', 'survey', 'detection', 'clustering', 'using', 'learning', 'deep']

Topic 7:
['fast', 'loss', 'time', 'aware', 'real', 'level', 'video', 'image', 'segmentation', 'semantic']

Topic 8:
['negative', 'generative', 'representation', 'view', 'medical', 'prototypical', 'global', 'augmentation', 'learning', 'contrastive']

Topic 9:
['net

## Assign Dominant Topic to Documents

### Subtask:
Determine the dominant topic for each document based on the NMF results and add this information back to the original DataFrame.


In [19]:
df_topic_distribution = df.copy()
df_topic_distribution['Dominant_Topic'] = W.argmax(axis=1)

print("DataFrame with Dominant Topic assigned:")
df_topic_distribution.head()

DataFrame with Dominant Topic assigned:


Unnamed: 0,titles,summaries,terms,Dominant_Topic
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']",6
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']",0
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']",3
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV'],3
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']",6
