<a href="https://colab.research.google.com/github/vvrgit/NLP-LAB/blob/main/Assignment6_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Task #1 : Sample Data**

In [47]:
import pandas as pd
data=pd.read_excel('LDA-Data.xlsx')
display(data.head())

Unnamed: 0,News
0,Virat scored century in match
1,BJP won in elections
2,Bumra took 5 wicket in a match
3,Congress form state government


In [48]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def nltk_preprocessing_pipeline(text):
    # Initialize NLTK tools
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # 1. Preprocess text (from previous steps)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)

    text = text.lower()  # Convert to lowercase

    # Remove emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    # 2. Word Tokenization
    tokenized_words = word_tokenize(text)

    # 3. Stopword Removal
    filtered_words = [word for word in tokenized_words if word not in stop_words]

    # 4. Lemmatization
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    # 5. Rejoin words
    clean_summary = ' '.join(lemmatized_words)

    return clean_summary

print("NLTK preprocessing pipeline function created successfully!")

NLTK preprocessing pipeline function created successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [49]:
data['clean_News'] = data['News'].apply(nltk_preprocessing_pipeline)
print("\nComparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):")
print(data[['clean_News']].head())


Comparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):
                       clean_News
0      virat scored century match
1                    bjp election
2       bumra took 5 wicket match
3  congress form state government


In [50]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorize the cleaned summaries
count_vectorizer = CountVectorizer(max_df=0.95, min_df=1, stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(data['clean_News'])

In [51]:
import pandas as pd
# Get feature (word) names
feature_names = count_vectorizer.get_feature_names_out()
# Convert sparse matrix to DataFrame
bow_df = pd.DataFrame(doc_term_matrix.toarray(),columns=feature_names)
# Display BoW matrix for top 10 documents
bow_top_10 = bow_df.head(10)
print(bow_top_10)

   bjp  bumra  century  congress  election  form  government  match  scored  \
0    0      0        1         0         0     0           0      1       1   
1    1      0        0         0         1     0           0      0       0   
2    0      1        0         0         0     0           0      1       0   
3    0      0        0         1         0     1           1      0       0   

   state  took  virat  wicket  
0      0     0      1       0  
1      0     0      0       0  
2      0     1      0       1  
3      1     0      0       0  


In [52]:
from sklearn.decomposition import LatentDirichletAllocation

# Initialize and fit LDA model
num_topics = 2
LDA = LatentDirichletAllocation(n_components=num_topics, random_state=42)
LDA.fit(doc_term_matrix)

In [53]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx in range(len(model.components_)):
        print(f"\nTopic {topic_idx}:")

        # Get word weights for this topic
        topic_weights = model.components_[topic_idx]

        # Get indices of words sorted by weight (descending)
        sorted_indices = topic_weights.argsort()[::-1]

        # Take top N words
        top_indices = sorted_indices[:num_top_words]

        # Print top words
        for idx in top_indices:
            print(feature_names[idx], end=" ")
        print()

In [54]:
# Display top words for each topic
num_top_words = 10
print(f"\nTop {num_top_words} words per topic:")
display_topics(LDA, count_vectorizer.get_feature_names_out(), num_top_words)


Top 10 words per topic:

Topic 0:
form government congress state election bjp match wicket bumra took 

Topic 1:
match virat century scored took bumra wicket bjp election state 


In [55]:
# Assign topics to each document
document_topics = LDA.transform(doc_term_matrix)
data['topic'] = document_topics.argmax(axis=1)

print("\nDataFrame with assigned topics (first 5 rows):")
print(data[['clean_News', 'topic']].head())


DataFrame with assigned topics (first 5 rows):
                       clean_News  topic
0      virat scored century match      1
1                    bjp election      0
2       bumra took 5 wicket match      1
3  congress form state government      0


# **Task #2 : Research paper Data**

Dataset: https://www.kaggle.com/datasets/spsayakpaul/arxiv-paper-abstracts?resource=download

In [56]:
import pandas as pd
df = pd.read_csv('arxiv_data.csv', engine='python', nrows=1000)
display(df.head())

Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [57]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def nltk_preprocessing_pipeline(text):
    # Initialize NLTK tools
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # 1. Preprocess text (from previous steps)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)

    text = text.lower()  # Convert to lowercase

    # Remove emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    # 2. Word Tokenization
    tokenized_words = word_tokenize(text)

    # 3. Stopword Removal
    filtered_words = [word for word in tokenized_words if word not in stop_words]

    # 4. Lemmatization
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    # 5. Rejoin words
    clean_summary = ' '.join(lemmatized_words)

    return clean_summary

print("NLTK preprocessing pipeline function created successfully!")

NLTK preprocessing pipeline function created successfully!


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
df['clean_summaries_pipeline'] = df['summaries'].apply(nltk_preprocessing_pipeline)
print("\nComparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):")
print(df[['clean_summaries_pipeline']].head())


Comparison of previous clean_summaries and new clean_summaries_pipeline (first 5 rows):
                            clean_summaries_pipeline
0  stereo matching one widely used technique infe...
1  recent advancement artificial intelligence ai ...
2  paper proposed novel mutual consistency networ...
3  consistency training proven advanced semisuper...
4  ensure safety automated driving correct percep...


In [59]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorize the cleaned summaries
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
doc_term_matrix = count_vectorizer.fit_transform(df['clean_summaries_pipeline'])

In [60]:
import pandas as pd
# Get feature (word) names
feature_names = count_vectorizer.get_feature_names_out()
# Convert sparse matrix to DataFrame
bow_df = pd.DataFrame(doc_term_matrix.toarray(),columns=feature_names)
# Display BoW matrix for top 10 documents
bow_top_10 = bow_df.head(10)
print(bow_top_10)

   01  011  014  049  059  060  065  084  089  091  ...  xray  xrays  year  \
0   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
1   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
2   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
3   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
4   0    0    0    0    0    0    0    0    0    0  ...     0      0     1   
5   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
6   0    0    0    0    0    0    0    0    0    0  ...     0      0     1   
7   0    0    0    0    0    0    0    0    0    0  ...     1      0     0   
8   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   
9   0    0    0    0    0    0    0    0    0    0  ...     0      0     0   

   yes  yield  yielded  yielding  youtube  youtubevos  zurich  
0    0      0        0         0        0           0       0  
1    0      0

In [61]:
from sklearn.decomposition import LatentDirichletAllocation

# Initialize and fit LDA model
num_topics = 2
LDA = LatentDirichletAllocation(n_components=num_topics, random_state=42)
LDA.fit(doc_term_matrix)


In [62]:
def display_topics(model, feature_names, num_top_words):
    for topic_idx in range(len(model.components_)):
        print(f"\nTopic {topic_idx}:")

        # Get word weights for this topic
        topic_weights = model.components_[topic_idx]

        # Get indices of words sorted by weight (descending)
        sorted_indices = topic_weights.argsort()[::-1]

        # Take top N words
        top_indices = sorted_indices[:num_top_words]

        # Print top words
        for idx in top_indices:
            print(feature_names[idx], end=" ")
        print()

In [63]:
# Display top words for each topic
num_top_words = 10
print(f"\nTop {num_top_words} words per topic:")
display_topics(LDA, count_vectorizer.get_feature_names_out(), num_top_words)


Top 10 words per topic:

Topic 0:
method network feature model proposed approach result algorithm based semantic 

Topic 1:
network method model learning data deep training medical task performance 


In [64]:
# Assign topics to each document
document_topics = LDA.transform(doc_term_matrix)
df['topic'] = document_topics.argmax(axis=1)

print("\nDataFrame with assigned topics (first 5 rows):")
print(df[['clean_summaries_pipeline', 'topic']].head())


DataFrame with assigned topics (first 5 rows):
                            clean_summaries_pipeline  topic
0  stereo matching one widely used technique infe...      1
1  recent advancement artificial intelligence ai ...      1
2  paper proposed novel mutual consistency networ...      1
3  consistency training proven advanced semisuper...      1
4  ensure safety automated driving correct percep...      1
