In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Sample corpus: four short sentences about cell biology, each treated as one
# document by the topic-modelling steps below. Swap in a real dataset (any
# list of strings) as needed.
data = [
    "Cell biology is the study of cell structure and function, and it revolves around the concept that the cell is the fundamental unit of life.",
    "Focusing on the cell permits a detailed understanding of the tissues and organisms that cells compose.",
    "Some organisms have only one cell, while others are organized into cooperative groups with huge numbers of cells.",
    "On the whole, cell biology focuses on the structure and function of a cell, from the most general properties shared by all cells, to the unique, highly intricate functions particular to specialized cells."
]

# Text Preprocessing
def preprocess_text(texts):
    """Turn raw documents into a document-term count matrix.

    The vectorizer lowercases the text, drops scikit-learn's built-in English
    stop words, and keeps only tokens made of three or more ASCII letters.

    Args:
        texts: Iterable of document strings to vectorize.

    Returns:
        A ``(matrix, vocabulary)`` tuple: the result of ``fit_transform`` on
        ``texts`` and the feature names reported by the fitted vectorizer.
    """
    count_vectorizer = CountVectorizer(
        token_pattern=r'\b[a-zA-Z]{3,}\b',
        lowercase=True,
        stop_words='english',
    )
    term_counts = count_vectorizer.fit_transform(texts)
    vocabulary = count_vectorizer.get_feature_names_out()
    return term_counts, vocabulary

# Vectorize the sample corpus into term counts plus the matching vocabulary.
dt_matrix, feature_names = preprocess_text(data)

# Fit a 3-topic LDA model; the fixed seed keeps runs reproducible.
# ``fit`` returns the estimator itself, so construction and fitting chain.
lda = LatentDirichletAllocation(random_state=0, n_components=3).fit(dt_matrix)

# Displaying topics
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, iterated row by row; each row
            holds the per-feature weights of one topic.
        feature_names: Sequence indexed by feature position, giving the word
            that corresponds to each column of ``components_``.
        no_top_words: Maximum number of words to list per topic. Must be
            non-negative; values larger than the vocabulary list every word.

    Returns:
        A ``pandas.DataFrame`` with one column per topic (``"Topic 1"``,
        ``"Topic 2"``, ...) whose rows are that topic's words in descending
        weight order.

    Raises:
        ValueError: If ``no_top_words`` is negative.
    """
    # A negative count would make the slice below drop words from the wrong
    # end instead of failing, so reject it explicitly.
    if no_top_words < 0:
        raise ValueError(
            f"no_top_words must be non-negative, got {no_top_words}"
        )

    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending: reverse for heaviest-first, then truncate.
        ranked = topic.argsort()[::-1][:no_top_words]
        topic_dict[f"Topic {topic_idx + 1}"] = [feature_names[i] for i in ranked]
    return pd.DataFrame(topic_dict)

# Build the topic table (top 5 words per topic); the bare name on the last
# line makes the notebook render the DataFrame as the cell's output.
topics_df = display_topics(
    model=lda,
    feature_names=feature_names,
    no_top_words=5,
)
topics_df