In [None]:
import csv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Read the CSV file and collect the text from the first column of each row.
# newline='' is what the csv module expects from the caller, so that
# newlines embedded inside quoted fields are parsed correctly.
with open('your_csv_file.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    # Skip header row if present; the default keeps an empty file from
    # raising StopIteration.
    next(reader, None)
    # Blank lines are yielded as empty lists, for which row[0] would raise
    # IndexError, so they are skipped.
    text_data = [row[0] for row in reader if row]  # Assuming the text is in the first column

# Preprocess the text: lowercase, tokenize, keep purely alphabetic tokens,
# drop English stop words, and lemmatize what remains.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

preprocessed_text = []
for text in text_data:
    # Stop words are removed BEFORE lemmatizing: the default (noun)
    # lemmatizer can mangle them (e.g. "was" -> "wa"), and the mangled form
    # would no longer match the stop-word list.
    content_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    lemmas = (lemmatizer.lemmatize(token) for token in content_tokens)
    # Filter once more in case a lemma itself is a stop word.
    preprocessed_text.append(
        ' '.join(lemma for lemma in lemmas if lemma not in stop_words)
    )

# Keyword extraction using TF-IDF: for every document, keep the (up to) five
# terms with the highest TF-IDF score.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_text)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# fall back to it only on versions that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
top_keywords = []

for i in range(tfidf_matrix.shape[0]):
    # Densify one row at a time: a single 1-D array lookup per term is much
    # cheaper than indexing the sparse matrix element by element.
    row_scores = tfidf_matrix[i].toarray().ravel()
    feature_index = row_scores.nonzero()[0]
    # sorted() is stable, so equally scored terms keep their column order.
    ranked = sorted(feature_index, key=lambda col: row_scores[col], reverse=True)
    top_keywords.append([feature_names[col] for col in ranked[:5]])  # Extract top 5 keywords

# Topic modeling using LDA, fitted on the TF-IDF matrix.
num_topics = 5  # Number of desired topics
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda_matrix = lda_model.fit_transform(tfidf_matrix)

# For each topic, rank the vocabulary by descending component weight and
# keep the names of the five highest-weighted terms.
topic_keywords = [
    [feature_names[term_idx] for term_idx in weights.argsort()[::-1][:5]]
    for weights in lda_model.components_
]

# Print top keywords for each document together with its dominant topic
# (the topic with the largest value in that document's row of lda_matrix).
# Topic numbers are shown 1-based so that they match the "Topic N Keywords"
# lines printed below; previously documents used 0-based topic ids.
dominant_topics = lda_matrix.argmax(axis=1)
for doc_number, (keywords, topic) in enumerate(zip(top_keywords, dominant_topics), start=1):
    print(f"Document {doc_number}: Keywords - {', '.join(keywords)} | Topic: {topic + 1}")

# Print the keywords for each topic
for topic_number, keywords in enumerate(topic_keywords, start=1):
    print(f"Topic {topic_number} Keywords: {', '.join(keywords)}")
