In [15]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import string

# Download stopwords if not already done
nltk.download('stopwords')

# Define your custom stopwords list
custom_stopwords = ['wine', 'flavor', 'aroma', 'tannin', 'palate', 'fruit', 'drink']  # Add words specific to your domain

# Load your dataset
df = pd.read_csv('explore_lda.csv')

# Text preprocessing function
def preprocess(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Lemmatize the tokens (converting to base forms)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Remove NLTK stopwords (standard English stopwords)
    stop_words = set(stopwords.words('english'))  # NLTK stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # Remove custom stopwords after lemmatization
    tokens = [word for word in tokens if word not in custom_stopwords]
    
    return ' '.join(tokens)

# Apply preprocessing to the 'description' column
df['cleaned'] = df['description'].apply(preprocess)

# Vectorize the text (creates a document-term matrix)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned'])

# Train the LDA model using scikit-learn
n_topics = 10  # Adjust the number of topics as needed
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_model.fit(X)

# Get the top words for each topic
n_top_words = 20  # Number of top words to display per topic
feature_names = vectorizer.get_feature_names_out()

# Display the topics and top keywords
for topic_idx, topic in enumerate(lda_model.components_):
    print(f"Topic #{topic_idx + 1}:")
    top_keywords = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    print(", ".join(top_keywords))
    print("\n")


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


Topic #1:
cherry, black, spice, blackberry, nose, berry, oak, red, firm, plum, finish, ripe, offer, pepper, lead, open, ha, dark, dried, hint


Topic #2:
acidity, show, finish, crisp, nose, ha, note, blend, pear, oak, dark, apple, lemon, fresh, light, ripe, dry, blackberry, sauvignon, white


Topic #3:
cherry, finish, note, acidity, black, red, fresh, soft, white, juicy, spice, herb, berry, apple, bright, plum, dry, nose, peach, blend


Topic #4:
note, rich, pear, finish, cherry, ripe, apple, ha, acidity, peach, vineyard, citrus, lemon, nose, bit, concentration, white, herb, stone, pineapple


Topic #5:
finish, oak, cherry, acidity, note, soft, show, quality, elegant, creamy, red, vanilla, melon, dry, rich, barrel, pineapple, 100, seems, varietal


Topic #6:
acidity, ripe, blend, rich, finish, balanced, sweet, well, juicy, give, fruity, black, note, herb, age, red, raspberry, green, good, light


Topic #7:
cherry, acidity, ha, red, dry, finish, show, fruity, texture, bright, rich, spic