In [None]:
import os
import pandas as pd
from googleapiclient.discovery import build

# Service name and version handed to the Google API discovery client.
api_service_name = "youtube"
api_version = "v3"

# Take the API key from the YOUTUBE_API_KEY environment variable so the real
# key never has to be written into this file. The placeholder is kept as a
# fallback; replace it (or set the variable) before running.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", "YOUR_API_KEY")

# Build the YouTube Data API client used by get_comments().
youtube = build(api_service_name, api_version, developerKey=DEVELOPER_KEY)

# ID of the video whose comments will be fetched; replace the placeholder.
video_id = "<VIDEO_ID>"

# Retrieve comments from the video
def get_comments(video_id):
    """Return the text of every top-level comment on a YouTube video.

    Pages through ``commentThreads().list`` on the module-level ``youtube``
    client until a response no longer carries a ``nextPageToken``. Only the
    top-level comment of each thread is read; replies are not collected.

    Args:
        video_id: ID of the video whose comment threads are requested.

    Returns:
        list[str]: Comment texts in the order the API returned them.
    """
    comments = []
    next_page_token = None

    while True:
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,  # Adjust as needed (100 is the API's upper bound)
            # The API's default format is "html", which puts tags and
            # entities into textDisplay; plain text is what the CSV export
            # and the tokenizer downstream need.
            textFormat="plainText",
            pageToken=next_page_token,
        ).execute()

        # A page without an "items" key is treated as an empty page.
        comments.extend(
            item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            for item in response.get("items", [])
        )

        # No token in the response means this was the last page.
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return comments

# Output path for the exported comments
csv_filename = "comments.csv"

# Pull all top-level comments for the configured video
comments = get_comments(video_id)

# One row per comment in a single "Comment" column, written without the index
df = pd.DataFrame(data=comments, columns=["Comment"])
df.to_csv(path_or_buf=csv_filename, index=False)

print(f"Comments extracted and saved to {csv_filename}")


In [None]:
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models

# Topic-model the comments with LDA.
# Expects a list of comment strings named 'comments' to already exist.
import string

# Fetch the NLTK stopword corpus (if needed) and load the English list
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# One token list per comment, index-aligned with 'comments'
processed_comments = []
for comment in comments:
    tokens = []
    for raw_token in comment.lower().split():
        # Splitting on whitespace leaves punctuation glued to words
        # ("the," / "video!"), which makes the stopword lookup miss and turns
        # one word into several vocabulary entries. Trim it from both ends
        # only, so inner characters such as the apostrophe in "don't" stay.
        token = raw_token.strip(string.punctuation)
        if token and token not in stop_words:
            tokens.append(token)
    processed_comments.append(tokens)

# Map each distinct token to an integer id
dictionary = corpora.Dictionary(processed_comments)

# Bag-of-words vector for every comment, index-aligned with 'comments'
doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in processed_comments]

# Fit the LDA model
num_topics = 5  # Specify the number of topics to extract
lda_model = models.LdaModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)

# Print the topics and their associated keywords
for idx, topic in lda_model.print_topics():
    print(f"Topic #{idx}: {topic}")

# Report the highest-probability topic for each comment, reusing the
# bag-of-words vectors built above instead of recomputing them.
for i, bow_vector in enumerate(doc_term_matrix, start=1):
    if not bow_vector:
        # Nothing survived preprocessing, so there is no evidence on which
        # to pick a topic; say so rather than report an arbitrary one.
        print(f"Comment #{i} has no tokens left after preprocessing; no topic assigned")
        continue
    dominant_topic = max(lda_model[bow_vector], key=lambda x: x[1])
    print(f"Comment #{i} belongs to Topic #{dominant_topic[0]}")

