In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from ast import literal_eval

# Number of top-ranked words to print and plot.
top_n = 20


def _parse_tokens(cell):
    """Return the token list stored in one cell of the 'tokens' column.

    A cell read back from CSV normally holds the string representation of a
    Python list (e.g. "['love', 'song']"), which is parsed with
    ``ast.literal_eval``. Values that are already lists are returned
    unchanged, and missing cells (NaN / None / pd.NA) yield an empty list so
    that a comment without tokens does not abort the whole analysis.

    Raises:
        ValueError / SyntaxError: propagated from ``literal_eval`` when a
            string cell is not a valid Python literal.
        TypeError: if the cell is neither a string, a list, nor missing.
    """
    if isinstance(cell, list):
        return cell
    if isinstance(cell, str):
        return literal_eval(cell)
    if pd.isna(cell):
        return []
    raise TypeError(f'Unsupported value in tokens column: {cell!r}')


# Read the CSV file
df = pd.read_csv('YTC_all_song_cleaned.csv')

# Convert the string representation of each token list into a real list.
# The decision is made per value rather than from the column dtype: the dtype
# does not say whether the elements are strings, lists or NaN, and text
# columns are not always reported as 'object'.
df['tokens'] = df['tokens'].apply(_parse_tokens)

# Create a list of all words across all comments
all_words = [word for tokens in df['tokens'] for word in tokens]

# Get the most common words (at most top_n of them)
counter = Counter(all_words)
most_common_words = counter.most_common(top_n)

# Print the most common words
for word, count in most_common_words:
    print(f'{word}: {count}')

# Plot the most common words
word_names = [word for word, _ in most_common_words]
word_counts = [count for _, count in most_common_words]

plt.figure(figsize=(15, 10))
plt.bar(word_names, word_counts)
plt.title(f'Top {top_n} most common words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=90)  # Rotate x-axis labels for better readability
plt.show()