In [None]:
#top 5 songs
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Spotify track table and keep the five rows with the highest 'Popularity'.
df = pd.read_csv('spotifydata.csv')
top_songs = df.nlargest(n=5, columns='Popularity')

# One shade of red per bar; the palette direction is reversed via reverse=True.
colors = sns.light_palette("red", n_colors=5, reverse=True)

# Horizontal bar chart: popularity on the x axis, one bar per track name on the y axis.
fig, ax = plt.subplots(figsize=(10, 6))
bar_plot = sns.barplot(
    data=top_songs,
    x='Popularity',
    y='Track Name',
    palette=colors,
    ax=ax,
)
bar_plot.set_title("Taylor's Top 5 Songs")
bar_plot.set_xlabel('Popularity')
bar_plot.set_ylabel('')  # the tick labels already carry the track names

plt.show()

In [None]:
#top 30 words in comments

import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from ast import literal_eval
import numpy as np
from PIL import Image

# Read the cleaned comment table; only its 'tokens' column is used below.
df = pd.read_csv('YTC_all_song_cleaned.csv')

# Each 'tokens' cell is presumably the string repr of a token list (TODO confirm
# against the cleaning step), so string cells are parsed back into Python lists.
# The decision is made per cell instead of testing `dtype == 'object'` on the
# column: pandas can infer a dedicated string dtype for text columns, in which
# case a column-level test would skip parsing and the loop below would iterate
# over individual characters. Non-string cells (already parsed, or missing) pass
# through unchanged.
df['tokens'] = df['tokens'].apply(
    lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
)

# Flatten every token list into one list of words; rows without tokens are skipped.
all_words = [word for tokens in df['tokens'].dropna() for word in tokens]

# Keep the 30 most frequent words as a {word: count} mapping.
counter = Counter(all_words)
most_common_words = dict(counter.most_common(30))

# Load the mask image into an array; the context manager closes the file handle
# once the pixel data has been copied.
with Image.open("red-clipart-2018-27.png") as mask_image:
    guitar_mask = np.array(mask_image)

# Build the word cloud from the frequency table, shaped by the mask.
wordcloud = WordCloud(
    mask=guitar_mask,
    background_color='white',
    contour_color='steelblue',
    contour_width=1,
    colormap='PuRd',
).generate_from_frequencies(most_common_words)

# Display the word cloud without axes.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
#top 30 words in lyrics

import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
from ast import literal_eval
import numpy as np
from PIL import Image

# Read the cleaned lyrics table; only its 'tokens' column is used below.
df = pd.read_csv('lyrics_data_cleaned.csv')

# Each 'tokens' cell is presumably the string repr of a token list (TODO confirm
# against the cleaning step), so string cells are parsed back into Python lists.
# The decision is made per cell instead of testing `dtype == 'object'` on the
# column: pandas can infer a dedicated string dtype for text columns, in which
# case a column-level test would skip parsing and the loop below would iterate
# over individual characters. Non-string cells (already parsed, or missing) pass
# through unchanged.
df['tokens'] = df['tokens'].apply(
    lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
)

# Flatten every token list into one list of words; rows without tokens are skipped.
all_words = [word for tokens in df['tokens'].dropna() for word in tokens]

# Keep the 30 most frequent words as a {word: count} mapping.
counter = Counter(all_words)
most_common_words = dict(counter.most_common(30))

# Load the mask image into an array; the context manager closes the file handle
# once the pixel data has been copied.
with Image.open("red-clipart-2018-27.png") as mask_image:
    guitar_mask = np.array(mask_image)

# Build the word cloud from the frequency table, shaped by the mask.
wordcloud = WordCloud(
    mask=guitar_mask,
    background_color='white',
    contour_color='steelblue',
    contour_width=1,
    colormap='RdPu',
    max_font_size=200,
).generate_from_frequencies(most_common_words)

# Display the word cloud without axes.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
#lyrics sentiment overview

import pandas as pd
import matplotlib.pyplot as plt
import squarify

# Load the lyrics sentiment table; this cell reads its 'compound' and 'songname' columns.
data = pd.read_csv('lyrsentiment.csv')

# Map a sentiment score to a display colour (positive / negative / neutral).
def assign_color(score):
    """Return the tile colour for a sentiment score.

    Args:
        score: Numeric sentiment score.

    Returns:
        'red' when the score is above zero, 'green' when it is below zero,
        and 'white' otherwise (including exactly zero).
    """
    return 'red' if score > 0 else 'green' if score < 0 else 'white'

# Colour each song by the sign of its compound score.
data['color'] = data['compound'].apply(assign_color)
# Tile area follows sentiment strength; the 0.1 offset keeps songs with a
# compound score of exactly 0 from collapsing to a zero-area tile.
data['area'] = data['compound'].abs() + 0.1


def _short_title(name):
    """Shorten a song name for use as a tile label.

    Names with more than three whitespace-separated words are cut to their
    first two words followed by '...'; shorter names are returned unchanged.
    """
    words = name.split()
    if len(words) > 3:
        return ' '.join(words[:2]) + '...'
    return name


# Tile labels (shortened song names).
songs_to_label = data['songname'].apply(_short_title)

# Draw the treemap.
fig, ax = plt.subplots(figsize=(10, 8))
# Label styling is applied only while squarify creates the text artists.
# Assigning plt.rcParams after the plot call does not restyle labels that
# already exist, and it would leak white 8pt text into every later figure.
# NOTE(review): white labels on the white (neutral) tiles are not readable on a
# white background - confirm this is acceptable.
with plt.rc_context({'font.size': 8, 'text.color': 'white'}):
    squarify.plot(
        sizes=data['area'],
        label=songs_to_label,
        color=data['color'],
        alpha=0.7,
        ax=ax,
    )
ax.axis('off')
# Colour key placed just below the treemap, plus the title; both set their colour explicitly.
ax.text(0, -5, "Red=Pos, Green=Neg, White=Neu", fontsize=10, color='black')
ax.set_title('Songs Lyrics Sentiment Overview', fontsize=14, color='black')

plt.show()

In [None]:
#lyrics sentiment change

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('lyrsentiment.csv')

# Plot variables: release year on x, compound sentiment score on y.
x = df['year']
y = df['compound']

# X-axis ticks every second year. Series.min()/max() skip missing values and
# int() hands range() plain integers even if the column was read as float.
x_ticks = range(int(x.min()), int(x.max()) + 1, 2)

# Colour by sentiment sign: red = positive, green = negative, blue = neutral.
colors = ['r' if val > 0 else 'g' if val < 0 else 'b' for val in y]

# Marker area grows with sentiment strength. The floor keeps near-zero scores
# visible: without it a score of exactly 0 gets area 0 and is never drawn,
# even though it has been assigned a colour.
sizes = (y.abs() * 200).clip(lower=10)

# Draw the scatter plot.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(x, y, s=sizes, c=colors, alpha=0.5)
ax.set_title('Songs Lyrics Sentiment Change Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Sentiment Compound')
ax.set_xticks(x_ticks)
# Colour key centred under the axes; it now names all three categories drawn.
fig.text(0.5, 0.02, 'Red=Pos, Green=Neg, Blue=Neu', wrap=True,
         horizontalalignment='center', fontsize=12, color='black')
ax.grid(True)

plt.show()

In [None]:
#comments sentiment change

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('commsentiment.csv')

# Plot variables: year on x, compound sentiment score on y.
x = df['year']
y = df['compound']

# X-axis ticks every second year. Series.min()/max() skip missing values and
# int() hands range() plain integers even if the column was read as float.
x_ticks = range(int(x.min()), int(x.max()) + 1, 2)

# Colour by sentiment sign: red = positive, green = negative, blue = neutral.
colors = ['r' if val > 0 else 'g' if val < 0 else 'b' for val in y]

# Marker area grows with sentiment strength. The floor keeps near-zero scores
# visible: without it a score of exactly 0 gets area 0 and is never drawn,
# even though it has been assigned a colour.
sizes = (y.abs() * 200).clip(lower=10)

# Draw the scatter plot.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(x, y, s=sizes, c=colors, alpha=0.5)
ax.set_title('Comments Sentiment Change Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Sentiment Compound')
ax.set_xticks(x_ticks)
# Colour key centred under the axes; it now names all three categories drawn.
fig.text(0.5, 0.02, 'Red=Pos, Green=Neg, Blue=Neu', wrap=True,
         horizontalalignment='center', fontsize=12, color='black')
ax.grid(True)

plt.show()

In [None]:
#popularity change 

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('spotifydata.csv')

# Derive the release year from the 'Release Date' column.
release_dates = pd.to_datetime(df['Release Date'])
df['Year'] = release_dates.dt.year

# Drop rows whose Popularity is 0 (treated as outliers).
df = df.loc[df['Popularity'].ne(0)]

# Scatter of popularity against release year.
fig, ax = plt.subplots()
ax.scatter(df['Year'], df['Popularity'], s=100, c='blue', alpha=0.5)
ax.set_title('Songs Popularity Change Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Popularity')

# X ticks every second year; y axis from 0 to 10% above the highest popularity.
first_year = min(df['Year'])
last_year = max(df['Year'])
ax.set_xticks(range(first_year, last_year + 1, 2))
ax.set_ylim(0, max(df['Popularity']) * 1.1)

plt.show()

In [None]:
#differences between phases

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('spotifydata.csv')

# 'Energy' values for each phase. Missing values are dropped so they cannot
# turn a box's statistics into NaN.
phase1_data = df.loc[df['Phase'] == 'Phase 1', 'Energy'].dropna()
phase2_data = df.loc[df['Phase'] == 'Phase 2', 'Energy'].dropna()

# Draw the box plot exactly once. A second boxplot call on the same axes would
# leave an unstyled copy underneath the coloured one.
fig, ax = plt.subplots()
box = ax.boxplot([phase1_data, phase2_data])
# Name the two boxes via the tick labels rather than the `labels=` keyword,
# which recent matplotlib releases deprecate.
ax.set_xticklabels(['Phase1', 'Phase2'])

# Colour every component light blue, then recolour the second box outline.
for element in ['boxes', 'whiskers', 'fliers', 'means', 'medians', 'caps']:
    plt.setp(box[element], color='lightblue')
plt.setp(box['boxes'][1], color='lightgreen')

ax.set_xlabel('Phase')
ax.set_ylabel('Energy')
ax.set_title('Comparison of Energy between Phase1 and Phase2')

plt.show()

In [None]:
#correlation analysis

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

data = pd.read_csv('lyrsentiment.csv')

# The two score columns being compared. The labels used below call them
# lyrics sentiment ('compound') and comments sentiment ('compound2').
A = data['compound']
B = data['compound2']

# Pearson correlation coefficient and its p-value.
correlation, p_value = stats.pearsonr(A, B)
print("Correlation: ", correlation)
print("P-Value: ", p_value)

# Summary table of the correlation result, one row per variable.
table_columns = {
    'Variable': ['Lyrics Sentiment', 'Comments Sentiment'],
    'Correlation': [A.corr(B), B.corr(A)],
    'P-Value': [p_value] * 2,
}
correlation_table = pd.DataFrame(data=table_columns)
print(correlation_table)

# Scatter of the two scores with a first-degree least-squares fit line in red.
fit_coefficients = np.polyfit(A, B, 1)
fit_x = np.unique(A)  # sorted distinct x values, so the line is drawn left to right
fit_y = np.polyval(fit_coefficients, fit_x)

fig, ax = plt.subplots()
ax.scatter(A, B)
ax.plot(fit_x, fit_y, color='red')
ax.set_xlabel('Lyrics Sentiment')
ax.set_ylabel('Comments Sentiment')
ax.set_title('Relationship between Song Lyrics Sentiment and Comments Sentiment')

plt.show()