In [1]:
import pandas as pd
from scipy.stats import pearsonr

tiktok_df = pd.read_csv("tiktok_sentiment.csv")  # Ensure the file exists
spotify_df = pd.read_csv("spotify_sentiment.csv")  # Ensure the file exists

# Convert the date column to datetime format
tiktok_df['date'] = pd.to_datetime(tiktok_df['date'])
spotify_df['timestamp'] = pd.to_datetime(spotify_df['timestamp'])

# Extract year-month for grouping
tiktok_df['month'] = tiktok_df['date'].dt.to_period('M')
spotify_df['month'] = spotify_df['timestamp'].dt.to_period('M')

# Group by month and calculate sentiment percentages
tiktok_sentiment = tiktok_df.groupby(['month', 'predicted_sentiment']).size().unstack(fill_value=0)
spotify_sentiment = spotify_df.groupby(['month', 'predicted_sentiment']).size().unstack(fill_value=0)

# Calculate the percentages of positive and negative content
tiktok_sentiment['positive_percentage'] = (tiktok_sentiment['positive'] / tiktok_sentiment.sum(axis=1)) * 100
tiktok_sentiment['negative_percentage'] = (tiktok_sentiment['negative'] / tiktok_sentiment.sum(axis=1)) * 100

spotify_sentiment['positive_percentage'] = (spotify_sentiment['positive'] / spotify_sentiment.sum(axis=1)) * 100
spotify_sentiment['negative_percentage'] = (spotify_sentiment['negative'] / spotify_sentiment.sum(axis=1)) * 100

# Align months for comparison
common_months = tiktok_sentiment.index.intersection(spotify_sentiment.index)

# Extract the positive and negative sentiment percentages for those months
tiktok_positive = tiktok_sentiment.loc[common_months, 'positive_percentage']
spotify_positive = spotify_sentiment.loc[common_months, 'positive_percentage']

tiktok_negative = tiktok_sentiment.loc[common_months, 'negative_percentage']
spotify_negative = spotify_sentiment.loc[common_months, 'negative_percentage']

# Perform Pearson correlation test
corr_pos, p_value_pos = pearsonr(tiktok_positive, spotify_positive)
corr_neg, p_value_neg = pearsonr(tiktok_negative, spotify_negative)

# Print results
print(f"📌 Pearson Correlation for Positive Sentiments: r = {corr_pos:.3f}, p-value = {p_value_pos:.5f}")
print(f"📌 Pearson Correlation for Negative Sentiments: r = {corr_neg:.3f}, p-value = {p_value_neg:.5f}")

# Interpret p-values
alpha = 0.05  # Significance level

if p_value_pos < alpha:
    print("✅ Reject the null hypothesis for positive sentiments: There is a significant correlation.")
else:
    print("❌ Fail to reject the null hypothesis for positive sentiments: No significant correlation.")

if p_value_neg < alpha:
    print("✅ Reject the null hypothesis for negative sentiments: There is a significant correlation.")
else:
    print("❌ Fail to reject the null hypothesis for negative sentiments: No significant correlation.")


📌 Pearson Correlation for Positive Sentiments: r = 0.198, p-value = 0.55889
📌 Pearson Correlation for Negative Sentiments: r = 0.089, p-value = 0.79422
❌ Fail to reject the null hypothesis for positive sentiments: No significant correlation.
❌ Fail to reject the null hypothesis for negative sentiments: No significant correlation.


  spotify_df['month'] = spotify_df['timestamp'].dt.to_period('M')
