# **Spotify Analysis: Sentiment Analysis on Lyrics using a Pre-trained Large Language Model**


In [1]:
# Run the project's helper scripts before anything else in the notebook.
# NOTE(review): presumably pip_installs.py installs the dependencies and packages.py
# performs shared imports / NLTK downloads (the cell output shows NLTK downloads) — confirm.
%run pip_installs.py
%run packages.py

Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/varshis./nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/varshis./nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/varshis./nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/varshis./nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/varshis./nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/varshis./nltk_data...
[nltk_data]    |   Package

In [2]:
import os  # Importing the os module for operating system functionalities (not used in the current code)
import pandas as pd  # Importing the pandas library for data manipulation and analysis

def read_and_process_csv(file_path):
    """Load a CSV into a DataFrame and parse its date columns when present.

    Args:
        file_path: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The loaded data. If an 'Added At' column exists it is
        converted to timezone-aware UTC datetimes; if a 'Release Date' column
        exists it is converted to datetimes with unparseable values set to NaT.
    """
    frame = pd.read_csv(file_path)

    # Column name -> keyword arguments forwarded to pd.to_datetime
    datetime_options = {
        'Added At': {'utc': True},
        'Release Date': {'errors': 'coerce'},
    }

    for column, options in datetime_options.items():
        # Columns are optional: only convert the ones this file actually has
        if column in frame.columns:
            frame[column] = pd.to_datetime(frame[column], **options)

    return frame

# Location of the cleaned playlist export
file_path = 'processed_csv/clean_data_all.csv'

# Load the data and order it chronologically by when each track was added
df = read_and_process_csv(file_path).sort_values('Added At')

# Summarise what was loaded: dimensions, then the span of 'Added At' timestamps
print(f"Final dataframe shape: {df.shape}")
print(f"Date range: from {df['Added At'].min()} to {df['Added At'].max()}")


Final dataframe shape: (5853, 26)
Date range: from 2016-04-25 23:31:56+00:00 to 2024-08-19 16:15:35+00:00


__________________

In [3]:
import re  # Import the regex library for string manipulation using regular expressions
import nltk  # Import the Natural Language Toolkit library for text processing
from nltk.corpus import stopwords  # Import the stopwords corpus from NLTK

# Fetch the NLTK stopwords corpus; quiet=True suppresses the download log
nltk.download('stopwords', quiet=True)

# Keep the English stop words in a set so the membership tests in clean_lyrics are cheap
stop_words = set(stopwords.words('english'))

def clean_lyrics(text):
    """Tokenise lyrics into lowercase words, dropping stop words and short tokens.

    Processing steps:
      1. Commas (with surrounding whitespace) are treated as separators.
      2. Each whitespace-delimited token is checked against ``stop_words``
         while its apostrophe is still present, so contraction stop words such
         as "don't" or "he'll" are removed instead of leaking through as
         "dont" / "hell".
      3. Remaining punctuation is stripped from the token.
      4. Tokens of one or two characters are discarded.
      5. The token is lowercased and checked against ``stop_words`` again.

    Args:
        text: Raw lyrics. Anything that is not a ``str`` (e.g. NaN) is treated
            as "no lyrics".

    Returns:
        list[str]: Cleaned words in their original order; ``[]`` for non-string input.
    """
    if not isinstance(text, str):
        return []

    # Each comma, together with any whitespace around it, becomes a single space
    text = re.sub(r'\s*,\s*', ' ', text)
    # Typographic apostrophes must look like ASCII ones to match the stop-word list
    text = text.replace('\u2019', "'").replace('\u2018', "'")

    words = []
    for token in text.split():
        # Look the token up BEFORE punctuation is removed: once the apostrophe
        # is gone, "don't" becomes "dont" and no longer matches any stop word.
        # Leading/trailing punctuation (quotes, "!", "?") is ignored for the lookup.
        bare_token = re.sub(r'^\W+|\W+$', '', token.lower())
        if bare_token in stop_words:
            continue

        # Remove symbols and other non-word characters inside the token
        word = re.sub(r'[^\w\s]', '', token)

        # Drop empty results and 1-2 character tokens
        if len(word) <= 2:
            continue

        word = word.lower()
        if word not in stop_words:
            words.append(word)

    return words

# Tokenise every song's lyrics; each entry of 'Words' is the list returned by clean_lyrics
df['Words'] = df['Lyrics_Clean'].apply(clean_lyrics)

In [4]:
import matplotlib.pyplot as plt  # Import the matplotlib library for plotting
from wordcloud import WordCloud  # Import WordCloud for generating word clouds
from collections import Counter  # Import Counter for counting word frequencies
import ipywidgets as widgets  # Import ipywidgets for interactive widgets
from IPython.display import display, clear_output  # Import display and clear_output for managing outputs in Jupyter notebooks

def create_word_cloud(track_name):
    """Generate and display a word cloud for the selected track.

    The first row of ``df`` whose 'Track Name' equals ``track_name`` is used;
    its 'Words' list supplies the word frequencies and its 'Artist Name(s)'
    value is shown in the title.

    Args:
        track_name: Track title to look up in ``df['Track Name']``.

    Returns:
        None. The figure (or an explanatory message) is written to the cell output.
    """
    # Clear whatever the previous selection rendered before showing anything new
    clear_output(wait=True)

    matching_rows = df[df['Track Name'] == track_name]
    if matching_rows.empty:
        # Without this guard .iloc[0] raises IndexError for an unknown title
        print(f'No track named "{track_name}" was found.')
        return

    # Several rows may share a title; the first match is used
    selected_row = matching_rows.iloc[0]
    selected_words = selected_row['Words']
    artist_name = selected_row['Artist Name(s)']

    # Count the frequency of each word
    word_freq = Counter(selected_words)
    if not word_freq:
        # WordCloud.generate_from_frequencies raises ValueError when given no
        # words, which happens for tracks whose lyrics are missing or empty
        print(f'No lyrics available to build a word cloud for "{track_name}" by {artist_name}.')
        return

    # Build the word cloud image from the word frequencies
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)

    # Render the image without axes and label it with track and artist
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for "{track_name}" by {artist_name}')
    plt.show()

# Build a selection widget over every track name in df and redraw the word cloud
# on each selection. Titles that occur more than once are listed more than once;
# create_word_cloud always uses the first matching row. The trailing ';' suppresses
# the repr of the returned object.
widgets.interact(create_word_cloud, track_name=df['Track Name'].tolist());


interactive(children=(Dropdown(description='track_name', options=('Tum Tak', 'Nagada Sang Dhol', 'Raanjhanaa',…

---

### SENTIMENT ANALYSIS

In [5]:
pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
from transformers import pipeline

In [7]:
import torch  # Import PyTorch for tensor operations and model handling
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # Import the tokenizer and model class from Hugging Face Transformers
import pandas as pd  # Import pandas for DataFrame manipulation
from torch.nn.functional import softmax  # Import softmax for probability computation

# Tokenizer for the pretrained GoEmotions RoBERTa checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    'SamLowe/roberta-base-go_emotions',
    clean_up_tokenization_spaces=False,
)

# Matching sequence-classification model
model = AutoModelForSequenceClassification.from_pretrained(
    'SamLowe/roberta-base-go_emotions'
)
# Run on the GPU when CUDA is available, on the CPU otherwise
model.to('cuda' if torch.cuda.is_available() else 'cpu')

def _window_spans(num_tokens, span, step):
    """Return ``(start, end)`` slices that tile ``range(num_tokens)``.

    Each window holds at most ``span`` tokens and starts ``step`` tokens after
    the previous one; the last window is shortened so it ends exactly at
    ``num_tokens``. An empty sequence yields the single empty span ``(0, 0)``.
    """
    spans = []
    start = 0
    while start < num_tokens:
        end = min(start + span, num_tokens)
        spans.append((start, end))
        if end == num_tokens:
            break
        start += step
    return spans or [(0, 0)]


def sliding_window_sentiment_analysis(text, window_size=512, stride=256):
    """Classify a text of arbitrary length by averaging logits over sliding windows.

    The text is tokenized once without special tokens. The token sequence is
    cut into overlapping windows that together reach the end of the text, and
    every window is wrapped in the tokenizer's CLS/SEP tokens before being fed
    to the model, so each forward pass sees a well-formed input of at most
    ``window_size`` tokens.

    Args:
        text (str): Input text to analyze.
        window_size (int): Maximum number of tokens per model input, including
            the two special tokens added around each window.
        stride (int): Number of content tokens between the starts of
            consecutive windows. Values larger than ``window_size - 2`` leave
            gaps between windows.

    Returns:
        tuple: ``(label, confidence)`` where ``label`` comes from
        ``model.config.id2label`` and ``confidence`` is the softmax probability
        of that label computed from the window-averaged logits.

    Raises:
        ValueError: If ``window_size`` leaves no room for content tokens or
            ``stride`` is not positive.
    """
    special_tokens = 2  # one CLS and one SEP token wrapped around every window
    if window_size <= special_tokens:
        raise ValueError(f"window_size must be greater than {special_tokens}, got {window_size}")
    if stride < 1:
        raise ValueError(f"stride must be a positive integer, got {stride}")

    # Tokenize without special tokens: they are added per window below so that
    # every window (not just the first/last) starts with CLS and ends with SEP.
    token_ids = list(
        tokenizer(text, add_special_tokens=False, truncation=False, padding=False)['input_ids']
    )

    content_size = window_size - special_tokens
    device = model.device

    # Logits produced for each window
    all_logits = []
    for start, end in _window_spans(len(token_ids), content_size, stride):
        window_ids = [tokenizer.cls_token_id] + token_ids[start:end] + [tokenizer.sep_token_id]
        window_input_ids = torch.tensor([window_ids], dtype=torch.long, device=device)

        # No padding is used, so every position is attended to
        attention_mask = torch.ones_like(window_input_ids)

        # Inference only: skip gradient bookkeeping
        with torch.no_grad():
            outputs = model(window_input_ids, attention_mask=attention_mask)

        all_logits.append(outputs.logits)

    # Average the per-window logits, then normalise into probabilities.
    # NOTE(review): GoEmotions is usually treated as multi-label; softmax is kept
    # here so the returned scores stay comparable with earlier runs — confirm.
    aggregated_logits = torch.mean(torch.stack(all_logits), dim=0)
    probs = softmax(aggregated_logits, dim=1)

    # Highest-probability class and its probability
    predicted_class = torch.argmax(probs, dim=1).item()
    confidence = probs[0][predicted_class].item()

    return model.config.id2label[predicted_class], confidence

# Lyrics as strings, in the DataFrame's current row order, plus a flag telling
# which rows really have lyrics: astype(str) turns a missing value into the
# literal text "nan", which must not be scored as if it were a song.
lyrics_list = df['Lyrics_Clean'].astype(str).tolist()
has_lyrics = df['Lyrics_Clean'].notna().tolist()

# One result per row, in the same order as df
results = []
for lyrics, present in zip(lyrics_list, has_lyrics):
    if present:
        label, score = sliding_window_sentiment_analysis(lyrics)  # Sentiment label and score
    else:
        label, score = None, float('nan')  # No lyrics: leave the sentiment empty
    results.append({'label': label, 'score': score})

# Build the results frame on df's own index. df was sorted earlier, so its
# index is generally not 0..n-1; a fresh RangeIndex here would make the column
# assignments below align by label and attach sentiments to the wrong songs.
lyrics_sentiment = pd.DataFrame(results, index=df.index, columns=['label', 'score'])

# Display the results DataFrame containing sentiment labels and scores
print(lyrics_sentiment)

# Add the sentiment results back to the original DataFrame (indexes match row for row)
df['Sentiment_Label'] = lyrics_sentiment['label']
df['Sentiment_Score'] = lyrics_sentiment['score']

Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors


          label  score
0       neutral   0.99
1       neutral   0.97
2       neutral   0.99
3       neutral   1.00
4       neutral   0.82
...         ...    ...
5848   optimism   0.38
5849  confusion   0.25
5850  amusement   0.64
5851      anger   0.78
5852       love   0.52

[5853 rows x 2 columns]


In [9]:
import plotly.express as px  # Import Plotly Express for creating interactive visualizations

# Tally songs per predicted sentiment as a two-column frame: label and count
sentiment_counts = (
    df['Sentiment_Label']
    .value_counts()
    .rename_axis('Sentiment_Label')
    .reset_index(name='Count')
)

# Seven-colour rainbow palette applied to the bars
rainbow_colors = ['#FF0000', '#FF7F00', '#FFFF00', '#00FF00', '#0000FF', '#4B0082', '#9400D3']

# Interactive bar chart: one bar per sentiment label, coloured by label
fig = px.bar(
    sentiment_counts,
    x='Sentiment_Label',
    y='Count',
    color='Sentiment_Label',
    color_discrete_sequence=rainbow_colors,
    title='Number of Songs per Sentiment',
    labels={'Count': 'Number of Songs', 'Sentiment_Label': 'Sentiment Label'},
    hover_data={'Count': True},  # show the count in the hover tooltip
)

# Axis titles and a transparent plot background
fig.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis_title='Sentiment Label',
    yaxis_title='Number of Songs',
)

# Persist the interactive chart as a standalone HTML file
fig.write_html('sentiment_bar_chart.html')

# Render the chart in the notebook
fig.show()

In [10]:
import os  # Import the os module for interacting with the operating system

# Folder and file name for the exported DataFrame
output_dir, cleaned_df_filename = 'processed_csv', 'music_df.csv'

# Full destination path: <output_dir>/<cleaned_df_filename>
clean_track_path = os.path.join(output_dir, cleaned_df_filename)

# Write df to that path as CSV, leaving the index out of the file
df.to_csv(clean_track_path, index=False)


---