In [3]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import MinMaxScaler

# Cell 2: Load datasets
spotify_df = pd.read_csv('spotify_millsongdata.csv')
music_df = pd.read_csv('tcc_ceds_music.csv')

# Cell 3: Clean lyrics
def clean_lyrics(text):
    return re.sub(r'[^a-zA-Z\s]', '', str(text).lower()).strip()

spotify_df['text'] = spotify_df['text'].apply(clean_lyrics)
music_df['lyrics'] = music_df['lyrics'].apply(clean_lyrics)

# Cell 4: Preview
print(spotify_df.head())
print(music_df.head())

  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text  
0  look at her face its a wonderful face  \r\nand...  
1  take it easy with me please  \r\ntouch me gent...  
2  ill never know why i had to go  \r\nwhy i had ...  
3  making somebody happy is a question of give an...  
4  making somebody happy is a question of give an...  
   Unnamed: 0           artist_name            track_name  release_date genre  \
0           0                mukesh  mohabbat bhi jhoothi          1950   pop   
1           4         frankie lai

In [4]:
# Cell: Normalize features
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
music_df[['sadness', 'danceability', 'energy']] = scaler.fit_transform(
    music_df[['sadness', 'danceability', 'energy']]
)

# Check normalization
print(music_df[['sadness', 'danceability', 'energy']].describe())

            sadness  danceability        energy
count  28372.000000  28372.000000  28372.000000
mean       0.131587      0.534298      0.569875
std        0.184625      0.175307      0.244385
min        0.000000      0.000000      0.000000
25%        0.000876      0.412474      0.380361
50%        0.005074      0.539625      0.580567
75%        0.239342      0.659103      0.772766
max        1.000000      1.000000      1.000000


In [5]:
# Cell: Mood mapping
def assign_mood(row):
    if row['energy'] > 0.6 and row['danceability'] > 0.6:
        return 'happy'
    elif row['sadness'] > 0.6:
        return 'sad'
    else:
        return 'neutral'

music_df['mood'] = music_df.apply(assign_mood, axis=1)

# Verify
print(music_df['mood'].value_counts())

mood
neutral    22868
happy       4938
sad          566
Name: count, dtype: int64


In [10]:
# Cell: Install and import transformers
%pip install transformers
%pip install torch
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get embeddings
def get_lyrics_embedding(lyrics):
    inputs = tokenizer(lyrics, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Apply to samples (Spotify and Music Dataset)
spotify_sample = spotify_df.head(100)
music_sample = music_df.head(100)

spotify_sample['lyrics_embedding'] = spotify_sample['text'].apply(get_lyrics_embedding)
music_sample['lyrics_embedding'] = music_sample['lyrics'].apply(get_lyrics_embedding)

# Check
print("Spotify embedding shape:", spotify_sample['lyrics_embedding'].iloc[0].shape)  # Should be (768,)
print("Music embedding shape:", music_sample['lyrics_embedding'].iloc[0].shape)






Collecting torch
  Using cached torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting networkx (from torch)
  Using cached networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Using cached torch-2.6.0-cp312-cp312-win_amd64.whl (204.1 MB)
Using cached networkx-3.4.2-py3-none-any.whl (1.7 MB)
Installing collected packages: networkx, torch
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 2] The system cannot find the file specified: 'c:\\Python312\\Scripts\\torchfrtrace.exe' -> 'c:\\Python312\\Scripts\\torchfrtrace.exe.deleteme'

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


ImportError: 
BertModel requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


In [13]:

pip install torch

Collecting torchNote: you may need to restart the kernel to use updated packages.

  Using cached torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Using cached torch-2.6.0-cp312-cp312-win_amd64.whl (204.1 MB)
Installing collected packages: torch


ERROR: Could not install packages due to an OSError: [WinError 2] The system cannot find the file specified: 'c:\\Python312\\Scripts\\torchfrtrace.exe' -> 'c:\\Python312\\Scripts\\torchfrtrace.exe.deleteme'



In [17]:
import torch
print(torch.__version__)


2.6.0+cpu


In [1]:
import torch
print("PyTorch installed successfully:", torch.__version__)

PyTorch installed successfully: 2.6.0+cpu


In [4]:
# Cell: Install and import transformers
%pip install transformers  # Only needed once
%pip install torch         # Should be installed now
from transformers import BertTokenizer, BertModel
import torch

# Initialize BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get embeddings
def get_lyrics_embedding(lyrics):
    inputs = tokenizer(lyrics, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Apply to samples (Spotify and Music Dataset)
spotify_sample = spotify_df.head(100)
music_sample = music_df.head(100)

spotify_sample['lyrics_embedding'] = spotify_sample['text'].apply(get_lyrics_embedding)
music_sample['lyrics_embedding'] = music_sample['lyrics'].apply(get_lyrics_embedding)

# Check
print("Spotify embedding shape:", spotify_sample['lyrics_embedding'].iloc[0].shape)  # Should be (768,)
print("Music embedding shape:", music_sample['lyrics_embedding'].iloc[0].shape)

Note: you may need to restart the kernel to use updated packages.


ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^
ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


Note: you may need to restart the kernel to use updated packages.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spotify_sample['lyrics_embedding'] = spotify_sample['text'].apply(get_lyrics_embedding)


Spotify embedding shape: (768,)
Music embedding shape: (768,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  music_sample['lyrics_embedding'] = music_sample['lyrics'].apply(get_lyrics_embedding)


In [5]:
# Cell 3: Save processed data
spotify_sample.to_pickle('spotify_processed.pkl')
music_sample.to_pickle('music_processed.pkl')
print("Data saved successfully!")

Data saved successfully!


In [6]:
import mido
print("Mido installed!")

Mido installed!


In [7]:
import pandas as pd
import re
from sklearn.preprocessing import MinMaxScaler
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load original dataset
music_df = pd.read_csv('tcc_ceds_music.csv')  # Update path

# Clean lyrics
def clean_lyrics(text):
    return re.sub(r'[^a-zA-Z\s]', '', str(text).lower()).strip()

music_df['lyrics'] = music_df['lyrics'].apply(clean_lyrics)

# Normalize features
scaler = MinMaxScaler()
music_df[['sadness', 'danceability', 'energy']] = scaler.fit_transform(
    music_df[['sadness', 'danceability', 'energy']]
)

# Map features to mood
def assign_mood(row):
    if row['energy'] > 0.6 and row['danceability'] > 0.6:
        return 'happy'
    elif row['sadness'] > 0.6:
        return 'sad'
    else:
        return 'neutral'

music_df['mood'] = music_df.apply(assign_mood, axis=1)

# Generate lyrics embeddings (sample 100)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_lyrics_embedding(lyrics):
    inputs = tokenizer(lyrics, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

music_sample = music_df.head(100)
music_sample['lyrics_embedding'] = music_sample['lyrics'].apply(get_lyrics_embedding)

# Save with mood column
music_sample.to_pickle('C:/Users/vivek gupta/OneDrive/Desktop/musicVae/music_processed.pkl')
print("Updated music_processed.pkl saved with 'mood' column!")

Updated music_processed.pkl saved with 'mood' column!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  music_sample['lyrics_embedding'] = music_sample['lyrics'].apply(get_lyrics_embedding)


In [8]:
df = pd.read_pickle('C:/Users/vivek gupta/OneDrive/Desktop/musicVae/music_processed.pkl')
print(df.columns)  # Should include 'mood'
print(df['mood'].head())

Index(['Unnamed: 0', 'artist_name', 'track_name', 'release_date', 'genre',
       'lyrics', 'len', 'dating', 'violence', 'world/life', 'night/time',
       'shake the audience', 'family/gospel', 'romantic', 'communication',
       'obscene', 'music', 'movement/places', 'light/visual perceptions',
       'family/spiritual', 'like/girls', 'sadness', 'feelings', 'danceability',
       'loudness', 'acousticness', 'instrumentalness', 'valence', 'energy',
       'topic', 'age', 'mood', 'lyrics_embedding'],
      dtype='object')
0    neutral
1    neutral
2    neutral
3      happy
4    neutral
Name: mood, dtype: object
