In [10]:
import pandas as pd

# Read the Spotify lyrics CSV and discard the 'link' column, which the
# recommender never uses.
data = pd.read_csv('../data/spotify_millsongdata.csv').drop(columns=['link'])

# Preview the leading rows to see what the remaining columns contain.
print(data.head())


  artist                   song  \
0   ABBA  Ahe's My Kind Of Girl   
1   ABBA       Andante, Andante   
2   ABBA         As Good As New   
3   ABBA                   Bang   
4   ABBA       Bang-A-Boomerang   

                                                text  
0  Look at her face, it's a wonderful face  \r\nA...  
1  Take it easy with me, please  \r\nTouch me gen...  
2  I'll never know why I had to go  \r\nWhy I had...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  


In [11]:
# Draw a reproducible 10,000-row random subset (fixed seed) and renumber the
# rows from 0 so that index labels and row positions coincide.
data_sampled = (
    data
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)

# Sanity check: report the subset's dimensions and preview its first rows.
print(data_sampled.shape)
print(data_sampled.head())


(10000, 3)
         artist                       song  \
0  Wishbone Ash             Right Or Wrong   
1     Aerosmith  This Little Light Of Mine   
2  Fall Out Boy               Dance, Dance   
3  Janis Joplin                 Easy Rider   
4   Moody Blues                  Peak Hour   

                                                text  
0  Like to have you 'round  \r\nWith all the lies...  
1  This Little Light of Mine (Light of Mine),  \r...  
2  She says she's no good with words but I'm wors...  
3  Hey mama, mama, come a look at sister,  \r\nSh...  
4  I see it all through my window it seems.  \r\n...  


In [12]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK corpora used below (the English stopword list and WordNet
# for lemmatization) are installed, downloading only the ones that are missing.
# Without this, a fresh environment fails with LookupError on first use.
import nltk
for _corpus in ('stopwords', 'wordnet'):
    try:
        nltk.data.find(f'corpora/{_corpus}')
    except LookupError:
        nltk.download(_corpus, quiet=True)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Cache for the English stopword set. It is filled lazily on the first call to
# preprocess_text so the stopword list is fetched once, not once per token.
_ENGLISH_STOPWORDS = None


def preprocess_text(text):
    """Normalize one lyric string before TF-IDF vectorization.

    Steps: lowercase the text, delete every character that is not an ASCII
    letter or whitespace, split on whitespace, drop English stopwords,
    lemmatize each remaining word with the module-level ``lemmatizer``, and
    join the words back together with single spaces.

    Args:
        text: The raw lyric text (a ``str``).

    Returns:
        The cleaned text as one space-separated string.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Previously stopwords.words('english') was evaluated for every token
        # and searched as a list; a frozenset built once gives the same
        # membership answers with constant-time lookups.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    # Convert to lower case
    text = text.lower()

    # Remove non-alphabetic characters. Apostrophes are removed too, so
    # contractions collapse into one token (e.g. "i'm" -> "im").
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize on runs of whitespace
    words = text.split()

    # Remove stopwords and lemmatize what is left
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word not in _ENGLISH_STOPWORDS
    ]

    # Join words to form the cleaned up text
    return ' '.join(words)

# Clean every lyric with preprocess_text and store the result back in 'text'.
cleaned_lyrics = data_sampled['text'].apply(preprocess_text)
data_sampled['text'] = cleaned_lyrics

# Spot-check the first few cleaned lyrics.
print(data_sampled['text'].head())


0    like round lie make thing darkness people say ...
1    little light mine light mine im let shine alei...
2    say shes good word im worse barely stuttered j...
3    hey mama mama come look sister shes astanding ...
4    see window seems never failing like million ee...
Name: text, dtype: object


In [13]:

# Create a TF-IDF vectorizer with scikit-learn's default settings.
tfidf = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the cleaned lyrics, then encode
# each lyric as one row of the resulting matrix.
lyrics_corpus = data_sampled['text']
tfidf_matrix = tfidf.fit_transform(lyrics_corpus)

# Report the matrix dimensions: (number of lyrics, number of features).
print(tfidf_matrix.shape)

(10000, 33481)


In [18]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every lyric with every other lyric. TfidfVectorizer
# L2-normalizes rows by default, so the plain dot product (linear kernel) of
# the matrix with itself equals the cosine similarity. Omitting the second
# argument makes linear_kernel compare the matrix against itself.
cosine_sim = linear_kernel(tfidf_matrix)

# Print the first row (song 0 versus every song) as a sanity check.
print(cosine_sim[0])


[1.         0.         0.04397211 ... 0.02830541 0.03655577 0.0710031 ]


In [19]:
def recommend_songs(song_title, data, cosine_sim, top_n=10):
    """Return the titles of the songs most similar to ``song_title``.

    Args:
        song_title: Exact title to look up in ``data['song']``. When several
            rows share the title, the first one is used.
        data: DataFrame with a 'song' column whose row order matches the
            rows/columns of ``cosine_sim``.
        cosine_sim: Square pairwise-similarity matrix; ``cosine_sim[i][j]`` is
            the similarity between rows ``i`` and ``j`` of ``data``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A Series of up to ``top_n`` song titles ordered from most to least
        similar, or a message string when ``song_title`` is not in ``data``.
    """
    # Locate the song by row *position* rather than by index label, so the
    # lookup stays aligned with cosine_sim even when data's index is not 0..n-1.
    match_positions = (data['song'] == song_title).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        return f"No recommendations found: '{song_title}' is not in the dataset."
    idx = int(match_positions[0])

    # Pair every *other* song's position with its similarity to the query.
    # The query is excluded explicitly instead of assuming it sorts first:
    # duplicate lyrics tie with it at the top score, and a song with no
    # remaining tokens scores 0 against everything, itself included.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Most similar first; the sort is stable, so ties keep dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top-n candidates (nothing for top_n <= 0).
    song_indices = [position for position, _ in sim_scores[:max(top_n, 0)]]

    # Return the top-n most similar songs
    return data['song'].iloc[song_indices]

# Try the recommender on one title; swap in any other title that exists in
# data_sampled['song'] to explore different results.
test_song_title = 'Salute'
recommendations = recommend_songs(test_song_title, data_sampled, cosine_sim)
print(recommendations)


9226                     Age
2373             i hate boys
4259        Along Came Jones
2382          Living In Fame
3612          Hammer To Fall
6247         Light The Shade
2820    Sweeter Than Fiction
6822    My Baby's Good To Me
8744            Bold As Love
4850          Le Ballet D'or
Name: song, dtype: object


In [20]:
# Show five randomly chosen song titles from the sampled dataset (no seed is
# set, so the selection differs between runs).
random_titles = data_sampled['song'].sample(5)
print(random_titles)


1817      Riot Reunion
6115          Case 795
7911           Forever
3400      Messiah Ward
7705    Wild And Crazy
Name: song, dtype: object


In [22]:
import pickle
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed, even if pickling raises part-way through.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(cosine_sim, similarity_file)


In [23]:
# Persist the sampled frame rather than the full `data`: the rows of
# cosine_sim correspond to data_sampled, so this is the table whose row
# positions line up with similarity.pkl. The context manager closes the file.
with open('df.pkl', 'wb') as df_file:
    pickle.dump(data_sampled, df_file)