**Importing the Dependencies**

In [2]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Importing the Dataset**

In [4]:
df= pd.read_csv("spotify_millsongdata.csv")
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [7]:
#checking for null values in dataset
df.isnull().sum()

Unnamed: 0,0
artist,0
song,0
link,0
text,0


In [8]:
# Checking the size of the Dataframe
df.shape

(57650, 4)

In [9]:
# Taking sample 10,000 rows from the DataFrame, droping  the 'link' column, and reset the index without adding the old index as a column.
df = df.sample(10000).drop('link', axis=1).reset_index(drop=True)


In [10]:
# Convert all text in the 'text' column to lowercase, replace all alphanumeric characters at the start of the string with a space,
# and replace newline characters with a space.
df['text'] = df['text'].str.lower().replace(r'^a-zA-Z0-9', ' ').replace(r'\n', ' ', regex=True)


In [11]:
# Initialize a PorterStemmer object for stemming words in text
port_stem = PorterStemmer()

In [12]:
def token(txt):  # Define a function to tokenize and stem text
    token = nltk.word_tokenize(txt)  # Tokenize the text into individual words
    token = [port_stem.stem(w) for w in token]  # Apply stemming to each word
    return " ".join(token)  # Join the stemmed words into a single string and return it


In [13]:
# Applying the token function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(lambda x: token(x))

In [14]:
# Initialize a TfidfVectorizer to convert text into TF-IDF features and removing common English stop words.
tfid = TfidfVectorizer(analyzer='word', stop_words='english')


In [15]:
# Convert the tokenized text into a TF-IDF matrix
matrix = tfid.fit_transform(df['text'])

In [16]:
# Calculating the cosine similarity between the TF-IDF matrix.
similar =cosine_similarity(matrix)

In [17]:
 def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]  # Get the index of the input song in the DataFrame
    distances = sorted(list(enumerate(similar[idx])),reverse=True,key=lambda x:x[1])  # Calculate the similarity scores between the input song and all other songs in the DataFrame

    if 'song' not in df.columns:  # Check if the DataFrame contains a 'song' column
            return "The dataframe does not contain a 'song' column."

    songs = []  # Initialize an empty list to store the recommended songs
    for m_id in distances[1:5]:  # Iterate over the top 5 most similar songs
        songs.append(df.iloc[m_id[0]].song)  # Append the name of the recommended song to the list

    return songs

In [None]:
print(recommendation("Dirt")) #Checking the Output

['Finding My Way', 'Tomorrow May Never Come', "I Can't Deny It", 'Now Is The Time']


In [None]:
#Saving the objects
import pickle
pickle.dump(similar , open ("similarity", "wb"))
pickle.dump(df , open ("df", "wb"))