In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [5]:
# 1. Load Dataset
# Read the exported Spotify lyrics CSV into a DataFrame; later cells read
# its 'text', 'song' and 'artist' columns.
df = pd.read_csv(
    filepath_or_buffer='data/Spotify_Million_Song_Dataset_exported.csv',
)

In [6]:
# 2. Text Preprocessing
def clean_lyrics(text):
    """Normalize raw lyric text before TF-IDF vectorization.

    The input is lower-cased and every character that is not an ASCII
    letter (a-z) or whitespace is deleted, so digits, punctuation and
    accented letters are dropped.

    Args:
        text: Lyrics to normalize. Non-string values are converted with
            ``str()``. ``None`` and float NaN (the value pandas uses for an
            empty CSV cell) are treated as missing text.

    Returns:
        str: The normalized text, or ``''`` when the input is missing.
    """
    # Without this guard a missing value is stringified to the literal word
    # "none" / "nan" and would be indexed as if it were a real lyric token.
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # Characters are deleted, not replaced by a space: "don't" -> "dont".
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Store a normalized copy of every lyric next to the raw 'text' column.
df['cleaned_text'] = df['text'].map(clean_lyrics)

In [7]:
# 3. Build Similarity Engine
# The vectorizer drops English stop words and caps the vocabulary at
# 10,000 terms; fitting on the cleaned lyrics yields one TF-IDF row per song.
vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(df['cleaned_text'])

def identify_song(snippet):
    """Identify the song whose lyrics best match a text snippet.

    The snippet is cleaned with ``clean_lyrics``, transformed with the
    fitted ``vectorizer``, and compared against every row of
    ``tfidf_matrix`` by cosine similarity.

    Args:
        snippet: Free-text fragment of lyrics to look up.

    Returns:
        dict: Keys ``"song"``, ``"artist"`` and ``"confidence"``.
        ``confidence`` is the highest cosine similarity as a plain Python
        ``float`` rounded to 4 decimals. When the snippet has no overlap
        with the fitted vocabulary (best similarity is 0), ``song`` and
        ``artist`` are ``None`` and ``confidence`` is ``0.0``.
    """
    cleaned_snippet = clean_lyrics(snippet)
    snippet_vec = vectorizer.transform([cleaned_snippet])

    # One similarity score per song in the corpus.
    similarities = cosine_similarity(snippet_vec, tfidf_matrix).flatten()
    best_idx = int(similarities.argmax())
    # Convert to a built-in float so the result prints and serializes as
    # 0.6816 instead of np.float64(0.6816).
    best_score = float(similarities[best_idx])

    # If every score is zero, argmax() returns index 0; reporting the first
    # row of the dataset as a "match" would be misleading.
    if best_score <= 0.0:
        return {"song": None, "artist": None, "confidence": 0.0}

    # Look the row up once instead of once per field.
    best_row = df.iloc[best_idx]
    return {
        "song": best_row['song'],
        "artist": best_row['artist'],
        "confidence": round(best_score, 4)
    }

In [8]:
# 4. Demonstration
# Single sanity-check query (one example, not an accuracy measurement):
# look up the snippet "Take it easy with me please touch me gently".
print(identify_song(snippet="Take it easy with me please touch me gently"))

{'song': 'Gently', 'artist': 'Elvis Presley', 'confidence': np.float64(0.6816)}
