In [32]:
import joblib
import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [44]:
# Load the song catalogue into a DataFrame.
# NOTE: despite its name, `url` holds a relative path to a local CSV file,
# so nothing is downloaded here.
url = "music_indian.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [46]:
# Build one free-text "features" field per song by joining its descriptive
# columns with single spaces; missing values contribute an empty string so
# the concatenation never yields NaN.
df['features'] = df['Region'].fillna('').str.cat(
    [df[column].fillna('') for column in ('Festival', 'Tradition', 'Song Name', 'Author')],
    sep=' ',
)

In [48]:
# Vectorize the combined text: learn the vocabulary / IDF weights from the
# 'features' column (English stop words removed) and encode every song as
# one TF-IDF row of `tfidf_matrix`.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=df['features'])

In [40]:
# Pairwise cosine similarity between the TF-IDF rows of all songs.
# With a single argument the matrix is compared against itself, so
# cosine_sim[i][j] is the similarity between song i and song j.
cosine_sim = cosine_similarity(tfidf_matrix)

In [54]:
# Function to get recommendations
def get_recommendations(region='', festival='', tradition='', cosine_sim=cosine_sim, top_n=10):
    """Return the songs most similar to a randomly chosen song matching the filters.

    The filters only select the candidate "seed" songs in the module-level
    ``df``. One seed is drawn at random and the songs with the highest
    similarity to it are returned; the returned songs themselves are NOT
    restricted to the filters. Because the seed is random, repeated calls
    with the same arguments may return different results.

    Parameters
    ----------
    region, festival, tradition : str
        Case-insensitive substrings matched literally (not as regular
        expressions) against the 'Region', 'Festival' and 'Tradition'
        columns. An empty string disables that filter; rows with a missing
        value never match an active filter.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix. Row/column ``i`` must correspond to the
        row at position ``i`` of ``df``.
    top_n : int
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    list of dict
        One record per recommended song with the keys 'Song Name', 'Author',
        'Region', 'Festival', 'Tradition' and 'URL', ordered from most to
        least similar. Empty when no song matches the filters.
    """
    # Start from an all-True Series so the mask is always a proper row
    # indexer. (A bare Python `True`, which is what chaining `... if x else
    # True` produces when every filter is empty, makes df[mask] raise
    # KeyError.)
    mask = pd.Series(True, index=df.index)
    selections = (('Region', region), ('Festival', festival), ('Tradition', tradition))
    for column, wanted in selections:
        if wanted:
            # regex=False: values such as "Holi (Spring)" must match
            # themselves instead of being parsed as a pattern.
            mask = mask & df[column].str.contains(wanted, case=False, na=False, regex=False)

    # Work with row *positions*: cosine_sim and df.iloc are positional, so
    # index labels must not be used to address them.
    matching_positions = np.flatnonzero(mask.to_numpy(dtype=bool))
    if matching_positions.size == 0:
        return []

    # Pick one matching song at random to act as the seed.
    seed_pos = int(pd.Series(matching_positions).sample().iloc[0])

    # Rank every other song by its similarity to the seed. The seed is
    # excluded explicitly rather than by slicing off the first entry, because
    # ties (duplicate or empty feature text) mean it need not sort first.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[seed_pos]) if pos != seed_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )
    item_indices = [pos for pos, _ in ranked[:max(top_n, 0)]]

    # Return the most similar songs as plain records
    return df.iloc[item_indices][['Song Name', 'Author', 'Region', 'Festival', 'Tradition', 'URL']].to_dict('records')


In [56]:
# Sorted, de-duplicated, non-missing values of each filter column; these are
# the option lists offered in the dropdowns.
regions, festivals, traditions = (
    sorted(df[column].dropna().unique().tolist())
    for column in ('Region', 'Festival', 'Tradition')
)

In [58]:
# Save the preprocessed data and categories
# Three files are written to the current working directory:
#   - preprocessed_music_data.csv : df including the derived 'features'
#     column. index=False omits the row labels; rows keep their order, which
#     is what the positional similarity matrix relies on.
#   - cosine_sim_matrix.joblib    : the pairwise similarity matrix.
#   - categories.joblib           : the dropdown option lists.
# (joblib is imported in the notebook's import cell.)
df.to_csv('preprocessed_music_data.csv', index=False)
joblib.dump(cosine_sim, 'cosine_sim_matrix.joblib')
joblib.dump({
    'regions': regions,
    'festivals': festivals,
    'traditions': traditions
}, 'categories.joblib')

print("Data processing complete!")

Data processing complete!
