In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    # Build the full path of every file found directly in this folder, then print each one.
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import warnings

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Location of the Spotify 2023 CSV inside the Kaggle input directory.
csv_path = '/kaggle/input/top-spotify-songs-2023/spotify-2023.csv'

# The file is decoded as ISO-8859-1 text.
data = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Preview the first rows of the loaded frame.
data.head()



In [None]:
# Number of distinct values in the 'released_year' column.
len(data['released_year'].unique())

In [None]:
# Dataset dimensions: row count first, column count second.
num_rows = data.shape[0]
num_columns = data.shape[1]

(num_rows, num_columns)


In [None]:
# Per-column count of missing entries.
missing_values = data.isnull().sum()

# Show only the columns that actually contain missing entries.
has_missing = missing_values > 0
missing_values.loc[has_missing]


In [None]:
# Get basic statistics of the numerical columns
# (for a frame with mixed dtypes, describe() summarises only the numeric columns by default).
basic_statistics = data.describe()

basic_statistics


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Ten most frequent values of 'artist(s)_name', i.e. the artist credits with the most songs.
top_artists = data['artist(s)_name'].value_counts().head(10)

# Horizontal bar chart: one bar per artist credit, bar length = number of songs.
plt.figure(figsize=(12, 6))
artist_axes = sns.barplot(x=top_artists.values, y=top_artists.index, palette='viridis')
artist_axes.set_xlabel('Number of Songs')
artist_axes.set_ylabel('Artist(s) Name')
artist_axes.set_title('Top 10 Artists with Most Songs')
plt.show()

top_artists


**Here are the top 10 artists with the most songs in the dataset:**

* Taylor Swift: 34 songs
* The Weeknd: 22 songs
* Bad Bunny: 19 songs
* SZA: 19 songs
* Harry Styles: 17 songs
* Kendrick Lamar: 12 songs
* Morgan Wallen: 11 songs
* Ed Sheeran: 9 songs
* BTS: 8 songs
* Feid: 8 songs

In [None]:
# Check data types of all columns (an `object` dtype on a count column means it was read as text).
data.dtypes


In [None]:
# Parse 'streams' as numbers. With errors='coerce', any entry that cannot be parsed
# becomes NaN instead of raising, so the column may end up as float rather than integer.
parsed_streams = pd.to_numeric(data['streams'], errors='coerce')
data['streams'] = parsed_streams

# Check data types again
data.dtypes


In [None]:
# Ten rows with the largest 'streams' values, keeping only the identifying columns.
stream_columns = ['track_name', 'artist(s)_name', 'streams']
streams_sorted = data[stream_columns].sort_values(by='streams', ascending=False)
top_spotify_streams = streams_sorted.head(10)

# Horizontal bar chart of stream counts per track.
plt.figure(figsize=(12, 6))
streams_axes = sns.barplot(x=top_spotify_streams['streams'], y=top_spotify_streams['track_name'], palette='viridis')
streams_axes.set_xlabel('Streams (in billions)')
streams_axes.set_ylabel('Track Name')
streams_axes.set_title('Top 10 Songs with Most Streams on Spotify')
plt.xticks(rotation=45)
plt.show()

top_spotify_streams


**Here are the top 10 songs with the most streams on Spotify:**

1. "Blinding Lights" by The Weeknd: ~3.70 billion streams
2. "Shape of You" by Ed Sheeran: ~3.56 billion streams
3. "Someone You Loved" by Lewis Capaldi: ~2.89 billion streams
4. "Dance Monkey" by Tones and I: ~2.86 billion streams
5. "Sunflower - Spider-Man: Into the Spider-Verse" by Post Malone, Swae Lee: ~2.81 billion streams
6. "One Dance" by Drake, WizKid, Kyla: ~2.71 billion streams
7. "STAY (with Justin Bieber)" by Justin Bieber, The Kid Laroi: ~2.67 billion streams
8. "Believer" by Imagine Dragons: ~2.59 billion streams
9. "Closer" by The Chainsmokers, Halsey: ~2.59 billion streams
10. "Starboy" by The Weeknd, Daft Punk: ~2.57 billion streams

In [None]:
# Ten rows with the largest 'in_apple_playlists' values, keeping only the identifying columns.
apple_columns = ['track_name', 'artist(s)_name', 'in_apple_playlists']
apple_sorted = data[apple_columns].sort_values(by='in_apple_playlists', ascending=False)
top_apple_playlists = apple_sorted.head(10)

# Horizontal bar chart of Apple Music playlist counts per track.
plt.figure(figsize=(12, 6))
apple_axes = sns.barplot(x=top_apple_playlists['in_apple_playlists'], y=top_apple_playlists['track_name'], palette='viridis')
apple_axes.set_xlabel('Number of Apple Music Playlists')
apple_axes.set_ylabel('Track Name')
apple_axes.set_title('Top 10 Songs with Highest Presence in Apple Music Playlists')
plt.xticks(rotation=45)
plt.show()

top_apple_playlists


In [None]:
# Histogram of 'danceability_%' in 20 bins, with a kernel-density curve overlaid.
plt.figure(figsize=(10, 6))
danceability_axes = sns.histplot(data['danceability_%'], bins=20, kde=True, color='purple')
danceability_axes.set_xlabel('Danceability (%)')
danceability_axes.set_ylabel('Frequency')
danceability_axes.set_title('Distribution of Danceability')
plt.show()


In [None]:
# Mean 'danceability_%' of the songs released in each year.
danceability_per_year = data.groupby('released_year')['danceability_%']
average_danceability_by_year = danceability_per_year.mean()

# Line chart of that yearly mean against release year.
plt.figure(figsize=(10, 6))
trend_axes = sns.lineplot(x=average_danceability_by_year.index, y=average_danceability_by_year.values, color='blue')
trend_axes.set_xlabel('Year')
trend_axes.set_ylabel('Average Danceability (%)')
trend_axes.set_title('Trends in Danceability Over the Years')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Select columns for cross-platform metrics
cross_platform_columns = [
    'in_spotify_playlists', 'in_spotify_charts', 'in_apple_playlists',
    'in_apple_charts', 'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts'
]

# Coerce every metric to a number before correlating. Like 'streams', some of these
# columns may have been read as text (e.g. counts written with thousands separators
# such as "1,021") -- TODO confirm against data.dtypes. A text column makes .corr()
# raise on pandas >= 2.0 and is silently left out of the matrix on older versions.
# Commas are stripped first; anything still unparseable becomes NaN.
cross_platform_metrics = data[cross_platform_columns].apply(
    lambda column: pd.to_numeric(
        column.astype(str).str.replace(',', '', regex=False), errors='coerce'
    )
)

# Calculate the correlation matrix (pairwise, NaN entries are ignored by .corr())
correlation_matrix = cross_platform_metrics.corr()

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Heatmap: Cross-Platform Metrics')
plt.show()


In [None]:
# Audio features to compare against the popularity measure.
audio_features_columns = ['danceability_%', 'energy_%', 'valence_%']
popularity_column = 'streams'

# One scatter panel per feature, laid out side by side in a 1x3 grid.
plt.figure(figsize=(15, 5))
for idx, feature in enumerate(audio_features_columns, start=1):
    feature_label = feature.replace('_', ' ').title()
    panel = plt.subplot(1, 3, idx)
    sns.scatterplot(x=data[feature], y=data[popularity_column], color='blue', alpha=0.5, ax=panel)
    panel.set_xlabel(feature_label)
    panel.set_ylabel('Number of Streams')
    panel.set_title(f'{feature_label} vs. Number of Streams')

plt.tight_layout()
plt.show()


In [None]:
# Audio features to compare against the popularity measure.
audio_features_columns = ['danceability_%', 'energy_%', 'valence_%']
popularity_column = 'streams'

# All three features are drawn on the same axes; the legend tells them apart.
plt.figure(figsize=(12, 4))
for feature in audio_features_columns:
    legend_label = feature.replace('_', ' ').title()
    sns.scatterplot(data=data, x=feature, y=popularity_column, alpha=0.5, label=legend_label)

plt.xlabel('Audio Feature')
plt.ylabel('Number of Streams')
plt.title('Relationship Between Audio Features and Song Popularity')
plt.legend()
plt.show()


In [None]:
import plotly.express as px

# Select columns for audio features and popularity
audio_features_columns = ['danceability_%', 'energy_%', 'valence_%']
popularity_column = 'streams'

# Human-readable label for each feature, e.g. 'danceability_%' -> 'Danceability %'
feature_labels = {feature: feature.replace('_', ' ').title() for feature in audio_features_columns}
initial_feature = audio_features_columns[0]

# Streams on x, the selected audio feature on y.
# (Previously both axes were 'streams', which can only draw a diagonal line, and the
# `labels` dict used the keys 'x'/'y' although plotly express expects column names.)
fig = px.scatter(data, x=popularity_column, y=initial_feature,
                 labels={popularity_column: 'Number of Streams',
                         initial_feature: feature_labels[initial_feature]},
                 title='Relationship Between Audio Features and Song Popularity')

# Add dropdown menu for audio features.
# 'update' takes [trace changes, layout changes]: each button swaps the plotted y values
# and the hover text as well as the axis title. A 'relayout' button would only rename
# the axis and leave the points untouched.
fig.update_layout(
    updatemenus=[
        {
            'buttons': [
                {
                    'method': 'update',
                    'label': label,
                    'args': [
                        {
                            'y': [data[feature].tolist()],
                            'hovertemplate': f'Number of Streams=%{{x}}<br>{label}=%{{y}}<extra></extra>',
                        },
                        {'yaxis.title.text': label},
                    ],
                }
                for feature, label in feature_labels.items()
            ],
            'direction': 'down',
            'showactive': True,
            'x': 0.15,
            'xanchor': 'left',
            'y': 1.15,
            'yanchor': 'top'
        }
    ]
)

fig.show()


In [None]:
import plotly.express as px

# Interactive scatter: danceability on x, stream count on y.
fig = px.scatter(data, x='danceability_%', y='streams', title='Popularity vs. Danceability')

# Replace the default (column-name) axis titles with readable ones.
axis_titles = {
    'xaxis': {'title': {'text': 'Danceability (%)'}},
    'yaxis': {'title': {'text': 'Number of Streams'}},
}
fig.update_layout(**axis_titles)
fig.show()


In [None]:
import plotly.express as px

# Interactive box plot: one box of 'valence_%' values per release year.
fig = px.box(data, x='released_year', y='valence_%', title='Valence Distribution by Year')

# Replace the default (column-name) axis titles with readable ones.
axis_titles = {
    'xaxis': {'title': {'text': 'Year'}},
    'yaxis': {'title': {'text': 'Valence (%)'}},
}
fig.update_layout(**axis_titles)
fig.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Audio features whose pairwise correlation is inspected.
audio_features_columns = ['danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 'instrumentalness_%']

# Pairwise correlation between those columns.
audio_features = data[audio_features_columns]
correlation_matrix = audio_features.corr()

# Annotated heatmap; the colour scale is centred on zero correlation.
plt.figure(figsize=(10, 8))
heatmap_axes = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_axes.set_title('Correlation Heatmap: Audio Features')
plt.show()


In [None]:
pip install plotly


In [None]:
import plotly.express as px

# Number of artist credits offered in the dropdown (the most frequent ones). A button for
# every unique artist produces a menu far too long to use.
MAX_ARTISTS_IN_MENU = 10

# Create an interactive scatter plot with dropdown menu for Spotify
fig = px.scatter(data, x='danceability_%', y='streams', color='streams',
                 title='Danceability vs. Spotify Streams',
                 labels={'danceability_%': 'Danceability (%)', 'streams': 'Number of Streams'},
                 hover_name='track_name', template='plotly_dark')

# Menu entries: the full dataset first, then one subset per frequent artist credit.
menu_artists = data['artist(s)_name'].value_counts().head(MAX_ARTISTS_IN_MENU).index
artist_subsets = [('All artists', data)]
artist_subsets += [(artist, data[data['artist(s)_name'] == artist]) for artist in menu_artists]

# Add dropdown menu for artists.
# 'update' takes [trace changes, layout changes]. Each button replaces the points, their
# colours and hover names with the chosen subset, so the figure really is filtered.
# A 'relayout' button would only rename the y-axis and keep showing every song.
fig.update_layout(
    updatemenus=[
        {
            'buttons': [
                {
                    'method': 'update',
                    'label': label,
                    'args': [
                        {
                            'x': [subset['danceability_%'].tolist()],
                            'y': [subset['streams'].tolist()],
                            'marker.color': [subset['streams'].tolist()],
                            'hovertext': [subset['track_name'].tolist()],
                        },
                        {
                            'yaxis.title.text': (
                                'Number of Streams' if label == 'All artists'
                                else f'Number of Streams for {label}'
                            )
                        },
                    ],
                }
                for label, subset in artist_subsets
            ],
            'direction': 'down',
            'showactive': True,
            'x': 0.15,
            'xanchor': 'left',
            'y': 1.15,
            'yanchor': 'top'
        }
    ]
)

# Show the interactive plot
fig.show()


In [None]:
import plotly.express as px

# Create an interactive scatter plot with dropdown menu for multiple audio features on Spotify
fig = px.scatter(data, x='danceability_%', y='streams', color='streams',
                 title='Audio Features vs. Spotify Streams',
                 labels={'danceability_%': 'Danceability (%)', 'streams': 'Number of Streams'},
                 hover_name='track_name', template='plotly_dark')

# Pick the audio features by name: every audio-feature column used in this notebook
# ('danceability_%', 'energy_%', 'valence_%', ...) ends in '_%'. Selecting by float dtype
# is unreliable here -- it can pick up 'streams' itself after the numeric coercion and
# miss features stored as integers.
feature_labels = {
    column: column.replace('_', ' ').title()
    for column in data.columns if column.endswith('_%')
}

# Add dropdown menu for audio features.
# 'update' takes [trace changes, layout changes]: each button swaps the plotted x values
# and the hover text as well as the axis title. A 'relayout' button would only rename
# the axis while the points kept showing danceability.
fig.update_layout(
    updatemenus=[
        {
            'buttons': [
                {
                    'method': 'update',
                    'label': label,
                    'args': [
                        {
                            'x': [data[feature].tolist()],
                            'hovertemplate': (
                                '<b>%{hovertext}</b><br><br>'
                                f'{label}=%{{x}}<br>Number of Streams=%{{y}}<extra></extra>'
                            ),
                        },
                        {'xaxis.title.text': label},
                    ],
                }
                for feature, label in feature_labels.items()
            ],
            'direction': 'down',
            'showactive': True,
            'x': 0.15,
            'xanchor': 'left',
            'y': 1.15,
            'yanchor': 'top'
        }
    ]
)

# Show the interactive plot
fig.show()


In [None]:
# Preview the first five rows of `data` before it is cut down to a few columns.
data.head()

In [None]:
# Missing-value count for every column of `data`.
data.isnull().sum()

In [None]:
# Keep only the columns used from here on, plus 'title': a second copy of 'track_name'
# under its own column name.
selected_columns = ["track_name","artist(s)_name","mode","in_spotify_playlists","title","released_year","streams"]

# .copy() makes `data` an independent frame, so columns assigned to it later are not
# written into a slice of the original frame (the cause of SettingWithCopyWarning).
data = data.assign(title=data["track_name"])[selected_columns].copy()

data.head()

In [None]:
# Column dtypes of the reduced frame.
data.dtypes

In [None]:
# data["released_year"]=pd.DataFrame.from_dict(data["released_year"], errors='1234567890')
# # pd.DataFrame.from_dict(dict)

In [None]:
# Column dtypes again (nothing has modified `data` since the previous dtype check).
data.dtypes

In [None]:
# Preview the first five rows of the reduced frame.
data.head()

In [None]:
# Number of rows that are exact duplicates of an earlier row (all columns equal).
data.duplicated().sum()

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Quick look at the two clustering inputs: release year on x, streams on y.
plt.scatter(data['released_year'],data['streams'])

In [None]:
#  Convert the 'streams' column from float to integer

from sklearn.preprocessing import LabelEncoder

# LabelEncoder is designed for categorical target labels. Applied to 'streams' it replaces
# every stream count with its rank among the unique values (0, 1, 2, ...), which discards
# the real magnitudes that the clustering and the 'Streams' axis are meant to show.
# Instead: drop the rows whose stream count is missing (NaN cannot be cast to int),
# renumber the index so labels stay equal to row positions, then cast.
data = data.dropna(subset=['streams']).reset_index(drop=True)
data['streams'] = data['streams'].astype('int64')
# data.dtypes


In [None]:


# Feature matrix: the "streams" and "released_year" columns
X = data[['streams', 'released_year']]

# Standardize both features before K-Means
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: fit K-Means for K = 1..10 and record each fitted model's inertia
cluster_counts = range(1, 11)
inertia = []
for k in cluster_counts:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot inertia against the number of clusters
plt.figure(figsize=(8, 6))
plt.plot(cluster_counts, inertia, marker='o', linestyle='--')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal K')
plt.grid()
plt.show()



In [None]:
# Number of clusters chosen after inspecting the elbow plot.
k = 3

# Fit K-Means on the standardized features and store each row's cluster label (0..k-1)
# in a new 'cluster' column.
kmeans = KMeans(n_clusters=k, random_state=0)
data['cluster'] = kmeans.fit_predict(X_scaled)

# Scatter the rows of each cluster separately so every cluster gets its own colour
# and legend entry (legend numbering starts at 1, labels start at 0).
for i in range(k):
    in_cluster = data['cluster'] == i
    cluster_data = data.loc[in_cluster]
    plt.scatter(cluster_data['streams'], cluster_data['released_year'], label=f'Cluster {i + 1}')

plt.xlabel('Streams')
plt.ylabel('Released_Year')
plt.title('Clustering Results')
plt.legend()
plt.show()

In [None]:
# Preview the first five rows, now including the 'cluster' column.
data.head()

In [None]:
# Drop exact duplicate rows, then renumber the index 0..n-1 so that index labels and
# row positions coincide (drop_duplicates alone keeps the old labels, leaving gaps).
movie=data.drop_duplicates().reset_index(drop=True)

In [None]:
# Count of remaining exact duplicate rows in `movie`.
movie.duplicated().sum()

In [None]:
# Turn each track name into a list of its whitespace-separated words.
# Not re-runnable: a second run would call .split() on a list and fail.
movie['track_name']=movie['track_name'].apply(lambda x:x.split())


In [None]:
 # Preview the first five rows of `movie`.
 movie.head()

In [None]:
# Split the artist credit into individual artist names. Credits list several artists
# separated by commas (e.g. "Post Malone, Swae Lee"), so split on ',' and trim each name.
# Splitting on whitespace would break "Post Malone" into two unrelated word tokens and
# leave nothing for the space-removal step that follows to do.
movie['artist(s)_name']=movie['artist(s)_name'].apply(lambda x:[name.strip() for name in x.split(',')])

In [None]:
# Preview the first five rows of `movie`.
movie.head()

In [None]:
# Wrap each 'mode' value in a list of its whitespace-separated words, so it can be
# concatenated with the other token lists.
movie['mode']=movie['mode'].apply(lambda x:x.split())

In [None]:
# Remove any space characters inside each artist token.
movie['artist(s)_name']=movie['artist(s)_name'].apply(lambda x:[i.replace(" ","")for i in x])


In [None]:
# Preview the first five rows of `movie`.
movie.head()

In [None]:
# 'tags' = track-name tokens + artist tokens + mode tokens, concatenated into one list per row.
movie['tags']=movie['track_name']+movie['artist(s)_name']+movie['mode']

In [None]:
# Preview the first five rows, now including the 'tags' column.
movie.head(5)

In [None]:
# Frame used by the recommender: playlist count, original title, tags and cluster label.
# .copy() makes it independent of `movie`, so the later assignments to new_df['tags']
# do not operate on a slice of another frame (SettingWithCopyWarning).
new_df=movie[['in_spotify_playlists','title','tags','cluster']].copy()

In [None]:
# Preview the first five rows of `new_df`.
new_df.head()

In [None]:
# Collapse each tags list into a single space-separated string.
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))

In [None]:
# Preview the first five rows of `new_df`.
new_df.head()

In [None]:
# Tags string of the row whose index label is 0.
new_df['tags'][0]

In [None]:
# Lower-case every tags string.
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at the 5000 most frequent terms, with
# scikit-learn's built-in English stop-word list removed.
cv= CountVectorizer(max_features=5000,stop_words='english')

In [None]:
 # Learn the vocabulary from the tags and build the dense term-count matrix:
 # one row per row of new_df, in the same order.
 vector=cv.fit_transform(new_df['tags']).toarray()

In [None]:
# # vector[0]
# cv.get_feature_names_out()

In [None]:
import nltk

In [None]:
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance used by the `stem` helper.
ps=PorterStemmer()

In [None]:
def stem(text, stemmer=None):
    """Stem every whitespace-separated word of ``text``.

    Parameters
    ----------
    text : str
        Space-separated words (a tags string).
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply to each word. Defaults to the notebook-level ``ps``.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    if stemmer is None:
        stemmer = ps
    # Split into words first. Iterating over a str directly yields single characters,
    # so "love" would be stemmed letter by letter and come back as "l o v e".
    return " ".join(stemmer.stem(word) for word in text.split())

In [None]:
# Run `stem` over every tags string.
# NOTE(review): `vector` was built from new_df['tags'] in an earlier cell, before this
# step, so the count matrix does not reflect this change unless vectorization is re-run.
new_df['tags']=new_df['tags'].apply(stem)
# cv.get_feature_names_out()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between all rows of `vector`:
# similarity[i][j] compares row i with row j.
similarity=cosine_similarity(vector)

In [None]:
# Spot check: similarity between the rows at positions 0 and 3.
similarity[0][3]

In [None]:
def recommend(rmovie,cluster=False):
    """Print the six rows of ``new_df`` most similar to the song titled ``rmovie``.

    Rows are ranked by descending value in the seed song's row of ``similarity``.
    The seed song is part of that ranking, so it normally appears in the output.
    Each printed line is: playlist count, cluster label, title.

    Parameters
    ----------
    rmovie : str
        Exact value of the 'title' column identifying the seed song. If several rows
        share the title, the first one is used.
    cluster : bool, default False
        When true, only rows with the same cluster label as the seed song are printed.

    Raises
    ------
    IndexError
        If no row of ``new_df`` has the requested title.
    """
    # Work with row *positions*. `similarity` is a plain array and `.iloc` is positional,
    # whereas `.index` holds labels, which stop matching positions once rows were dropped
    # without resetting the index.
    matching_positions = [pos for pos, title in enumerate(new_df['title']) if title == rmovie]
    if not matching_positions:
        raise IndexError(f"No song titled {rmovie!r} in new_df")
    movie_pos = matching_positions[0]
    seed_cluster = new_df.iloc[movie_pos]['cluster']

    ranked = sorted(enumerate(similarity[movie_pos]), reverse=True, key=lambda pair: pair[1])

    shown = 0
    for pos, _score in ranked:
        # Stop once six rows have been printed.
        if shown > 5:
            break
        row = new_df.iloc[pos]
        if cluster and row['cluster'] != seed_cluster:
            continue
        print(row['in_spotify_playlists'], row['cluster'], row['title'])
        shown += 1


        

In [None]:
# Recommend songs similar to 'Normal', restricted to its own cluster (second argument True).
recommend('Normal',True)

In [None]:
# Recommend songs similar to 'Hey Mor' from all clusters (cluster filter left at its default, False).
recommend('Hey Mor')

In [None]:
# Recommend songs similar to 'Jingle Bells - Remastered 1999' from all clusters (cluster filter explicitly False).
recommend('Jingle Bells - Remastered 1999',False)