## Import Required Libraries

In [123]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Exploring the Dataset

In [124]:
# Read both CSV files from the working directory: one frame of tracks and
# one frame of artists.
sp_tracks, sp_artists = (
    pd.read_csv(csv_name) for csv_name in ('tracks.csv', 'artists.csv')
)

In [None]:
# Preview the first five rows of the tracks frame.
sp_tracks.head(n=5)

In [None]:
# Preview the first five rows of the artists frame.
sp_artists.head(n=5)

## Identifying Null Values in the Dataset

In [None]:
# Count the missing values in every column of the tracks frame.
sp_tracks.isna().sum()

In [None]:
# Count the missing values in every column of the artists frame.
sp_artists.isna().sum()

## Dataset Overview: Rows, Columns, Data Types, and Memory Usage

In [None]:
# Print a concise summary of the tracks frame: row count, column names,
# non-null counts, dtypes and memory usage.
sp_tracks.info()

In [None]:
# Print a concise summary of the artists frame: row count, column names,
# non-null counts, dtypes and memory usage.
sp_artists.info()

## Extracting Insights from the Dataset through Analysis📊

### 1. Exploring the 10 Least Popular Songs in the Spotify Dataset

In [None]:
# Order the tracks from least to most popular and keep the first ten rows.
least_popular = sp_tracks.sort_values('popularity', ascending=True).iloc[:10]
# Show only the title and its popularity score.
least_popular[['name', 'popularity']]

### 2. Descriptive Statistics

In [None]:
# Summary statistics for the numeric track columns, transposed so that each
# column of the data becomes one row of the table.
sp_tracks.describe().T

### 3. Discovering the Top 10 Popular Songs in the Spotify Dataset

In [None]:
# Ten most popular tracks.
# Sorting the whole frame and taking the first ten rows always yields ten
# tracks (when the data has at least ten rows). The previous hard-coded
# `popularity > 90` pre-filter would silently return fewer rows whenever
# fewer than ten tracks exceed that score.
top_tracks = sp_tracks.sort_values('popularity', ascending=False).head(10)
top_tracks[['name', 'popularity', 'artists']]

### 4. Setting Release Date as the Index Column

In [None]:
# Use the release date as the index of the tracks frame.
# The guard keeps this cell re-runnable: once 'release_date' has been moved
# into the index it is no longer a column, and an unconditional set_index
# would raise KeyError on a second execution.
if 'release_date' in sp_tracks.columns:
    sp_tracks = sp_tracks.set_index('release_date')

# errors='coerce' turns unparseable values into NaT instead of raising;
# format='mixed' lets each value be parsed with its own inferred format.
sp_tracks.index = pd.to_datetime(sp_tracks.index, errors='coerce', format='mixed')

# Year of each track's release date, taken from the datetime index.
years = sp_tracks.index.year
sp_tracks.head()


### 5. Extracting the Artist Name from the Row at Index Position 18 (the 19th Row) of the Dataset

In [None]:
# Keep only the 'artists' column, then pick the row at integer position 18.
# iloc is zero-based, so this is the 19th row of the frame.
artists_only = sp_tracks[['artists']]
artists_only.iloc[18]

### 6. Converting Song Duration from Milliseconds to Seconds

In [None]:
# Convert the track length from milliseconds to whole seconds.
# The arithmetic is vectorised instead of a per-row apply(lambda ...), and
# the guard keeps the cell re-runnable: after the first execution
# 'duration_ms' no longer exists, so recomputing or dropping it again would
# raise KeyError.
if 'duration_ms' in sp_tracks.columns:
    # Series.round() rounds exact halves to the nearest even integer, the
    # same rule Python's built-in round() applies.
    sp_tracks['duration'] = (sp_tracks['duration_ms'] / 1000).round().astype('int64')
    sp_tracks = sp_tracks.drop(columns='duration_ms')

sp_tracks['duration'].head()

### 7. Visualization: Pearson Correlation Heatmap of the Numeric Variables

In [None]:
# Pairwise Pearson correlation between all numeric track columns.
numeric_tracks = sp_tracks.select_dtypes(include='number')
corr_matrix = numeric_tracks.corr(method='pearson')

# Annotated heatmap with the colour scale fixed to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(9, 5))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.1g',
    vmin=-1,
    vmax=1,
    center=0,
    cmap='Greens',
    linewidths=0.1,
    linecolor='black',
    ax=ax,
)
ax.set_title('Correlation HeatMap')
# Turn the column labels vertical.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

### 8. Creating a 0.4% Sample of the Entire Dataset

In [None]:
# Draw a random subset of the tracks for the regression plots below.
SAMPLE_FRACTION = 0.004  # share of rows to keep, i.e. 0.4% of the tracks
SAMPLE_SEED = 42         # fixed seed so the sample, and every plot built
                         # from it, is identical on each run

sample_sp = sp_tracks.sample(
    int(SAMPLE_FRACTION * len(sp_tracks)),
    random_state=SAMPLE_SEED,
)
print(len(sample_sp))

### 9. Regression Plot of Loudness vs. Energy with Regression Line

In [None]:
# Scatter of loudness against energy for the sampled tracks, with a fitted
# regression line.
fig, ax = plt.subplots(figsize=(8, 4))
sns.regplot(x='energy', y='loudness', data=sample_sp, color='#054907', ax=ax)
ax.set(title='Regression Plot - Loudness vs Energy Correlation')

### 10. Regression Plot of Popularity vs. Acousticness with Regression Line

In [None]:
# Scatter of popularity against acousticness for the sampled tracks, with a
# fitted regression line.
fig, ax = plt.subplots(figsize=(8, 4))
sns.regplot(x='acousticness', y='popularity', data=sample_sp, color='#008000', ax=ax)
ax.set(title='Regression Plot - Popularity vs Acousticness Correlation')

### 11. Adding a New Column to the Tracks Table

In [None]:
# Copy the 'release_date' index into a regular 'dates' column holding
# datetime values.
release_dates = sp_tracks.index.get_level_values('release_date')
sp_tracks['dates'] = pd.to_datetime(release_dates)

# Release year of every track; the plots further down use it as the x axis.
years = sp_tracks['dates'].dt.year
sp_tracks.head()

### 12. Graph: Number of Songs per Year

In [None]:
# Histogram of release years; discrete=True gives every year its own bar.
year_grid = sns.displot(years, kind='hist', discrete=True, height=4, aspect=2, color='g')
year_grid.set(title='No of songs - per year')

### 13. Bar Plot: Average Song Duration per Year

In [None]:
# Bar plot of song duration per release year. barplot aggregates the tracks
# of each year with its default estimator, the mean.
total_dr = sp_tracks.duration
fig_dims = (15, 5)
fig, ax = plt.subplots(figsize=fig_dims)

# errorbar=None skips the bootstrap confidence intervals entirely. They were
# previously computed for every year and then hidden by drawing them with a
# zero line width, which is wasted work on a large frame.
sns.barplot(x=years, y=total_dr, ax=ax, errorbar=None)

# Set the title on the Axes directly so that `fig` keeps referring to the
# Figure instead of being overwritten with the return value of .set().
ax.set(title='Years vs Duration')
plt.xticks(rotation=90)

### 14. Horizontal Bar Plot: Total Song Duration for the Top 10 Genres

In [145]:
# Extract the first artist id of every track so it can be matched against
# the `id` column of sp_artists.
# NOTE(review): assumes `id_artists` holds a stringified list such as
# "['id1', 'id2']" - confirm against the raw CSV.
# Split on the comma FIRST and strip afterwards. Stripping the whole string
# first only removes brackets/quotes at its two ends, so the first id of a
# multi-artist row kept a trailing quote ("id1'") and could never match an
# artist id in the merge.
sp_tracks['id_artists_clean'] = (
    sp_tracks['id_artists']
    .str.split(',')
    .str[0]
    .str.strip("[]'\" ")
)


In [146]:
# Attach each track's genres by joining on the cleaned first-artist id.
# This is an inner join, so tracks without a matching artist row are dropped.
artist_genres = sp_artists[['id', 'genres']]
merged_df = pd.merge(
    sp_tracks,
    artist_genres,
    how='inner',
    left_on='id_artists_clean',
    right_on='id',
)


In [None]:
# Preview the first five rows of the merged tracks/genres frame.
merged_df.head(n=5)

In [148]:
# Total duration per distinct value of the 'genres' column.
# NOTE(review): 'genres' appears to be grouped as-is; if it holds a
# stringified list per artist, each distinct list is its own group - confirm.
duration_by_genre = merged_df.groupby('genres')['duration'].sum()

# Largest totals first, with 'genres' turned back into a regular column.
genre_duration = duration_by_genre.sort_values(ascending=False).reset_index()

# Keep the ten genres with the greatest total duration.
top_genres = genre_duration.iloc[:10]

In [None]:
# Horizontal bar plot of the ten genres with the largest summed duration.
plt.figure(figsize=(10, 6))
sns.set_palette('crest')

sns.barplot(y='genres', x='duration', data=top_genres)

# top_genres holds the per-genre SUM of durations (groupby(...).sum()), so
# the title says "Total"; the earlier "Average" wording misdescribed the
# plotted values.
plt.title('Total Duration of Songs by Genre')
plt.xlabel('Duration')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()

### 15. Bar Plot: Top Five Genres by Popularity

In [None]:
# Bar plot of the five genres whose most popular track ranks highest.
sns.set_style(style='darkgrid')
plt.figure(figsize=(8, 4))

# Sort the tracks by popularity, keep only the best-ranked row of every genre
# value, then take the first five. Slicing the ten most popular *tracks*, as
# before, did not give five genres: repeated genres were merged into a single
# averaged bar and the number of bars depended on the data.
Top = (
    merged_df
    .sort_values('popularity', ascending=False)
    .drop_duplicates(subset='genres')
    .head(5)
)
sns.barplot(y='genres', x='popularity', data=Top).set(title='Genres by Popularity-Top 5')