In [9]:
import pandas as pd


# Read the Spotify CSV from the working directory into a DataFrame.
spotify_data = pd.read_csv(filepath_or_buffer="spotify.csv")
# Print the column names, non-null counts and dtypes as a quick sanity check.
spotify_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32833 entries, 0 to 32832
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  32833 non-null  object 
 1   track_name                32828 non-null  object 
 2   track_artist              32828 non-null  object 
 3   track_popularity          32833 non-null  int64  
 4   track_album_id            32833 non-null  object 
 5   track_album_name          32828 non-null  object 
 6   track_album_release_date  32833 non-null  object 
 7   playlist_name             32833 non-null  object 
 8   playlist_id               32833 non-null  object 
 9   playlist_genre            32833 non-null  object 
 10  playlist_subgenre         32833 non-null  object 
 11  danceability              32833 non-null  float64
 12  energy                    32833 non-null  float64
 13  key                       32833 non-null  int64  
 14  loudne

In [13]:

# Derive track length in minutes *before* grouping, so the grouped object
# below is built from a frame that already holds every column it aggregates
# (previously the column was added after the groupby had been created).
spotify_data['duration_minutes'] = spotify_data['duration_ms'] / (1000 * 60)

grouped_genre = spotify_data.groupby('playlist_genre')

# Columns are selected with brackets rather than attribute access: `mode` is
# also the name of a DataFrame method, and bracket selection always refers to
# the column. All means are computed in a single aggregation.
genre_means = grouped_genre[[
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_minutes',
]].mean()

# Keep one Series per feature under the original names; later cells use them.
avg_loudness = genre_means['loudness']
avg_mode = genre_means['mode']
avg_speechiness = genre_means['speechiness']
avg_acousticness = genre_means['acousticness']
avg_instrumentalness = genre_means['instrumentalness']
avg_liveness = genre_means['liveness']
avg_valence = genre_means['valence']
avg_tempo = genre_means['tempo']
avg_duration = genre_means['duration_minutes']


In [15]:
# Map each output column label to its per-genre mean Series. The Series are
# all indexed by playlist_genre, so they line up row by row in the table.
stat_columns = {
    'Average Loudness': avg_loudness,
    'Average Mode': avg_mode,
    'Average Speechiness': avg_speechiness,
    'Average Acousticness': avg_acousticness,
    'Average Instrumentalness': avg_instrumentalness,
    'Average Liveness': avg_liveness,
    'Average Valence': avg_valence,
    'Average Tempo': avg_tempo,
    'Average Duration (minutes)': avg_duration,
}
average_stats = pd.DataFrame(stat_columns)

# Write the summary table to disk (the index is written as the first column).
average_stats.to_csv('spotify_2017_genre_averages.csv')
print("Average statistics by genre have been saved to 'spotify_2017_genre_averages.csv'.")


Average statistics by genre have been saved to 'spotify_2017_genre_averages.csv'.


In [17]:
from sklearn.tree import DecisionTreeRegressor


# Predictor columns: audio descriptors plus the derived track length.
spotify_features = [
    "danceability",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_minutes",
]
X = spotify_data[spotify_features]
# Target column the tree learns to predict.
y = spotify_data["energy"]

# A fixed random_state keeps the fitted tree reproducible between runs.
spotify_model = DecisionTreeRegressor(random_state=1).fit(X, y)

# NOTE: these predictions are for the same rows the tree was fitted on.
predictions = spotify_model.predict(X)
print("Predicted energy values: ", predictions, sep="\n")


Predicted energy values: 
[0.916 0.815 0.931 ... 0.821 0.888 0.884]


In [19]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split


# In-sample error: `predictions` were made on the rows the tree was fitted
# on, so this figure understates the error to expect on unseen tracks.
error = mean_absolute_error(y_true=y, y_pred=predictions)
print("You are off by", round(error, 2), "energy units")

# Hold out part of the data so the model can be scored on rows it never saw.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Refit a fresh tree on the training split only (rebinds `spotify_model`).
spotify_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y)
val_predictions = spotify_model.predict(val_X)

# Mean absolute error on the held-out split.
validation_error = mean_absolute_error(y_true=val_y, y_pred=val_predictions)
print("Validation Error (Mean Absolute Error): ", validation_error)


You are off by 0.0 energy units
Validation Error (Mean Absolute Error):  0.08379742965038371


In [21]:

# Numeric columns to compare pairwise.
corr_columns = [
    "danceability",
    "energy",
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "duration_minutes",
]
# DataFrame.corr() with default arguments: pairwise correlation of the columns.
correlation_matrix = spotify_data[corr_columns].corr()

print("Correlation matrix for danceability, energy, and other features:")
print(correlation_matrix)


Correlation matrix for danceability, energy, and other features:
                  danceability    energy  loudness  speechiness  acousticness  \
danceability          1.000000 -0.086073  0.025335     0.181721     -0.024519   
energy               -0.086073  1.000000  0.676625    -0.032150     -0.539745   
loudness              0.025335  0.676625  1.000000     0.010339     -0.361638   
speechiness           0.181721 -0.032150  0.010339     1.000000      0.026092   
acousticness         -0.024519 -0.539745 -0.361638     0.026092      1.000000   
instrumentalness     -0.008655  0.033247 -0.147824    -0.103424     -0.006850   
liveness             -0.123859  0.161223  0.077613     0.055426     -0.077243   
valence               0.330523  0.151103  0.053384     0.064659     -0.016845   
tempo                -0.184084  0.149951  0.093767     0.044603     -0.112724   
duration_minutes     -0.096879  0.012611 -0.115058    -0.089431     -0.081581   

                  instrumentalness  livenes