**Spotify Top 50**

Predicting if top 50 songs from 2019 will be more or less popular in future.

In [20]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

In [21]:
data = pd.read_csv('../input/top50spotify2019/top50.csv', encoding='latin-1')
data = data.drop(['Unnamed: 0', 'Track.Name'], axis=1)
data

Unnamed: 0,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
0,Shawn Mendes,canadian pop,117,55,76,-6,8,75,191,4,3,79
1,Anuel AA,reggaeton flow,105,81,79,-4,8,61,302,8,9,92
2,Ariana Grande,dance pop,190,80,40,-4,16,70,186,12,46,85
3,Ed Sheeran,pop,93,65,64,-8,8,55,198,12,19,86
4,Post Malone,dfw rap,150,65,58,-4,11,18,175,45,7,94
5,Ed Sheeran,pop,102,68,80,-5,9,84,220,9,4,84
6,Lil Tecca,trap music,180,64,75,-6,7,23,131,2,29,92
7,Sam Smith,pop,111,68,48,-5,8,35,202,15,9,90
8,Lil Nas X,country rap,136,62,88,-6,11,64,157,5,10,87
9,Billie Eilish,electropop,135,43,70,-11,10,56,194,33,38,95


In [22]:
data['Popularity'] = pd.qcut(data['Popularity'], q=2, labels=[0, 1])
data

Unnamed: 0,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
0,Shawn Mendes,canadian pop,117,55,76,-6,8,75,191,4,3,0
1,Anuel AA,reggaeton flow,105,81,79,-4,8,61,302,8,9,1
2,Ariana Grande,dance pop,190,80,40,-4,16,70,186,12,46,0
3,Ed Sheeran,pop,93,65,64,-8,8,55,198,12,19,0
4,Post Malone,dfw rap,150,65,58,-4,11,18,175,45,7,1
5,Ed Sheeran,pop,102,68,80,-5,9,84,220,9,4,0
6,Lil Tecca,trap music,180,64,75,-6,7,23,131,2,29,1
7,Sam Smith,pop,111,68,48,-5,8,35,202,15,9,1
8,Lil Nas X,country rap,136,62,88,-6,11,64,157,5,10,0
9,Billie Eilish,electropop,135,43,70,-11,10,56,194,33,38,1


In [23]:
def onehot_encode(df, column, prefix):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df


In [24]:
data = onehot_encode(data, 'Artist.Name', 'artist')
data = onehot_encode(data, 'Genre', 'genre')

In [25]:
data

Unnamed: 0,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity,...,genre_electropop,genre_escape room,genre_latin,genre_panamanian pop,genre_pop,genre_pop house,genre_r&b en espanol,genre_reggaeton,genre_reggaeton flow,genre_trap music
0,117,55,76,-6,8,75,191,4,3,0,...,0,0,0,0,0,0,0,0,0,0
1,105,81,79,-4,8,61,302,8,9,1,...,0,0,0,0,0,0,0,0,1,0
2,190,80,40,-4,16,70,186,12,46,0,...,0,0,0,0,0,0,0,0,0,0
3,93,65,64,-8,8,55,198,12,19,0,...,0,0,0,0,1,0,0,0,0,0
4,150,65,58,-4,11,18,175,45,7,1,...,0,0,0,0,0,0,0,0,0,0
5,102,68,80,-5,9,84,220,9,4,0,...,0,0,0,0,1,0,0,0,0,0
6,180,64,75,-6,7,23,131,2,29,1,...,0,0,0,0,0,0,0,0,0,1
7,111,68,48,-5,8,35,202,15,9,1,...,0,0,0,0,1,0,0,0,0,0
8,136,62,88,-6,11,64,157,5,10,0,...,0,0,0,0,0,0,0,0,0,0
9,135,43,70,-11,10,56,194,33,38,1,...,1,0,0,0,0,0,0,0,0,0


In [26]:
y = data.loc[:, 'Popularity']
X = data.drop('Popularity', axis=1)

In [27]:
scaler = StandardScaler()

X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=20)

In [28]:
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
log_acc = log_model.score(X_test, y_test)
print("Logistic Regression Accuracy:", log_acc)

Logistic Regression Accuracy: 0.7333333333333333


In [29]:
fig = px.bar(
    x=["Logistic Regression"],
    y=[log_acc],
    color=["Logistic Regression"],
    labels={'x': "Model", 'y': "Accuracy"},
    title="Model Accuracy"
)

fig.show()