In [2]:
# Import Library

# Library Dataframe
import pandas as pd

# Library Numerical Data
import numpy as np

# Library Statistic
from scipy import stats
from sklearn import metrics
from scipy.stats import uniform, randint
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Library Data Visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import plotly.express as px

# Library Preprocessing data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Library Outlier Handling
from feature_engine.outliers import Winsorizer

# Library Correlation
from scipy.stats import kendalltau, spearmanr

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Library Machine Learning Model
from sklearn.neighbors import KNeighborsClassifier

# Model Evaluation
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score

# Save Model
import pickle
import joblib
import json

# To Ignore Warning
import warnings
warnings.filterwarnings("ignore")

In [5]:
# Membaca data dari file CSV
df = pd.read_csv('playlist_2010to2023.csv', encoding='ISO-8859-1')


In [13]:
# Set the display option to show the full content of each column
pd.set_option('display.max_colwidth', None)

# Set the display option to show all columns
pd.set_option('display.max_columns', None)

In [14]:
df

Unnamed: 0,playlist_url,year,track_id,track_name,track_popularity,album,artist_id,artist_name,artist_genres,artist_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,https://open.spotify.com/playlist/37i9dQZF1DWUZv12GM5cFk,2000,6naxalmIoLFWR0siv8dnQQ,Oops!...I Did It Again,81,Oops!... I Did It Again,26dSoYclwsYLMAKD3tpOr4,Britney Spears,"['dance pop', 'pop']",81,0.751,0.834,1,-5.444,0,0.0437,0.3000,0.000018,0.3550,0.894,95.053,211160,4
1,https://open.spotify.com/playlist/37i9dQZF1DWUZv12GM5cFk,2000,2m1hi0nfMR9vdGC8UcrnwU,All The Small Things,83,Enema Of The State,6FBDaR13swtiWwGhX1WQsP,blink-182,"['alternative metal', 'modern rock', 'pop punk', 'punk', 'rock', 'socal pop punk']",79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.000000,0.6120,0.684,148.726,167067,4
2,https://open.spotify.com/playlist/37i9dQZF1DWUZv12GM5cFk,2000,3y4LxiYMgDl4RethdzpmNe,Breathe,66,Breathe,25NQNriVT2YbSW80ILRWJa,Faith Hill,"['contemporary country', 'country', 'country dawn', 'country road']",62,0.529,0.496,7,-9.007,1,0.0290,0.1730,0.000000,0.2510,0.278,136.859,250547,4
3,https://open.spotify.com/playlist/37i9dQZF1DWUZv12GM5cFk,2000,0v1XpBHnsbkCn7iJ9Ucr1l,It's My Life,81,Crush,58lV9VcRSjABbAbfWS6skp,Bon Jovi,"['glam metal', 'rock']",79,0.551,0.913,0,-4.063,0,0.0466,0.0263,0.000013,0.3470,0.544,119.992,224493,4
4,https://open.spotify.com/playlist/37i9dQZF1DWUZv12GM5cFk,2000,62bOmKYxYg7dhrC6gH9vFn,Bye Bye Bye,75,No Strings Attached,6Ff53KvcvAj5U7Z1vojB5o,*NSYNC,"['boy band', 'dance pop', 'pop']",70,0.610,0.926,8,-4.843,0,0.0479,0.0310,0.001200,0.0821,0.861,172.638,200400,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,https://open.spotify.com/playlist/6unJBM7ZGitZYFJKkO0e4P,2023,0HD8mbiPjp3o94X3EaZp0o,exes,56,exes,45dkTj5sMRSjrmBSBeiHym,Tate McRae,['pop'],85,0.838,0.569,5,-6.324,0,0.0621,0.1520,0.000136,0.1020,0.551,136.965,159400,4
2396,https://open.spotify.com/playlist/6unJBM7ZGitZYFJKkO0e4P,2023,3XB5uhhlYSnkxpSihkNQwh,QLONA,79,MAÑANA SERÁ BONITO (BICHOTA SEASON),790FomKkXshlbRYZFtlgla,KAROL G,"['reggaeton', 'reggaeton colombiano', 'trap latino', 'urbano latino']",90,0.842,0.756,7,-7.409,0,0.3380,0.5250,0.000002,0.0871,0.421,169.925,172798,4
2397,https://open.spotify.com/playlist/6unJBM7ZGitZYFJKkO0e4P,2023,4sx6NRwL6Ol3V6m9exwGlQ,LOVE AGAIN,69,LOVE AGAIN,2tIP7SsRs7vjIcLrU85W8J,The Kid LAROI,['australian hip hop'],79,0.662,0.398,11,-6.691,0,0.0275,0.7160,0.000000,0.1110,0.468,107.001,145850,4
2398,https://open.spotify.com/playlist/6unJBM7ZGitZYFJKkO0e4P,2023,2Zo1PcszsT9WQ0ANntJbID,Feather,91,emails i cant send fwd:,74KM79TiuVKeVCqs8QtB0B,Sabrina Carpenter,['pop'],82,0.787,0.686,6,-4.370,0,0.0339,0.0893,0.000000,0.0927,0.836,123.510,185553,4


In [15]:
# Menampilkan informasi umum tentang DataFrame
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   playlist_url       2400 non-null   object 
 1   year               2400 non-null   int64  
 2   track_id           2400 non-null   object 
 3   track_name         2400 non-null   object 
 4   track_popularity   2400 non-null   int64  
 5   album              2400 non-null   object 
 6   artist_id          2400 non-null   object 
 7   artist_name        2400 non-null   object 
 8   artist_genres      2400 non-null   object 
 9   artist_popularity  2400 non-null   int64  
 10  danceability       2400 non-null   float64
 11  energy             2400 non-null   float64
 12  key                2400 non-null   int64  
 13  loudness           2400 non-null   float64
 14  mode               2400 non-null   int64  
 15  speechiness        2400 non-null   float64
 16  acousticness       2400 

In [16]:
# Menampilkan statistik deskriptif dari DataFrame
print(df.describe())

              year  track_popularity  artist_popularity  danceability  \
count  2400.000000       2400.000000        2400.000000   2400.000000   
mean   2011.500000         70.277083          75.882500      0.661767   
std       6.923629         12.958245          12.016746      0.140335   
min    2000.000000          0.000000          36.000000      0.162000   
25%    2005.750000         65.000000          68.000000      0.573750   
50%    2011.500000         72.000000          77.000000      0.673000   
75%    2017.250000         78.000000          85.000000      0.760000   
max    2023.000000         96.000000         100.000000      0.975000   

            energy          key     loudness         mode  speechiness  \
count  2400.000000  2400.000000  2400.000000  2400.000000  2400.000000   
mean      0.688705     5.238333    -5.811735     0.589167     0.097995   
std       0.165345     3.596909     2.086712     0.492088     0.093276   
min       0.051900     0.000000   -21.107000  

In [17]:
# Menampilkan jumlah baris duplikat dalam DataFrame
print(df.duplicated().sum())

0


In [18]:
for column in df.columns:
    print('Number of unique value in the variable {:<25}: {}'.format(column, len(df[column].unique())))


Number of unique value in the variable playlist_url             : 24
Number of unique value in the variable year                     : 24
Number of unique value in the variable track_id                 : 2302
Number of unique value in the variable track_name               : 2220
Number of unique value in the variable track_popularity         : 72
Number of unique value in the variable album                    : 1733
Number of unique value in the variable artist_id                : 908
Number of unique value in the variable artist_name              : 908
Number of unique value in the variable artist_genres            : 718
Number of unique value in the variable artist_popularity        : 60
Number of unique value in the variable danceability             : 594
Number of unique value in the variable energy                   : 649
Number of unique value in the variable key                      : 12
Number of unique value in the variable loudness                 : 1971
Number of unique valu

In [22]:
genre_counts = df['artist_genres'].apply(pd.Series).stack().value_counts()
print(genre_counts)


['pop']                                                                                                      153
['dance pop', 'pop']                                                                                         138
['canadian hip hop', 'canadian pop', 'hip hop', 'pop rap', 'rap']                                             34
['barbadian pop', 'pop', 'urban contemporary']                                                                29
['dance pop', 'pop', 'pop rap']                                                                               25
                                                                                                            ... 
['downtempo', 'dream pop', 'indietronica']                                                                     1
['contemporary r&b', 'hip pop', 'pop rap', 'r&b', 'southern hip hop', 'trap', 'urban contemporary']            1
['alternative rock', 'britpop', 'garage rock', 'leicester indie', 'modern rock', 'rock', 'scotti

In [24]:
genre_percentage = genre_counts / len(df) * 100
print(genre_percentage)


['pop']                                                                                                      6.375000
['dance pop', 'pop']                                                                                         5.750000
['canadian hip hop', 'canadian pop', 'hip hop', 'pop rap', 'rap']                                            1.416667
['barbadian pop', 'pop', 'urban contemporary']                                                               1.208333
['dance pop', 'pop', 'pop rap']                                                                              1.041667
                                                                                                               ...   
['downtempo', 'dream pop', 'indietronica']                                                                   0.041667
['contemporary r&b', 'hip pop', 'pop rap', 'r&b', 'southern hip hop', 'trap', 'urban contemporary']          0.041667
['alternative rock', 'britpop', 'garage rock', 'leiceste