# Importando Bibliotecas

In [None]:
import pandas as pd
import numpy as np

from scipy.stats import normaltest

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn import metrics

from yellowbrick.cluster import KElbowVisualizer

import warnings
warnings.filterwarnings('ignore')

# Lendo os Dados

In [None]:
df = pd.read_csv('data.csv')

In [None]:
df.head()

# EDA

### Valores Nulos

In [None]:
df.isna().sum()

### Distribuição das Variáveis

##### Valence

In [None]:
df['valence'].describe()

In [None]:
plt.figure(figsize=(5,5))
plt.title('Valence')
sns.boxplot(data=df['valence'], orient='v', color='gray')

In [None]:
plt.figure(figsize=(15,5))
plt.title('Valence')
sns.distplot(df['valence'], color='gray')

##### Year

In [None]:
# Bar chart with the number of rows for each distinct value of `year`.
plt.figure(figsize=(15,5))
plt.title('Music by Year')
# Pass the column as `x=` explicitly: newer seaborn releases interpret the first
# positional argument as `data`, which no longer yields a per-value count.
sns.countplot(x=df['year'], color='gray')

Muitas músicas e anos, não?

##### Acousticness

In [None]:
# Distribution of the `acousticness` column.
plt.figure(figsize=(15,5))
# The title previously read 'Valence' (copied from the valence cell).
plt.title('Acousticness')
sns.distplot(df['acousticness'], color='gray')

##### Artists

In [None]:
import ast


def _parse_artists(raw):
    """Turn one raw ``artists`` cell into a list of clean artist names.

    Splitting the raw text on ',' leaves the surrounding quotes and leading
    spaces on each name, so the same artist ends up under several spellings
    and is counted separately. Parsing the cell as a Python list literal
    avoids that and also keeps names that contain a comma intact.

    NOTE(review): assumes cells look like "['A', 'B']" (suggested by the
    bracket stripping this replaces) - confirm against data.csv.
    """
    if not isinstance(raw, str):
        # Missing values, or cells already parsed on a cell re-run, pass through.
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(name).strip() for name in parsed]
    # Fallback for cells that are not a valid list literal: drop the brackets,
    # split on commas, then trim whitespace and quotes from each piece.
    inner = raw.replace('[', '').replace(']', '')
    return [part.strip().strip('\'"') for part in inner.split(',')]


# Replace the raw text column with a list of artist names per row.
df['artists'] = df['artists'].apply(_parse_artists)

In [None]:
df['artists']

In [None]:
df[['artists']]

In [None]:
df_new = df[['artists']].explode('artists')
df_new

In [None]:
artists_list = df_new['artists'].unique().tolist()

In [None]:
# Swap every literal '$' for 's' in the artist names.
# '$' is a regex metacharacter (end-of-string anchor) and the default of the
# `regex` argument differs between pandas versions, so request literal
# matching explicitly instead of relying on the default.
# NOTE(review): presumably done so '$' is not rendered as math text in plot
# labels - confirm the intent.
df_new['artists'] = df_new['artists'].str.replace('$', 's', regex=False)

In [None]:
# Count how many rows each artist appears in, most frequent first.
artist_counts = df_new['artists'].value_counts()
# Build the frame with explicit column names. The names produced by
# `value_counts()` + `reset_index()` depend on the pandas version, while the
# bar plot further down selects x='index' (artist name) and y='artists' (count).
df_artists = pd.DataFrame({
    'index': artist_counts.index,
    'artists': artist_counts.to_numpy(),
})
df_artists

In [None]:
plt.figure(figsize=(25,5))
plt.title('10 Artists with most Songs in Spotify')
sns.barplot(data=df_artists[:10], x='index', y='artists', color='gray')

##### Danceability

In [None]:
df['danceability'].describe()

In [None]:
plt.figure(figsize=(5,5))
plt.title('Danceability')
sns.boxplot(data=df['danceability'], orient='v', color='gray')

In [None]:
plt.figure(figsize=(15,5))
plt.title('Danceability')
sns.distplot(df['danceability'], color='gray')

In [None]:
def normal_test(feature, array, alpha=0.05):
    """Run ``scipy.stats.normaltest`` on a sample and print the verdict.

    Args:
        feature: Label of the feature, used only in the printed message.
        array: One-dimensional numeric sample to test.
        alpha: Significance level the p-value is compared against.

    Returns:
        None. The outcome is printed.

    NaN entries are dropped before testing. Otherwise the p-value comes back
    as NaN, fails the ``p < alpha`` comparison and the sample is wrongly
    reported as normally distributed.
    """
    values = np.asarray(array, dtype=float)
    values = values[~np.isnan(values)]
    p = normaltest(values).pvalue
    if np.isnan(p):
        # The statistic can still be undefined (e.g. a degenerate sample);
        # report that instead of claiming normality.
        print(f'Teste inconclusivo para a feature {feature}: p-valor indefinido')
    elif p < alpha:
        print(f'Rejeita H0. A feature {feature} NÃO está normalmente distribuída')
    else:
        print(f'Aceita H0. A feature {feature} está normalmente distribuída. P-valor = {p} >= {alpha}')

In [None]:
normal_test('danceability', df['danceability'].values)

##### Duration

In [None]:
df['duration_ms'].describe()

In [None]:
# Box plot of the raw `duration_ms` column.
plt.figure(figsize=(5,5))
# Title spelling fixed ('miliseconds' -> 'milliseconds').
plt.title('Duration (milliseconds)')
sns.boxplot(data=df['duration_ms'], orient='v', color='gray')

In [None]:
# Distribution of the natural log of `duration_ms`.
plt.figure(figsize=(15,5))
# The plotted values are log-transformed, so the title says so; it previously
# claimed plain milliseconds (and misspelled the unit).
plt.title('Duration (log of milliseconds)')
sns.distplot(np.log(df['duration_ms']), color='gray')

In [None]:
normal_test('duration', df['duration_ms'].values)

##### Energy

In [None]:
df['energy'].describe()

In [None]:
plt.figure(figsize=(5,5))
plt.title('Energy')
sns.boxplot(data=df['energy'], orient='v', color='gray')

In [None]:
plt.figure(figsize=(15,5))
plt.title('Energy')
sns.distplot(df['energy'], color='gray')

##### Explicit

In [None]:
df['explicit'].value_counts()

In [None]:
# Bar chart with the number of rows for each value of `explicit`.
plt.figure(figsize=(15,5))
plt.title('Musics with Explicit Content')
# Pass the column as `x=` explicitly: newer seaborn releases interpret the first
# positional argument as `data`, which no longer yields a per-value count.
sns.countplot(x=df['explicit'])

##### Instrumentalness

In [None]:
df['instrumentalness'].describe()

In [None]:
plt.figure(figsize=(15,5))
plt.title('Instrumentalness')
sns.distplot(df['instrumentalness'], color='gray')

##### Key

In [None]:
df['key'].describe()

In [None]:
# Bar chart with the number of rows for each value of `key`.
plt.figure(figsize=(15,5))
plt.title('Key')
# Pass the column as `x=` explicitly: newer seaborn releases interpret the first
# positional argument as `data`, which no longer yields a per-value count.
sns.countplot(x=df['key'], color='gray')

##### Liveness

In [None]:
df['liveness'].describe()

In [None]:
plt.figure(figsize=(5,5))
plt.title('Liveness')
sns.boxplot(data=df['liveness'], orient='v', color='gray')

In [None]:
plt.figure(figsize=(15,5))
plt.title('Liveness')
sns.distplot(df['liveness'], color='gray')

##### Loudness

In [None]:
df['loudness'].describe()

In [None]:
# Box plot of the `loudness` column.
plt.figure(figsize=(5,5))
# Title spelling fixed ('Loudeness' -> 'Loudness'), matching the distribution
# plot of the same column.
plt.title('Loudness')
sns.boxplot(data=df['loudness'], orient='v', color='gray')

In [None]:
plt.figure(figsize=(15,5))
plt.title('Loudness')
sns.distplot(df['loudness'], color='gray')

In [None]:
normal_test('loudness', df['loudness'].values)

##### Mode

In [None]:
df['mode'].value_counts()

In [None]:
# Bar chart with the number of rows for each value of `mode`.
# Pass the column as `x=` explicitly: newer seaborn releases interpret the first
# positional argument as `data`, which no longer yields a per-value count.
sns.countplot(x=df['mode'], color='gray')

##### Popularity

In [None]:
df['popularity'].describe()

In [None]:
plt.figure(figsize=(5,5))
plt.title('Popularity')
sns.boxplot(data=df['popularity'], orient='v', color='gray')

In [None]:
plt.figure(figsize=(15,5))
plt.title('Popularity')
sns.distplot(df['popularity'], color='gray')

##### Speechiness

In [None]:
df['speechiness'].describe()

In [None]:
plt.figure(figsize=(5,5))
plt.title('Speechiness')
sns.boxplot(data=df['speechiness'], orient='v', color='gray')

In [None]:
plt.figure(figsize=(15,5))
plt.title('Speechiness')
sns.distplot(df['speechiness'], color='gray')

##### Tempo

In [None]:
df['tempo'].describe()

In [None]:
plt.figure(figsize=(5,5))
plt.title('Tempo')
sns.boxplot(data=df['tempo'], orient='v', color='gray')

In [None]:
plt.figure(figsize=(15,5))
plt.title('Tempo')
sns.distplot(df['tempo'], color='gray')

In [None]:
normal_test('tempo', df['tempo'].values)

# Modelo

In [None]:
df.columns

In [None]:
# Feature matrix for clustering: every column except the identifier/text ones.
features = df.drop(['artists', 'id', 'name', 'release_date'], axis=1)
# K-Means and the distance-based scores used below work on Euclidean
# distances, so columns with a large numeric range (e.g. `duration_ms`) would
# dominate the result if left on their raw scale. Standardise every feature
# to zero mean / unit variance, keeping X a DataFrame with the same columns
# and index.
# NOTE(review): assumes all remaining columns are numeric - confirm with df.dtypes.
X = pd.DataFrame(
    StandardScaler().fit_transform(features),
    columns=features.columns,
    index=features.index,
)

### K-Means

##### Elbow

In [None]:
# Elbow method: fit K-Means for k = 1..30 and plot the within-cluster sum of
# squares (the estimator's `inertia_`) against k.
plt.figure(figsize=(8,8))

# One inertia value per k, in the same order as range(1,31).
wcss = []

for i in range(1,31):
    # Random centroid initialisation; no random_state is set, so the values
    # can differ between runs.
    kmeans = KMeans(n_clusters=i, init='random')
    kmeans.fit(X)
    print(i, kmeans.inertia_)
    wcss.append(kmeans.inertia_)
    
# After the loop `i` and `kmeans` stay bound to the last iteration (k = 30).
plt.plot(range(1,31), wcss, '*-')
plt.title('Método Elbow - WCSS')
plt.xlabel('Clusters')
plt.ylabel('WCSS')

In [None]:
# Second elbow plot, this time through yellowbrick. It wraps whatever
# estimator `kmeans` is bound to when the cell runs; in top-to-bottom notebook
# order that is the last model fitted by the manual elbow loop.
# NOTE(review): k=(1,31) presumably sweeps the same 1..30 range as the manual
# loop - confirm against the yellowbrick documentation.
vis = KElbowVisualizer(kmeans, k=(1,31))
vis.fit(X)
vis.show();

##### Davies-Bouldin

In [None]:
# Davies-Bouldin sweep: fit K-Means for k = 2..30, score each partition and
# plot the score against k.
plt.figure(figsize=(8,8))

# One score per entry of n_cluster_list, in the same order.
scores = []
n_cluster_list = np.arange(2,31).astype(int)

for n_cluster in n_cluster_list:
    # Use the loop variable for the cluster count. The previous code passed
    # `i`, a leftover from the elbow loop, so every iteration fitted the same
    # number of clusters and the curve did not reflect k at all.
    kmeans = KMeans(n_clusters=int(n_cluster), init='random')
    cluster_found = kmeans.fit_predict(X)
    score = davies_bouldin_score(X, cluster_found)
    scores.append(score)
    print(n_cluster, score)

# Plot against the exact k values that were evaluated.
plt.plot(n_cluster_list, scores, '*-')
plt.title('Davies-Bouldin')
plt.xlabel('Clusters')
plt.ylabel('Score')

##### Modelo

In [None]:
k = 6

In [None]:
kmeans = KMeans(n_clusters=k)

In [None]:
# Fit the final K-Means model on X and get one cluster label per row.
cluster_found = kmeans.fit_predict(X)
# Score that partition with both metrics, using the same feature matrix.
# NOTE(review): silhouette_score is presumably expensive on a large X -
# consider its `sample_size` argument if this cell is slow.
silhouette = silhouette_score(X, cluster_found)
davies_bouldin = davies_bouldin_score(X, cluster_found)

In [None]:
print(f'''
      Os resultados foram:
      Silhouette: {round(silhouette,2)}
      Davies-Bouldin: {round(davies_bouldin,2)}
      ''')