In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
import seaborn as sns
import plotly.express as px
import scipy.stats as scp
import matplotlib.pyplot as plt

In [2]:
# Load the songs dataset from the CSV file into the song60s DataFrame
songsCsvPath = "../1960.csv"
song60s = pd.read_csv(songsCsvPath)

In [3]:

# Descriptive statistics
# Statistics for numeric attributes: minimum, maximum, mean, median, mode,
# variance, standard deviation, quartiles, IQR, and the 1.5 * IQR fences
# (RLB = q1 - 1.5 * iqr, RUB = q3 + 1.5 * iqr).
numericalValues = ["bpm", "nrgy", "dnce", "dB", "live", "val", "dur", "acous", "spch", "popularity"]

for key in numericalValues:
    chosenTable = song60s[key]

    # Series.quantile applies the same "midpoint" rule without relying on the
    # deprecated `interpolation=` keyword of np.quantile.
    q1 = chosenTable.quantile(0.25, interpolation="midpoint")
    q3 = chosenTable.quantile(0.75, interpolation="midpoint")
    iqr = q3 - q1
    rlb = q1 - 1.5 * iqr
    rub = q3 + 1.5 * iqr

    print(f"Description for {key}:")
    descriptionTable = [
        ["Minimum", np.min(chosenTable)],
        ["Maximum", np.max(chosenTable)],
        ["Mean", np.mean(chosenTable)],
        ["Median", np.median(chosenTable)],
        # Series.mode() returns the modes sorted ascending, so .iloc[0] is the
        # smallest one. Indexing scipy's result with [0][0] fails on SciPy
        # releases where mode() returns a scalar for 1-D input.
        ["Mode", chosenTable.mode().iloc[0]],
        ["Variance", np.var(chosenTable)],
        ["Standard Deviation", np.std(chosenTable)],
        ["Q1", q1],
        ["Q3", q3],
        ["IQR", iqr],
        ["RLB", rlb],
        ["RUB", rub],
    ]

    display(pd.DataFrame(descriptionTable, columns=["Measure", "Value"]))

Description for bpm:


Unnamed: 0,Measure,Value
0,Minimum,63.0
1,Maximum,199.0
2,Mean,117.474227
3,Median,116.0
4,Mode,97.0
5,Variance,717.857583
6,Standard Deviation,26.792864
7,Q1,98.0
8,Q3,131.0
9,IQR,33.0


Description for nrgy:


Unnamed: 0,Measure,Value
0,Minimum,6.0
1,Maximum,99.0
2,Mean,51.958763
3,Median,51.0
4,Mode,43.0
5,Variance,452.390052
6,Standard Deviation,21.269463
7,Q1,39.0
8,Q3,68.0
9,IQR,29.0


Description for dnce:


Unnamed: 0,Measure,Value
0,Minimum,20.0
1,Maximum,79.0
2,Mean,51.175258
3,Median,51.0
4,Mode,51.0
5,Variance,156.04145
6,Standard Deviation,12.491655
7,Q1,43.0
8,Q3,61.0
9,IQR,18.0


Description for dB:


Unnamed: 0,Measure,Value
0,Minimum,-24.0
1,Maximum,-1.0
2,Mean,-10.659794
3,Median,-10.0
4,Mode,-10.0
5,Variance,14.554363
6,Standard Deviation,3.815018
7,Q1,-13.0
8,Q3,-8.0
9,IQR,5.0


Description for live:


Unnamed: 0,Measure,Value
0,Minimum,4.0
1,Maximum,90.0
2,Mean,19.247423
3,Median,14.0
4,Mode,9.0
5,Variance,241.072803
6,Standard Deviation,15.526519
7,Q1,9.0
8,Q3,24.0
9,IQR,15.0


Description for val:


Unnamed: 0,Measure,Value
0,Minimum,6.0
1,Maximum,97.0
2,Mean,57.391753
3,Median,57.0
4,Mode,96.0
5,Variance,553.475396
6,Standard Deviation,23.526058
7,Q1,42.0
8,Q3,75.0
9,IQR,33.0


Description for dur:


Unnamed: 0,Measure,Value
0,Minimum,122.0
1,Maximum,466.0
2,Mean,205.061856
3,Median,178.0
4,Mode,182.0
5,Variance,6874.944627
6,Standard Deviation,82.915286
7,Q1,152.0
8,Q3,209.0
9,IQR,57.0


Description for acous:


Unnamed: 0,Measure,Value
0,Minimum,0.0
1,Maximum,99.0
2,Mean,45.742268
3,Median,44.0
4,Mode,3.0
5,Variance,784.005739
6,Standard Deviation,28.000102
7,Q1,23.0
8,Q3,68.0
9,IQR,45.0


Description for spch:


Unnamed: 0,Measure,Value
0,Minimum,2.0
1,Maximum,31.0
2,Mean,4.721649
3,Median,4.0
4,Mode,3.0
5,Variance,11.582315
6,Standard Deviation,3.40328
7,Q1,3.0
8,Q3,5.0
9,IQR,2.0


Description for popularity:


Unnamed: 0,Measure,Value
0,Minimum,30.0
1,Maximum,82.0
2,Mean,53.597938
3,Median,52.0
4,Mode,46.0
5,Variance,139.518759
6,Standard Deviation,11.811806
7,Q1,45.0
8,Q3,60.0
9,IQR,15.0


In [4]:
# Statistics for categorical attributes
# For each column: the number of distinct values and the most frequent value.
categoricalValues = ["artist", "genre", "has_win_award"]
for key in categoricalValues:
    print(f"Description for {key}")
    column = song60s[key]
    distinctCount = len(column.unique())
    mostFrequent = column.mode().iloc[0]
    descriptionTable = [
        ["Unique", distinctCount],
        ["Mode", mostFrequent],
    ]

    display(pd.DataFrame(descriptionTable, columns=["Measure", "Value"]))


Description for artist


NameError: name 'song50s' is not defined

In [None]:
# Dataframe information
# Print pandas' built-in summary of the song60s DataFrame.
song60s.info()

In [None]:
# KDE plots
# One density plot per numeric attribute, with the mean, median and mode
# marked as dashed vertical lines.

for key in numericalValues:
    chosenTable = song60s[key]

    plt.figure(figsize=(7,5))
    # `fill=` is the current name of the deprecated `shade=` keyword.
    sns.kdeplot(data=chosenTable, fill=True, label=key)
    # axvline spans the full height of the axes, so the markers fit whatever
    # density scale the attribute has (a fixed ymax=0.35 dwarfs the curve of
    # wide-ranging attributes such as bpm or dur).
    plt.axvline(x=np.mean(chosenTable), color="blue", linestyle="--", label="Mean")
    plt.axvline(x=np.median(chosenTable), color="brown", linestyle="--", label="Median")
    # Series.mode() is sorted ascending; .iloc[0] is the smallest mode and
    # does not depend on the return shape of scipy.stats.mode.
    plt.axvline(x=chosenTable.mode().iloc[0], color="red", linestyle="--", label="Mode")

    # Labels are attached to the artists above, so the legend does not depend
    # on the order in which they were drawn.
    plt.legend()
    plt.title(f"Distribution of {key}")
    plt.show()

In [None]:
# Box plots
# For every numeric attribute: one box plot split by genre, then one overall.

# Dropping incomplete rows does not depend on the loop variable, so do it once
# instead of on every iteration.
songsWithoutMissing = song60s.dropna()

for key in numericalValues:
    print(f"Box plot for {key} per genre")
    figure = px.box(songsWithoutMissing, y=key, color="genre")
    figure.show()

    print(f"Overall box plot for {key}")
    # Pass the frame itself; `y=key` selects the column to plot.
    figure = px.box(song60s, y=key)
    figure.show()

In [None]:
# Correlation map
# Correlate only numeric/boolean columns. From pandas 2.0 on, DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead;
# selecting the dtypes explicitly works on every pandas version.
corr = song60s.select_dtypes(include=["number", "bool"]).corr()

print("Overall correlation map")
f, ax = plt.subplots(figsize=(18,18))
sns.heatmap(corr, annot=True, linewidths=.5, fmt=".2f", ax=ax)
plt.show()

In [None]:
# Horizontal bar chart of the number of songs per genre, largest count first

genreCounts = song60s.groupby("genre").count()
groupByGenre = genreCounts.sort_values(by=["title"], ascending=False)

figure = px.bar(
    groupByGenre["title"],
    x="title",
    y=groupByGenre.index,
    orientation="h",
)
figure.show()