In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [23]:
# Read the raw dataset from disk and preview its first rows.
raw_csv = '../data/raw_data/data.csv'
df = pd.read_csv(raw_csv)
df.head()

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [24]:
# (rows, columns) of the frame as loaded from the raw CSV.
df.shape

(170653, 19)

# Exploration / Outlier cleaning

In [25]:
# Number of missing values in each column.
df.isna().sum()

valence             0
year                0
acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
dtype: int64

In [26]:
# Discard every non-numeric column; only numeric dtypes are kept.
df = df.select_dtypes(include='number')
df.head()

Unnamed: 0,valence,year,acousticness,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
0,0.0594,1921,0.982,0.279,831667,0.211,0,0.878,10,0.665,-20.096,1,4,0.0366,80.954
1,0.963,1921,0.732,0.819,180533,0.341,0,0.0,7,0.16,-12.441,1,5,0.415,60.936
2,0.0394,1921,0.961,0.328,500062,0.166,0,0.913,3,0.101,-14.85,1,5,0.0339,110.339
3,0.165,1921,0.967,0.275,210000,0.309,0,2.8e-05,5,0.381,-9.316,1,3,0.0354,100.109
4,0.253,1921,0.957,0.418,166693,0.193,0,2e-06,3,0.229,-10.096,1,2,0.038,101.665


In [27]:
# Columns excluded from the feature set: the release year, the popularity
# score, and the explicit / key / mode columns.
cols_to_remove = ['year', 'explicit', 'key', 'mode', 'popularity']

df = df.drop(columns=cols_to_remove)
df.head()

Unnamed: 0,valence,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo
0,0.0594,0.982,0.279,831667,0.211,0.878,0.665,-20.096,0.0366,80.954
1,0.963,0.732,0.819,180533,0.341,0.0,0.16,-12.441,0.415,60.936
2,0.0394,0.961,0.328,500062,0.166,0.913,0.101,-14.85,0.0339,110.339
3,0.165,0.967,0.275,210000,0.309,2.8e-05,0.381,-9.316,0.0354,100.109
4,0.253,0.957,0.418,166693,0.193,2e-06,0.229,-10.096,0.038,101.665


In [28]:
# For now, the distributions of these two columns do not allow any
# meaningful information to be extracted, so both are dropped.
df = df.drop(['speechiness', 'instrumentalness'], axis='columns')

## Outliers

In [29]:
# Outlier bounds by the IQR method, computed column by column:
# anything outside [q25 - 1.5 * IQR, q75 + 1.5 * IQR] counts as an outlier.
numeric_part = df.select_dtypes(include=np.number)
q25 = numeric_part.quantile(0.25)
q75 = numeric_part.quantile(0.75)
iqr = q75 - q25

cut_off = 1.5 * iqr
lower = q25 - cut_off
upper = q75 + cut_off

In [30]:
# Fraction of rows whose every column lies strictly inside the bounds.
# Losing roughly 15% of the rows is acceptable for our purpose.
within_bounds = (df > lower) & (df < upper)
len(df[within_bounds].dropna()) / len(df)

0.8566799294474753

In [31]:
# Blank out every value outside the bounds, then drop any row that
# contains at least one blanked (NaN) value.
within_bounds = (df > lower) & (df < upper)
df = df.where(within_bounds).dropna()
df.shape

(146195, 8)

In [33]:
# Persist the outlier-filtered, still unscaled features.
# NOTE(review): the row index is written as well (no index=False), unlike the
# clean_data.csv export further down — confirm this difference is intended.
df.to_csv('../data/clean_data/original_data.csv')

## MinMax scaling

In [63]:
df.head()

Unnamed: 0,valence,acousticness,danceability,duration_ms,energy,liveness,loudness,tempo
1,0.963,0.732,0.819,180533.0,0.341,0.16,-12.441,60.936
3,0.165,0.967,0.275,210000.0,0.309,0.381,-9.316,100.109
4,0.253,0.957,0.418,166693.0,0.193,0.229,-10.096,101.665
5,0.196,0.579,0.697,395076.0,0.346,0.13,-12.506,119.824
6,0.406,0.996,0.518,159507.0,0.203,0.115,-10.589,66.221


In [65]:
df.min(), df.max()

(valence             0.00000
 acousticness        0.00000
 danceability        0.05690
 duration_ms     31107.00000
 energy              0.00024
 liveness            0.00967
 loudness          -25.76200
 tempo              30.94600
 dtype: float64,
 valence              1.000
 acousticness         0.996
 danceability         0.988
 duration_ms     401253.000
 energy               1.000
 liveness             0.504
 loudness             3.744
 tempo              198.695
 dtype: float64)

In [66]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

# Features whose values fall below 0 and/or above 1; only these are rescaled,
# every other column is passed through unchanged.
cols_to_norm = ['duration_ms', 'loudness', 'tempo']

pipe = ColumnTransformer(
    transformers=[('scaler', MinMaxScaler(), cols_to_norm)],
    remainder='passthrough',
    n_jobs=-1,
    verbose_feature_names_out=False,
).set_output(transform='pandas')  # return a DataFrame with the original column names

df = pipe.fit_transform(df)

In [69]:
# Save the preprocessed (min-max scaled) data, without the row index.
# NOTE(review): only the transformed frame is written; the fitted `pipe` is not persisted in the cells shown here.

df.to_csv('../data/clean_data/clean_data.csv', index=False)

In [5]:
from random import randint, choice

# Scratch: ten random integers between 0 and 10 inclusive.
# No seed is set, so the values differ from run to run.
l = [randint(0, 10) for _ in range(10)]
l

[0, 7, 7, 3, 1, 0, 6, 6, 3, 10]

In [11]:
# Scratch: pick one element at random among the five smallest values of `l`.
choice(sorted(l)[:5])

0

In [1]:
# Scratch: small dict used to try out the pickle round-trip.
test = {'hey': 'you'}

In [3]:
import pickle

# Scratch: write the `test` object to the logs folder as a pickle.
# The path has no placeholders, so it is a plain string literal rather than an f-string.
with open('../data/logs/test_run.pkl', 'wb') as f:
    pickle.dump(test, f)

In [4]:
# Scratch: read the pickle back and display what was stored.
with open('../data/logs/test_run.pkl', 'rb') as pkl_file:
    t = pickle.load(pkl_file)

t

{'hey': 'you'}

In [8]:
from datetime import datetime


# Render the current local time as day-month-year--hour:minute:second.
# NOTE(review): the log file loaded later in this notebook has a name starting
# with a stamp in this same pattern; ':' is not a valid file-name character on
# Windows — confirm that is not a concern.
stamp_pattern = "%d-%m-%Y--%H:%M:%S"
now = datetime.now()
dt_string = now.strftime(stamp_pattern)
dt_string

'01-06-2024--20:50:23'

In [2]:
import pandas as pd
import pickle

# Load a pickled run log from the logs folder and display it.
# pickle.load can execute arbitrary code, so only open files this project produced.
with open('../data/logs/02-06-2024--00:42:10--ALGO-SELECTION.pkl', 'rb') as log_file:
    d = pickle.load(log_file)

d

{'time': [163.34431290626526, 281.2636909484863],
 'loop_time': [[11.816162824630737,
   9.589086055755615,
   11.134979248046875,
   11.798383474349976,
   7.456634044647217,
   9.118575096130371,
   11.617857217788696,
   6.9888081550598145,
   8.694559574127197,
   9.663926839828491,
   10.342803001403809,
   11.112212419509888,
   8.3902428150177,
   9.272813081741333,
   6.769579172134399,
   5.73211669921875,
   13.84536337852478],
  [11.174023628234863,
   10.472558975219727,
   11.035571813583374,
   7.90897536277771,
   2.7563834190368652,
   6.711301565170288,
   9.152772426605225,
   8.365546941757202,
   11.138970851898193,
   10.448672533035278,
   12.506198167800903,
   11.723008871078491,
   8.989989280700684,
   9.773367643356323,
   9.701823234558105,
   13.110367774963379,
   10.239613771438599,
   10.679516315460205,
   9.849774360656738,
   6.970468759536743,
   7.755408763885498,
   4.855293273925781,
   8.299432039260864,
   7.509806394577026,
   8.558347702026367

In [5]:
# Shape of the first entry stored under the 'best' key of the loaded log.
# NOTE(review): what that entry represents is not shown here — confirm against the code that writes the log.
d['best'][0].shape

(5, 7)