In [18]:
# NOTE(review): the star import hides where names come from; read_avro is
# presumably provided by utils — confirm and consider importing it explicitly.
from utils import *
import pandas as pd
import numpy as np
# Load the tracks and artists tables from their Avro files; both are used as
# pandas DataFrames in the code below.
df_tracks = read_avro('data/tracks.avro')
df_artists = read_avro('data/artists.avro')


# release_date is a string that comes in several formats (YYYY, YYYY-MM,
# YYYY-MM-DD). Year-only values (exactly 4 characters) are handled first;
# every other value is parsed as a date further down.
mask = df_tracks['release_date'].astype(str).str.len() == 4

# Year-only releases: years up to 2024, +1 to include the release year,
# converted to months.
df_tracks.loc[mask, 'months_elapsed'] = (
    2024 - pd.to_numeric(df_tracks.loc[mask, 'release_date']) + 1
) * 12

reference_date = pd.to_datetime('2024-01-01')

try:
    # Values that cannot be parsed become NaT because of errors='coerce'.
    # NOTE(review): pandas >= 2.0 infers one format from the first value, so a
    # mix of YYYY-MM and YYYY-MM-DD strings may be coerced to NaT — confirm
    # against the installed version (format='ISO8601' would parse both).
    release_dates = pd.to_datetime(
        df_tracks.loc[~mask, 'release_date'], errors='coerce')
    # A month is approximated as 30 days.
    months_elapsed = (reference_date - release_dates).dt.days // 30
    # Unparseable dates end up with 0 months elapsed, i.e. they are treated as
    # if released on the reference date.
    df_tracks.loc[~mask, 'months_elapsed'] = months_elapsed.fillna(0)
except ValueError as e:
    print(f"Error: {e}")
    print("Problematic values:")
    print(df_tracks.loc[~mask, 'release_date'])
    df_tracks.loc[~mask, 'months_elapsed'] = np.nan


# Min-max normalise 'months_elapsed' and invert it, so the smallest elapsed
# time scores 1 and the largest scores 0.
# The bounds are computed here explicitly: the original expression read an
# undefined name (min_months_elapsed) and only worked with leftover kernel state.
min_months_elapsed = df_tracks['months_elapsed'].min()
max_months_elapsed = df_tracks['months_elapsed'].max()
df_tracks['newness_score'] = 1 - (
    (df_tracks['months_elapsed'] - min_months_elapsed)
    / (max_months_elapsed - min_months_elapsed)
)
df_tracks['newness_score'] = df_tracks['newness_score'].clip(0, 1)







import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Extract relevant audio features for clustering
audio_features = df_tracks[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

# Standardize the data so every feature contributes on a comparable scale
scaler = StandardScaler()
scaled_audio_features = scaler.fit_transform(audio_features)

num_clusters = 3

# n_init is passed explicitly: relying on the default triggers a scikit-learn
# FutureWarning because that default is changing between releases. 10 is the
# value currently in effect, so the cluster labels stay the same.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
df_tracks['audio_features_type'] = kmeans.fit_predict(scaled_audio_features)




# Reduce the artist column to plain text: cast to string, strip leading and
# trailing square brackets, and drop every single quote.
artist_text = df_tracks['artist'].astype(str)
df_tracks['artist'] = artist_text.str.strip("[]").str.replace("'", "")

# Artist names that occur in both tables.
common_artists = set(df_tracks['artist']) & set(df_artists['name'])

# Restrict each table to the shared names, then join tracks to artists by name.
tracks_with_known_artist = df_tracks['artist'].isin(common_artists)
artists_with_tracks = df_artists['name'].isin(common_artists)
df_common_artists = df_tracks.loc[tracks_with_known_artist]
df_common_2 = df_artists.loc[artists_with_tracks]
df_merged = df_common_artists.merge(
    df_common_2, how='inner', left_on='artist', right_on='name')




# Columns carried over from the merged frame into the final table.
columns_to_keep = [
    'track_id', 'duration', 'artist', 'popularity_x', 'album_name',
    'track_genre', 'newness_score', 'audio_features_type', 'followers',
    'popularity_y',
]

# Replace the merge-suffixed names with descriptive ones.
final_column_names = {
    'popularity_x': 'song_popularity',
    'followers': 'artist_followers',
    'popularity_y': 'artist_popularity',
}

df_final = df_merged[columns_to_keep].rename(columns=final_column_names)
df_final


In [19]:

# release_date is a string that comes in several formats (YYYY, YYYY-MM,
# YYYY-MM-DD). Year-only values (exactly 4 characters) are handled first;
# every other value is parsed as a date further down.
mask = df_tracks['release_date'].astype(str).str.len() == 4

# Year-only releases: years up to 2024, +1 to include the release year,
# converted to months.
df_tracks.loc[mask, 'months_elapsed'] = (
    2024 - pd.to_numeric(df_tracks.loc[mask, 'release_date']) + 1
) * 12

reference_date = pd.to_datetime('2024-01-01')

try:
    # Values that cannot be parsed become NaT because of errors='coerce'.
    # NOTE(review): pandas >= 2.0 infers one format from the first value, so a
    # mix of YYYY-MM and YYYY-MM-DD strings may be coerced to NaT — confirm
    # against the installed version (format='ISO8601' would parse both).
    release_dates = pd.to_datetime(
        df_tracks.loc[~mask, 'release_date'], errors='coerce')
    # A month is approximated as 30 days.
    months_elapsed = (reference_date - release_dates).dt.days // 30
    # Unparseable dates end up with 0 months elapsed, i.e. they are treated as
    # if released on the reference date.
    df_tracks.loc[~mask, 'months_elapsed'] = months_elapsed.fillna(0)
except ValueError as e:
    print(f"Error: {e}")
    print("Problematic values:")
    print(df_tracks.loc[~mask, 'release_date'])
    df_tracks.loc[~mask, 'months_elapsed'] = np.nan


# Min-max normalise 'months_elapsed' and invert it, so the smallest elapsed
# time scores 1 and the largest scores 0.
# The bounds are computed here explicitly: the original expression read an
# undefined name (min_months_elapsed) and only worked with leftover kernel state.
min_months_elapsed = df_tracks['months_elapsed'].min()
max_months_elapsed = df_tracks['months_elapsed'].max()
df_tracks['newness_score'] = 1 - (
    (df_tracks['months_elapsed'] - min_months_elapsed)
    / (max_months_elapsed - min_months_elapsed)
)
df_tracks['newness_score'] = df_tracks['newness_score'].clip(0, 1)




In [20]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Extract relevant audio features for clustering
audio_features = df_tracks[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                            'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']]

# Standardize the data so every feature contributes on a comparable scale
scaler = StandardScaler()
scaled_audio_features = scaler.fit_transform(audio_features)

num_clusters = 3

# n_init is passed explicitly: relying on the default triggers a scikit-learn
# FutureWarning because that default is changing between releases. 10 is the
# value currently in effect, so the cluster labels stay the same.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
df_tracks['audio_features_type'] = kmeans.fit_predict(scaled_audio_features)



  super()._check_params_vs_input(X, default_n_init=10)


In [21]:
# Reduce the artist column to plain text: cast to string, strip leading and
# trailing square brackets, and drop every single quote.
artist_text = df_tracks['artist'].astype(str)
df_tracks['artist'] = artist_text.str.strip("[]").str.replace("'", "")

# Artist names that occur in both tables.
common_artists = set(df_tracks['artist']) & set(df_artists['name'])

# Restrict each table to the shared names, then join tracks to artists by name.
tracks_with_known_artist = df_tracks['artist'].isin(common_artists)
artists_with_tracks = df_artists['name'].isin(common_artists)
df_common_artists = df_tracks.loc[tracks_with_known_artist]
df_common_2 = df_artists.loc[artists_with_tracks]
df_merged = df_common_artists.merge(
    df_common_2, how='inner', left_on='artist', right_on='name')



In [22]:
# Inspect the merged frame's columns. Names present in both inputs carry the
# merge suffixes: _x for the tracks side (left), _y for the artists side (right).
df_merged.columns

Index(['track_id', 'duration', 'artist', 'name_x', 'popularity_x',
       'release_date', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'album_name', 'track_genre',
       'months_elapsed', 'newness_score', 'audio_features_type', 'id',
       'followers', 'name_y', 'popularity_y'],
      dtype='object')

In [23]:
# Columns carried over from the merged frame into the final table.
columns_to_keep = [
    'track_id', 'duration', 'artist', 'popularity_x', 'album_name',
    'track_genre', 'newness_score', 'audio_features_type', 'followers',
    'popularity_y',
]

# Replace the merge-suffixed names with descriptive ones.
final_column_names = {
    'popularity_x': 'song_popularity',
    'followers': 'artist_followers',
    'popularity_y': 'artist_popularity',
}

df_final = df_merged[columns_to_keep].rename(columns=final_column_names)
df_final

Unnamed: 0,track_id,duration,artist,song_popularity,album_name,track_genre,newness_score,audio_features_type,artist_followers,artist_popularity
0,0BRXJHRNGQ3W4v9frnSfhu,178933,Dick Haymes,0,Classical Christmas,classical,0.000000,2,11327.0,35
1,6hH0kKnTWoGq5wppHKj5ie,172933,Dick Haymes,0,Stardust,dubstep,0.000000,2,11327.0,35
2,5uW3FmIzoRwuMo4zG1FUmv,135533,Dick Haymes,5,Misbehavin',garage,0.135922,2,11327.0,35
3,0kDOvwfAXewtwQKbIBDGOu,187880,Dick Haymes,4,English Rain,acoustic,0.135922,2,11327.0,35
4,0WBFrDVT9sbuglReraQkCh,178467,Dick Haymes,0,On Eagle's Wings,opera,0.165049,2,11327.0,35
...,...,...,...,...,...,...,...,...,...,...
24137,0BEpQDUZxnvqf3KLk5O8kg,282760,Nuradee,11,Diam,folk,0.728964,2,124.0,10
24138,5AM8iryxHyZrIJ3PxsB5JY,53480,Dr. Jean Feldman,38,Dr. Jean and Friends,children,0.748382,1,4905.0,41
24139,5aKeoAh8dF3mZEt5Av3LvZ,208360,Fred Mollin,41,Disney's Lullaby Album Vol. 2,disney,0.813107,2,3030.0,63
24140,3ZVSZs90JLO9QrNqrrhXiz,121071,Listener Kids,54,Jesus Loves Me,children,0.927994,2,6897.0,49
