Competition on DataCamp

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import tensorflow as tf

# Load the raw track table and report its dimensions before any cleaning.
spotify = pd.read_csv('spotify.csv')
row_count, column_count = spotify.shape
print((row_count, column_count))
spotify.head()

In [None]:
# Show the dtype pandas inferred for every column of the loaded table.
spotify.dtypes

Check whether any `track_id` appears more than once.

In [None]:
# keep=False flags every occurrence of a repeated track_id (not only the
# later repeats), so the per-id counts below include the first occurrence too.
is_repeated = spotify['track_id'].duplicated(keep=False)
duplicated = spotify.loc[is_repeated]
duplicated.groupby('track_id')['track_id'].count()

Remove duplicate tracks (keeping the first occurrence of each `track_id`) and drop rows with missing values.

In [None]:
# Clean the table in one chain so every step works on a fresh DataFrame:
#   1. keep only the first row for each track_id,
#   2. drop rows containing any missing value,
#   3. use track_id as the index.
# The previous version called dropna(inplace=True) on a boolean-filtered
# slice, which can trigger pandas' SettingWithCopyWarning, and compared a
# boolean mask with `== False`.
spotify = (
    spotify.drop_duplicates(subset='track_id', keep='first')
    .dropna()
    .set_index('track_id')
)
spotify

In [None]:
# Work only with the non-object columns; StandardScaler rescales each of them
# to zero mean and unit variance.
numeric_df = spotify.select_dtypes(exclude='object')
data = numeric_df.values

scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Rebuild a DataFrame so the scaled columns keep their original names.
scaled_numeric_df = pd.DataFrame(data_scaled, columns=numeric_df.columns)

# The figure size has to be configured BEFORE the heatmap is drawn: the rc
# setting only applies to figures created afterwards, so setting it after
# sns.heatmap() left the correlation matrix at the previous size.
sns.set(rc={'figure.figsize': (17, 17)})
dataplot = sns.heatmap(scaled_numeric_df.corr(), cmap="YlGnBu", annot=True)
plt.show()

Random forest: rank the features by their importance for predicting danceability.

In [None]:
N = 5  # number of top-ranked features to report and plot

In [None]:
# Predictor table: every scaled numeric column except the target column.
scaled_numeric_df_wo_d = scaled_numeric_df.drop('danceability', axis=1)

In [None]:
# Target is the scaled danceability column; the features are all the other
# scaled numeric columns.
y = scaled_numeric_df['danceability']
X = scaled_numeric_df_wo_d

In [None]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 50-tree forest and read back its per-feature importances.
rf_regressor = RandomForestRegressor(n_estimators=50, random_state=42)
rf_regressor.fit(X_train, y_train)
feature_importances = rf_regressor.feature_importances_

# Rank the features from most to least important and keep the first N.
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)
top_features = feature_importance_df.head(N)

print(f"Top {N} Important Features:")
print(top_features)

# Horizontal bar chart; the y axis is inverted so the most important
# feature is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['Feature'], top_features['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Top Features for Predicting Target (Random Forest)')
ax.invert_yaxis()
plt.show()

Create the model. Valence seems to be the best predictor of danceability.

In [None]:
# Input features for the danceability models. Only columns present in the
# scaled numeric table may be listed here: 'track_genre' was removed because
# that table is built with select_dtypes(exclude='object'), so selecting it
# raised a KeyError, and it also made the list one wider than the model's
# 4-feature input layer.
used_features = ['valence', 'loudness', 'acousticness', 'tempo']

In [None]:
# Feature matrix and target as plain NumPy arrays, split 80/20 with a fixed
# seed so the split is reproducible.
y = scaled_numeric_df['danceability'].to_numpy()
X = scaled_numeric_df[used_features].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Fully connected regression network: three ReLU hidden layers (128, 64, 32
# units) and a single linear output unit.
# The input width is taken from the training matrix instead of being
# hard-coded to 4, so the model stays in sync with the selected features.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1),
])

# Mean squared error loss with the Adam optimizer.
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train silently (verbose=0) for 10 epochs.
# NOTE(review): the held-out test split is passed as validation data here and
# is evaluated again afterwards — confirm this reuse is intended.
model.fit(X_train, y_train, epochs=10, verbose=0, validation_data=(X_test, y_test))

In [None]:
# Loss of the trained network on the held-out split.
loss = model.evaluate(x=X_test, y=y_test)
print(f'Test Loss: {loss}')

Predict danceability for a random sample of 100 tracks.

In [None]:
# Draw a reproducible sample of 100 rows to compare predictions with the
# actual (scaled) danceability values.
to_predict = scaled_numeric_df[used_features + ['danceability']].sample(n=100, random_state=2)
X_predict = to_predict[used_features].values

# Predict the whole sample in one call instead of one model.predict() per row
# with a hard-coded reshape(1, 4): this is faster and works for any number of
# features.
predicted = model.predict(X_predict, verbose=0).ravel()
actual = to_predict['danceability'].to_numpy()

# Each entry is [prediction, actual] as plain floats, so the list converts
# cleanly to an (n, 2) array (the old [ndarray, float] pairs were ragged).
y_result = [[float(p), float(a)] for p, a in zip(predicted, actual)]

In [None]:
# Build an (n, 2) float array: column 0 = prediction, column 1 = actual value.
# np.squeeze lets this accept predictions stored either as floats or as
# single-element arrays.
acc_array = np.array(
    [[float(np.squeeze(pred)), float(actual)] for pred, actual in y_result]
)

# Order the datapoints by predicted value, largest first.
ind = np.argsort(-acc_array[:, 0])
sorted_pairs = acc_array[ind]  # renamed: `sorted` shadowed the builtin

# Compare predictions with the actual values. The legend labels were swapped
# before: column 0 holds the predictions, column 1 the real values.
plt.plot(sorted_pairs[:, 0], label='Prediction')
plt.plot(sorted_pairs[:, 1], label='Real values')
plt.title('Test Accuracy')
plt.ylabel('Values')
plt.xlabel('Datapoints')
plt.legend(loc='upper left')
plt.show()

decision tree regressor

In [None]:
# Fit a decision tree on the same training split; a node is only split when
# it holds at least 10 samples.
regressor = DecisionTreeRegressor(min_samples_split=10, random_state=42).fit(
    X_train, y_train
)

# Score the tree on the held-out split using mean squared error.
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

In [None]:
# Predict the whole sample in one call instead of one regressor.predict() per
# row with a hard-coded reshape(1, 4).
predicted_dcr = regressor.predict(X_predict)
actual_dcr = to_predict['danceability'].to_numpy()

# Each entry is [prediction, actual] as plain floats, so the list converts
# cleanly to an (n, 2) array (the old [ndarray, float] pairs were ragged).
y_result_dcr = [[float(p), float(a)] for p, a in zip(predicted_dcr, actual_dcr)]

In [None]:
# Use the decision-tree results (y_result_dcr). This cell previously re-read
# y_result, so it plotted the neural-network predictions a second time.
# Column 0 = prediction, column 1 = actual value; np.squeeze lets this accept
# predictions stored either as floats or as single-element arrays.
acc_array = np.array(
    [[float(np.squeeze(pred)), float(actual)] for pred, actual in y_result_dcr]
)

# Order the datapoints by predicted value, largest first.
ind = np.argsort(-acc_array[:, 0])
sorted_pairs = acc_array[ind]  # renamed: `sorted` shadowed the builtin

# Compare predictions with the actual values. The legend labels were swapped
# before: column 0 holds the predictions, column 1 the real values.
plt.plot(sorted_pairs[:, 0], label='Prediction')
plt.plot(sorted_pairs[:, 1], label='Real values')
plt.title('Test Accuracy')
plt.ylabel('Values')
plt.xlabel('Datapoints')
plt.legend(loc='upper left')
plt.show()

Limit the data to tracks whose tempo lies between 116 and 140 (inclusive).

In [None]:
# Keep only the tracks whose tempo lies in the closed interval [116, 140].
tempo_in_range = spotify['tempo'].between(116, 140)
spotify = spotify[tempo_in_range]

In [None]:
# (rows, columns) of the table after the tempo filter.
spotify.shape

In [None]:
# Set the figure size BEFORE plotting: the rc setting only affects figures
# created afterwards, so calling sns.set() after scatterplot() left this plot
# at the previously configured size.
sns.set(rc={'figure.figsize': (10, 10)})

# Danceability against valence for the tempo-filtered tracks.
sns.scatterplot(data=spotify, x='danceability', y='valence')
plt.show()

In [None]:
# Danceability against loudness for the tempo-filtered tracks.
sns.scatterplot(x='danceability', y='loudness', data=spotify)
plt.show()

In [None]:
# Danceability against acousticness for the tempo-filtered tracks.
sns.scatterplot(x='danceability', y='acousticness', data=spotify)
plt.show()