Competition on DataCamp

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load the Spotify track dataset from a CSV file in the working directory.
spotify = pd.read_csv('spotify.csv')

# Only the last expression of a cell is rendered, so a bare `spotify.shape`
# placed before `spotify.head()` is evaluated and thrown away. Print it so the
# (rows, columns) tuple actually shows up in the output.
print(spotify.shape)
spotify.head()

Check whether any `track_id` appears in more than one row (duplicate tracks).

In [None]:
# keep=False marks every member of a repeated track_id group (not only the
# second and later occurrences), so the slice holds all rows involved.
shares_track_id = spotify['track_id'].duplicated(keep=False)
duplicated = spotify[shares_track_id]
print(duplicated.shape)

# Number of rows recorded for each repeated track_id.
duplicated.groupby('track_id')['track_id'].count()

Remove the duplicate rows, keeping the first occurrence of each `track_id`.

In [None]:
# Flag the second and later occurrences of each track_id, then keep the rest
# (i.e. the first row seen for every track_id).
is_repeat = spotify['track_id'].duplicated(keep='first')
spotify = spotify[~is_repeat]
spotify.shape

In [None]:
# Select every column whose dtype is not `object` for scaling.
# NOTE(review): despite the name, `float_df` also keeps any integer/boolean
# columns present in the data -- confirm that is intended.
float_df = spotify.select_dtypes(exclude='object')

# Convert the selected columns to a NumPy array for the scaler.
data = float_df.values

# Standardize each column with StandardScaler (fit and transform in one step).
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Wrap the standardized array in a DataFrame with the original column names.
# The new frame gets a fresh default index, not the index of `float_df`.
df_scaled = pd.DataFrame(data_scaled, columns=float_df.columns)

# A bare `df_scaled.head()` in the middle of a cell is discarded (only the
# last expression is rendered), so print the preview explicitly.
print(df_scaled.head())

# Configure the figure size BEFORE drawing: the rc setting only applies to
# figures created afterwards, so calling sns.set() after sns.heatmap() left
# this heatmap at the previous default size.
sns.set(rc={'figure.figsize': (17, 17)})
dataplot = sns.heatmap(df_scaled.corr(), cmap="YlGnBu", annot=True)

# Display the correlation heatmap.
plt.show()

Random forest: rank the remaining features by their importance for predicting danceability.

In [None]:
# Predictors are all scaled columns except the target; the target itself is
# taken from the (unscaled) `spotify` frame.
df_scaled_wo_d = df_scaled.drop(columns=['danceability'])
X, y = df_scaled_wo_d, spotify['danceability']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 50-tree random forest on the training portion.
rf_regressor = RandomForestRegressor(n_estimators=50, random_state=42)
rf_regressor.fit(X_train, y_train)

# Pair each feature name with its importance and order from most to least
# important.
feature_importances = rf_regressor.feature_importances_
feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
feature_importance_df = feature_importance_df.sort_values(
    by='Importance', ascending=False
)

# Number of top-ranked features to report and plot.
N = 5
top_features = feature_importance_df.head(N)

print(f"Top {N} Important Features:")
print(top_features)

# Horizontal bar chart of the top features; the y-axis is inverted so the
# most important feature is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['Feature'], top_features['Importance'])
ax.set_xlabel('Importance')
ax.set_title('Top Features for Predicting Target (Random Forest)')
ax.invert_yaxis()
plt.show()

In [None]:
# Set the 10x10 figure size BEFORE plotting: the rc setting only affects
# figures created afterwards, so applying it after sns.scatterplot() left this
# plot at the previously configured size.
sns.set(rc={'figure.figsize': (10, 10)})

# Scaled danceability (x) against scaled valence (y).
sns.scatterplot(data=df_scaled, x='danceability', y='valence')
plt.show()

In [None]:
# Scaled danceability (x) against scaled loudness (y).
sns.scatterplot(
    x='danceability',
    y='loudness',
    data=df_scaled,
)
plt.show()

In [None]:
# Scaled danceability (x) against scaled acousticness (y).
sns.scatterplot(
    x='danceability',
    y='acousticness',
    data=df_scaled,
)
plt.show()

Create the model. Valence seems to be the best single predictor of danceability, so it is used as the only input feature.

In [None]:
# Single-feature design matrix: the double brackets keep `valence` as a
# one-column 2-D array. A wider alternative that was tried is
# ['valence', 'loudness', 'acousticness'].
X = df_scaled[['valence']].to_numpy()
y = df_scaled['danceability'].to_numpy()

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Seed Python, NumPy and TensorFlow so the randomly initialised weights are
# repeatable from one run of the notebook to the next.
tf.keras.utils.set_random_seed(42)

# Take the input width from the training matrix instead of hard-coding it, so
# the network stays in sync with whichever feature columns were selected
# (currently 1: valence).
n_features = X_train.shape[1]

# Fully connected regression network: three ReLU hidden layers (128, 64, 32
# units) followed by a single linear output unit.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),  # One input per feature column
    tf.keras.layers.Dense(128, activation='relu'),  # Hidden layer with ReLU activation
    tf.keras.layers.Dense(64, activation='relu'),  # Hidden layer with ReLU activation
    tf.keras.layers.Dense(32, activation='relu'),  # Hidden layer with ReLU activation
    tf.keras.layers.Dense(1)  # Output layer with 1 neuron (for regression)
])

# Compile the model with the Adam optimizer and mean-squared-error loss.
model.compile(optimizer='adam', loss='mse')

# Print a summary of the model
model.summary()

In [None]:
# Train for 5 epochs; the held-out split is passed as validation data so its
# loss is reported after every epoch.
model.fit(
    x=X_train,
    y=y_train,
    epochs=5,
    validation_data=(X_test, y_test),
)

In [None]:
# Score the trained network on the held-out split and report the result.
loss = model.evaluate(x=X_test, y=y_test)
print(f'Test Loss: {loss}')

residual check for linearity