<a href="https://colab.research.google.com/github/tawhidliyon/Machine-Learning/blob/main/Spotify_personalized_song_recommendations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset
data = pd.read_csv('spotify_synthetic_data.csv')

# Display the first few rows
print(data.head())

   user_id  song_id            timestamp  repeated_listen
0      103      442  2023-06-29 20:36:00                0
1      436      279  2023-03-13 14:58:00                0
2      861      251  2023-08-15 05:59:00                0
3      271      310  2023-10-28 11:18:00                1
4      107      208  2023-09-10 17:55:00                0


In [3]:
# Convert timestamp to datetime
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Feature engineering: Extract features from timestamp
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['hour_of_day'] = data['timestamp'].dt.hour
data['month'] = data['timestamp'].dt.month

# Drop the original timestamp column
data = data.drop(columns=['timestamp'])

# Encode categorical variables (if any)
data = pd.get_dummies(data, columns=['user_id', 'song_id'], drop_first=True)

# Display the updated dataset
print(data.head())

   repeated_listen  day_of_week  hour_of_day  month  user_id_2  user_id_3  \
0                0            3           20      6      False      False   
1                0            0           14      3      False      False   
2                0            1            5      8      False      False   
3                1            5           11     10      False      False   
4                0            6           17      9      False      False   

   user_id_4  user_id_5  user_id_6  user_id_7  ...  song_id_491  song_id_492  \
0      False      False      False      False  ...        False        False   
1      False      False      False      False  ...        False        False   
2      False      False      False      False  ...        False        False   
3      False      False      False      False  ...        False        False   
4      False      False      False      False  ...        False        False   

   song_id_493  song_id_494  song_id_495  song_id_496  s

In [4]:
# Separate features and target variable
X = data.drop(columns=['repeated_listen'])
y = data['repeated_listen']

# Display shapes
print("Features shape:", X.shape)
print("Target shape:", y.shape)

Features shape: (10000, 1501)
Target shape: (10000,)


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display shapes
print("Training data shape:", X_train.shape, y_train.shape)
print("Testing data shape:", X_test.shape, y_test.shape)

Training data shape: (8000, 1501) (8000,)
Testing data shape: (2000, 1501) (2000,)


In [6]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Display standardized features
print("Standardized training data (first 5 rows):\n", X_train[:5])

Standardized training data (first 5 rows):
 [[-0.01038465 -0.80243396  1.31199365 ... -0.04476615 -0.0316386
  -0.0316386 ]
 [ 1.49101052 -1.52911177 -1.58615888 ... -0.04476615 -0.0316386
  -0.0316386 ]
 [-0.01038465  0.36025052 -1.58615888 ... -0.04476615 -0.0316386
  -0.0316386 ]
 [ 0.49008041 -0.36642728  1.6018089  ... -0.04476615 -0.0316386
  -0.0316386 ]
 [-0.51084971  0.94159276 -1.29634362 ... -0.04476615 -0.0316386
  -0.0316386 ]]


In [7]:
# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

print("Model training complete!")

Model training complete!


In [8]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.794
Confusion Matrix:
 [[1576   36]
 [ 376   12]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.98      0.88      1612
           1       0.25      0.03      0.06       388

    accuracy                           0.79      2000
   macro avg       0.53      0.50      0.47      2000
weighted avg       0.70      0.79      0.72      2000

