In [None]:
import os

import pandas as pd

# Location of the input CSV. The default is the original author's local path;
# set the SPOTIFY_CSV_PATH environment variable to run the notebook on another
# machine without editing this cell.
DATA_PATH = os.environ.get(
    "SPOTIFY_CSV_PATH",
    "C:/Users/Prasad/Downloads/spotify_recommendation_KNNC.csv",
)

# Load the raw listening data into a DataFrame.
df = pd.read_csv(DATA_PATH)

# Preview only the first rows instead of echoing the whole frame.
df.head()

In [None]:
# 1. Data-quality report: null count per column, then the number of
#    fully duplicated rows.
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()
print(missing_per_column)
print("Duplicates:", duplicate_row_count)

In [None]:
# 2. Parse the Last_Played column into pandas datetimes so the `.dt`
#    accessor can be used on it later.
parsed_last_played = pd.to_datetime(df['Last_Played'])
df['Last_Played'] = parsed_last_played

In [None]:
# 3. Replace missing values in the categorical columns with an explicit
#    'Unknown' label.
for column in ('Genre', 'Artist'):
    df[column] = df[column].fillna('Unknown')

In [None]:
# 4. Build the binary target: Liked = 1 when a row has at least 15 plays,
#    otherwise 0 (15 is an example threshold).
liked_mask = df['Plays'].ge(15)
df['Liked'] = liked_mask.astype(int)

In [None]:
# --- 1. Play-count flag ------------------------------------------------------
# No global popularity counts are available, so approximate with a 0/1 flag
# for rows that reached at least 15 plays.
high_play_mask = df['Plays'].ge(15)
df['High_Play'] = high_play_mask.astype(int)

# --- 2. Time-of-play features ------------------------------------------------
last_played_parts = df['Last_Played'].dt
df['Last_Played_DayOfWeek'] = last_played_parts.dayofweek  # 0=Mon
df['Last_Played_Hour'] = last_played_parts.hour

# --- 3. Artist frequency flag ------------------------------------------------
# Artist_is_top = 1 for artists that appear on more than one row.
top_artists = df['Artist'].value_counts()
repeated_artist_names = top_artists[top_artists > 1].index
df['Artist_is_top'] = df['Artist'].isin(repeated_artist_names).astype(int)

# --- 4. Per-user play statistics ---------------------------------------------
# Mean and max of Plays per User_ID, joined back onto every row of that user.
user_agg = (
    df.groupby('User_ID')['Plays']
      .agg(User_Plays_Mean='mean', User_Plays_Max='max')
      .reset_index()
)
df = df.merge(user_agg, on='User_ID', how='left')

# --- 5. Genre indicators -----------------------------------------------------
# One-hot encode Genre; drop_first removes one level as the reference category.
df = pd.get_dummies(df, columns=['Genre'], drop_first=True)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Continuous columns to standardise. KNN is distance-based, so features on
# larger numeric scales would otherwise dominate the neighbour search.
num_cols = ['Plays','User_Plays_Mean','User_Plays_Max','Last_Played_DayOfWeek','Last_Played_Hour']

# Identifier, free-text and raw-timestamp columns that are not model features.
non_feature_cols = ['User_ID','Song_ID','Song_Name','Artist','Last_Played','Recommended_Songs']

# Target leakage guard: 'Liked' is defined as Plays >= 15 and 'High_Play' is
# the very same expression, so leaving either 'Plays' or 'High_Play' in X hands
# the label to the model. errors='ignore' keeps this cell working even if
# 'High_Play' is not created upstream.
# NOTE(review): User_Plays_Mean / User_Plays_Max are also derived from Plays
# (including each row's own value) - consider whether they should stay.
leaky_cols = ['Plays', 'High_Play']

X = (
    df.drop(columns=non_feature_cols + ['Liked'])
      .drop(columns=leaky_cols, errors='ignore')
)
y = df['Liked']

# Stratified 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Split FIRST, then scale: the scaler's mean/std are learned from the training
# rows only and merely applied to the test rows, so no test-set statistics
# leak into training. `df` itself is left unscaled.
scale_cols = [col for col in num_cols if col in X_train.columns]
scaler = StandardScaler()

# Work on copies so the column assignment below does not write into a slice
# of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()
X_train[scale_cols] = scaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = scaler.transform(X_test[scale_cols])


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Baseline: k-nearest-neighbours with k=5 (the scikit-learn default),
# fitted on the training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = knn.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)
baseline_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", baseline_accuracy)
print(baseline_report)
print("Confusion Matrix:\n", baseline_confusion)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for KNN.
param_grid = dict(
    n_neighbors=[1, 3, 5, 7, 9],
    weights=['uniform', 'distance'],
    p=[1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# Exhaustive 5-fold cross-validated search on the training split, ranked by
# F1, using all available cores.
grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score (F1):", grid.best_score_)

# Score the best estimator found by the search on the held-out test split.
best_knn = grid.best_estimator_
y_pred_best = best_knn.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_best)
print("Test Accuracy:", tuned_accuracy)
print(classification_report(y_test, y_pred_best))
