**RECOMMENDER SYSTEM**

TRAINING

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from pymongo import MongoClient
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import pickle
import time
from tqdm import tqdm 

# Open the MongoDB connection shared by the cells in this section.
# NOTE(review): "MONGODB-URL" and "DATABASE-NAME" look like placeholders — replace with real values.
client = MongoClient("MONGODB-URL")
db = client["DATABASE-NAME"]


In [None]:
def retrieve_all_data():
    """Load every collection from MongoDB and build the rating triples.

    Reads the ``ratings``, ``songs``, ``user_activity`` and ``users``
    collections through the module-level ``db`` handle.

    Returns:
        tuple: ``(user_ids, track_ids, ratings, songs_data, merged_data)``.
        ``user_ids`` / ``track_ids`` are the row / column *positions* of every
        non-zero cell of the user x track rating matrix, ``ratings`` holds the
        matching cell values, ``songs_data`` is the songs collection as a
        DataFrame, and ``merged_data`` is the ratings table left-joined with
        the activity and user tables on ``user_id``.
    """
    ratings_data = pd.DataFrame(list(db["ratings"].find()))
    songs_data = pd.DataFrame(list(db["songs"].find()))
    activity_data = pd.DataFrame(list(db["user_activity"].find()))
    user_data = pd.DataFrame(list(db["users"].find()))

    # Cells without a rating are filled with 0 and therefore dropped by nonzero().
    user_song_ratings = pd.pivot_table(
        ratings_data, values='rating', index='user_id', columns='track_id'
    ).fillna(0)
    rating_matrix = user_song_ratings.to_numpy()
    user_ids, track_ids = np.nonzero(rating_matrix)
    # One vectorised lookup instead of a Python-level `.iloc` call per cell.
    ratings = rating_matrix[user_ids, track_ids]

    # NOTE(review): merged_data has one row per ratings document, while the
    # triples follow the pivot's row/column order — confirm that callers pairing
    # them row by row really get aligned rows.
    merged_data = ratings_data.merge(activity_data, on="user_id", how="left").merge(user_data, on="user_id", how="left")
    return user_ids, track_ids, ratings, songs_data, merged_data

In [None]:
class NCFWithDemographics(nn.Module):
    """Feed-forward recommender over six concatenated embeddings.

    User, item, genre, language, age and gender indices are each embedded into
    ``n_factors`` dimensions, concatenated, passed through two ReLU layers
    (64 and 32 units) and squashed by a sigmoid into a single score per row.
    """

    def __init__(self, n_users, n_items, n_factors, n_genres, n_languages):
        super().__init__()

        # Interaction tables.
        self.user_embedding = nn.Embedding(n_users, n_factors)
        self.item_embedding = nn.Embedding(n_items, n_factors)

        # Side-information tables; age has 100 rows and gender has 2 rows.
        self.genre_embedding = nn.Embedding(n_genres, n_factors)
        self.language_embedding = nn.Embedding(n_languages, n_factors)
        self.age_embedding = nn.Embedding(100, n_factors)
        self.gender_embedding = nn.Embedding(2, n_factors)

        # Six embeddings of width n_factors feed the first dense layer.
        concat_width = 6 * n_factors
        self.fc1 = nn.Linear(concat_width, 64)
        self.fc2 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)

    def forward(self, user_id, item_id, genre_id, language_id, age, gender):
        """Return a ``(batch, 1)`` tensor of sigmoid scores for the given index tensors."""
        lookups = (
            (self.user_embedding, user_id),
            (self.item_embedding, item_id),
            (self.genre_embedding, genre_id),
            (self.language_embedding, language_id),
            (self.age_embedding, age),
            (self.gender_embedding, gender),
        )
        hidden = torch.cat([table(indices) for table, indices in lookups], dim=1)
        for dense in (self.fc1, self.fc2):
            hidden = torch.relu(dense(hidden))
        score = self.output(hidden)
        return torch.sigmoid(score)


In [47]:
class RatingDataset(Dataset):
    """Tensor-backed dataset yielding one rating row per index.

    Every column is converted to a tensor up front: ratings become a
    ``FloatTensor`` and all other columns a ``LongTensor``.
    """

    def __init__(self, user_ids, track_ids, ratings, genres, languages, ages, genders):
        to_index = torch.LongTensor
        self.user_ids = to_index(user_ids)
        self.track_ids = to_index(track_ids)
        self.ratings = torch.FloatTensor(ratings)
        self.genres = to_index(genres)
        self.languages = to_index(languages)
        self.ages = to_index(ages)
        self.genders = to_index(genders)

    def __len__(self):
        # The rating column defines how many samples exist.
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return ``(user, track, rating, genre, language, age, gender)`` at ``idx``."""
        columns = (
            self.user_ids,
            self.track_ids,
            self.ratings,
            self.genres,
            self.languages,
            self.ages,
            self.genders,
        )
        return tuple(column[idx] for column in columns)


In [None]:
def save_model_and_encoders(model, genre_encoder, language_encoder):
    """Persist the model weights and both label encoders.

    Args:
        model: Module whose ``state_dict()`` is written with ``torch.save``.
        genre_encoder: Object pickled to the genre encoder file.
        language_encoder: Object pickled to the language encoder file.
    """
    # Raw strings keep the backslashes literal. In an ordinary literal the
    # "\n" in 'Files\ncf_model.pth' is a newline character, and "\g" / "\l"
    # are invalid escape sequences.
    torch.save(model.state_dict(), r'Files\ncf_model.pth')

    encoder_files = (
        (r'Files\genre_encoder.pkl', genre_encoder),
        (r'Files\language_encoder.pkl', language_encoder),
    )
    for path, encoder in encoder_files:
        with open(path, 'wb') as f:
            pickle.dump(encoder, f)

In [49]:
# Fetch the rating triples and the song / merged user tables once, up front.
user_ids, track_ids, ratings, songs_data, merged_data = retrieve_all_data()

Retrieving: 253.22670197486877
UserRatingMatrix: 292.9425401687622
1


In [None]:
def train_model(user_ids, track_ids, ratings, songs_data, merged_data):
    """Train an ``NCFWithDemographics`` model for 10 epochs and save it.

    Args:
        user_ids: Integer user indices, one per rating.
        track_ids: Integer track indices, one per rating.
        ratings: Rating values, one per (user, track) pair.
        songs_data: Accepted for interface symmetry; not read by this function.
        merged_data: DataFrame with ``preferred_genre``, ``preferred_language``,
            ``user_age`` and ``user_gender`` columns. ``genre_id`` and
            ``language_id`` columns are added to it in place.

    Returns:
        tuple: ``(model, genre_encoder, language_encoder)``.
    """
    genre_encoder = LabelEncoder()
    language_encoder = LabelEncoder()

    # Side effect kept from the original: the encoded columns are written
    # back onto the caller's DataFrame.
    merged_data['genre_id'] = genre_encoder.fit_transform(merged_data['preferred_genre'])
    merged_data['language_id'] = language_encoder.fit_transform(merged_data['preferred_language'])

    ages = merged_data['user_age'].values
    genders = merged_data['user_gender'].values
    genres = merged_data['genre_id'].values
    languages = merged_data['language_id'].values

    # NOTE(review): the dataset pairs row k of merged_data with the k-th
    # (user, track, rating) triple — confirm both are in the same order and
    # of the same length.
    train_dataset = RatingDataset(user_ids, track_ids, ratings, genres, languages, ages, genders)
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
    n_batches = len(train_loader)

    # Size the tables by the largest index, not by the number of distinct
    # indices: with gaps in the id range the distinct count is too small and
    # the lookup would go out of bounds.
    n_users = int(np.max(user_ids)) + 1 if len(user_ids) else 0
    n_items = int(np.max(track_ids)) + 1 if len(track_ids) else 0
    n_genres = len(genre_encoder.classes_)
    n_languages = len(language_encoder.classes_)

    model = NCFWithDemographics(n_users, n_items, 20, n_genres, n_languages)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    epochs = 10
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        epoch_start_time = time.time()

        with tqdm(train_loader, total=n_batches, desc=f'Epoch {epoch+1}/{epochs}', unit='batch') as pbar:
            for batches_done, batch in enumerate(pbar, start=1):
                user_id, item_id, rating, genre_id, language_id, age, gender = batch
                optimizer.zero_grad()
                prediction = model(user_id, item_id, genre_id, language_id, age, gender)
                # squeeze(-1) keeps the batch axis even when the last batch
                # holds a single sample.
                loss = criterion(prediction.squeeze(-1), rating)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

                # Running mean over the batches processed so far.
                pbar.set_postfix(loss=total_loss / batches_done)

        elapsed = time.time() - epoch_start_time
        mean_loss = total_loss / n_batches if n_batches else 0.0
        print(f"Epoch {epoch+1}/{epochs} completed in {elapsed:.2f} seconds, Loss: {mean_loss:.4f}")

    save_model_and_encoders(model, genre_encoder, language_encoder)
    return model, genre_encoder, language_encoder


In [None]:
# Run the full training loop; train_model also writes the weights and encoders to disk.
model, genre_encoder, language_encoder = train_model(user_ids, track_ids, ratings, songs_data, merged_data)

2
EncoderData: 0.4942340850830078 seconds
TrainDataset: 0.00517582893371582 seconds
TrainLoader: 0.0 seconds


Epoch 1/10: 100%|██████████| 9618/9618 [06:11<00:00, 25.87batch/s, loss=0.000327]


Epoch 1/10 completed in 371.80 seconds, Loss: 0.0003


Epoch 2/10: 100%|██████████| 9618/9618 [06:04<00:00, 26.38batch/s, loss=6.68e-11]


Epoch 2/10 completed in 364.58 seconds, Loss: 0.0000


Epoch 3/10: 100%|██████████| 9618/9618 [06:08<00:00, 26.07batch/s, loss=2.33e-12]


Epoch 3/10 completed in 368.90 seconds, Loss: 0.0000


Epoch 4/10: 100%|██████████| 9618/9618 [06:12<00:00, 25.83batch/s, loss=4.6e-13] 


Epoch 4/10 completed in 372.42 seconds, Loss: 0.0000


Epoch 5/10: 100%|██████████| 9618/9618 [06:14<00:00, 25.69batch/s, loss=2.24e-13]


Epoch 5/10 completed in 374.35 seconds, Loss: 0.0000


Epoch 6/10: 100%|██████████| 9618/9618 [06:13<00:00, 25.76batch/s, loss=1.48e-13]


Epoch 6/10 completed in 373.40 seconds, Loss: 0.0000


Epoch 7/10: 100%|██████████| 9618/9618 [06:19<00:00, 25.35batch/s, loss=1.11e-13]


Epoch 7/10 completed in 379.40 seconds, Loss: 0.0000


Epoch 8/10: 100%|██████████| 9618/9618 [06:30<00:00, 24.64batch/s, loss=8.88e-14]


Epoch 8/10 completed in 390.39 seconds, Loss: 0.0000


Epoch 9/10: 100%|██████████| 9618/9618 [07:15<00:00, 22.11batch/s, loss=7.42e-14]


Epoch 9/10 completed in 435.05 seconds, Loss: 0.0000


Epoch 10/10: 100%|██████████| 9618/9618 [07:07<00:00, 22.48batch/s, loss=6.38e-14]

Epoch 10/10 completed in 427.89 seconds, Loss: 0.0000
Training complete in 3858.17 seconds
TotalTraining:3859.370309829712





LOAD AND PREDICT

In [None]:
def load_model_and_encoders(n_users, n_items, n_factors, n_genres, n_languages):
    """Rebuild the network, restore its weights and load both label encoders.

    Args:
        n_users, n_items, n_factors, n_genres, n_languages: Constructor
            arguments for ``NCFWithDemographics``; ``load_state_dict`` requires
            the resulting parameter shapes to match the saved checkpoint.

    Returns:
        tuple: ``(model, genre_encoder, language_encoder)`` with the model in
        evaluation mode.
    """
    model = NCFWithDemographics(n_users, n_items, n_factors, n_genres, n_languages)
    # Raw strings keep the backslashes literal; in an ordinary literal the
    # "\n" in 'Files\ncf_model.pth' is a newline character.
    model.load_state_dict(torch.load(r'Files\ncf_model.pth'))
    model.eval()

    # SECURITY: pickle.load can execute arbitrary code, so only load encoder
    # files that this project wrote itself.
    with open(r'Files\genre_encoder.pkl', 'rb') as f:
        genre_encoder = pickle.load(f)
    with open(r'Files\language_encoder.pkl', 'rb') as f:
        language_encoder = pickle.load(f)
    return model, genre_encoder, language_encoder

In [89]:
def recommend_top_n(user_id, model, genre_id, language_id, age, gender, num_items, n=20):
    """Score every item for one user and return the best ``n`` item indices.

    Args:
        user_id, genre_id, language_id, age, gender: Scalar index values for
            the user; each is broadcast to one entry per item.
        model: Callable taking six index tensors of length ``num_items`` and
            returning one score per item.
        num_items: Number of items to score (indices ``0 .. num_items - 1``).
        n: How many items to return.

    Returns:
        numpy.ndarray: Up to ``n`` item indices, highest score first.
    """
    if n <= 0:
        # `[-0:]` would select every item instead of none.
        return np.empty(0, dtype=np.int64)

    item_vec = torch.arange(num_items)

    def broadcast(value):
        # Embedding lookups need int64 indices; without an explicit dtype a
        # float fill value (e.g. an age read from a float column) would
        # produce a float tensor.
        return torch.full((num_items,), int(value), dtype=torch.long)

    with torch.no_grad():
        scores = model(
            broadcast(user_id),
            item_vec,
            broadcast(genre_id),
            broadcast(language_id),
            broadcast(age),
            broadcast(gender),
        )
        # reshape(-1) stays 1-D even when only a single item is scored.
        predictions = scores.reshape(-1).numpy()

    top_n_indices = np.argsort(predictions)[-n:][::-1].copy()
    return item_vec.numpy()[top_n_indices]

In [None]:
def filter_with_mood(user_id, user_top_20, db):
    """Pick the candidate tracks closest to the user's stored mood.

    Each candidate's ``energy`` / ``valence`` pair is projected onto one PCA
    component; the user's ``mood_energy`` / ``mood_valence`` point is projected
    the same way and its nearest neighbours are returned.

    Args:
        user_id: Id used to look up the user's document in ``user_activity``.
        user_top_20: Array of candidate track ids (must support ``.tolist()``).
        db: Database handle exposing ``songs`` and ``user_activity``.

    Returns:
        list: Up to 10 track ids, nearest to the mood point first.

    Raises:
        ValueError: If the user has no document in ``user_activity``.
    """
    songs_collection = db['songs']
    activity_collection = db['user_activity']

    song_cursor = songs_collection.find(
        {"track_id": {"$in": user_top_20.tolist()}},
        {"track_id": 1, "energy": 1, "valence": 1}
    )
    top_20_songs = list(song_cursor)
    track_ids = [song['track_id'] for song in top_20_songs]

    # With zero or one candidate there is nothing to rank, and PCA cannot be
    # fitted meaningfully on so few samples.
    if len(track_ids) < 2:
        return track_ids

    user_mood = activity_collection.find_one(
        {"user_id": user_id},
        {"mood_energy": 1, "mood_valence": 1}
    )
    if user_mood is None:
        raise ValueError(f"No mood data found for user_id: {user_id}")

    features = np.array([[song['energy'], song['valence']] for song in top_20_songs])
    pca = PCA(n_components=1)
    reduced_features = pca.fit_transform(features)
    user_mood_point = pca.transform([[user_mood["mood_energy"], user_mood["mood_valence"]]])

    # Asking for more neighbours than fitted samples raises, so cap at the
    # number of songs actually found. The local is not called `nn`, which
    # would shadow the torch.nn alias.
    neighbour_index = NearestNeighbors(n_neighbors=min(10, len(track_ids)))
    neighbour_index.fit(reduced_features)
    _distances, indices = neighbour_index.kneighbors(user_mood_point)

    return [track_ids[i] for i in indices.flatten()]



In [None]:
# Dimensions used to rebuild the network before loading the saved weights.
# NOTE(review): presumably these must equal the sizes used at training time — confirm.
n_users = 999
num_items = 395386
n_factors = 20
n_genres=18
n_languages = 1


99
199
299


In [None]:
# Restore the trained network (in eval mode) together with the two label encoders.
model, genre_encoder, language_encoder = load_model_and_encoders(
    n_users, num_items, n_factors, n_genres, n_languages
)



  model.load_state_dict(torch.load('ncf_model.pth'))


In [None]:
def retrieve_user_info(user_id):
    """Fetch one user's demographic and preference fields from MongoDB.

    Looks the user up in the ``users`` and ``user_activity`` collections via
    the module-level ``db`` handle and reads the first matching row of each.

    Returns:
        tuple: ``(age, gender, preferred_language, preferred_genre)``.

    Raises:
        ValueError: If either collection has no document for ``user_id``.
    """
    query = {"user_id": user_id}
    user_rows = pd.DataFrame(list(db["users"].find(query)))
    activity_rows = pd.DataFrame(list(db["user_activity"].find(query)))

    found_everything = not user_rows.empty and not activity_rows.empty
    if not found_everything:
        raise ValueError(f"No data found for user_id: {user_id}")

    profile = user_rows.iloc[0]
    activity = activity_rows.iloc[0]
    return (
        profile["user_age"],
        profile["user_gender"],
        activity["preferred_language"],
        activity["preferred_genre"],
    )

# Example user: read the profile fields needed as model inputs.
user_id = 2
age, gender, preferred_language, preferred_genre = retrieve_user_info(user_id)

# transform() works on sequences, so wrap the single label and take element 0 of the result.
genre_id = genre_encoder.transform([preferred_genre])[0]
language_id = language_encoder.transform([preferred_language])[0]

69


In [111]:
# Two-stage recommendation: model scores pick the top candidates (n defaults to 20),
# then the mood filter narrows them down.
top_20_songs_per_user = recommend_top_n(user_id, model, genre_id, language_id, age, gender, num_items)
final_10_songs = filter_with_mood(user_id, top_20_songs_per_user, db)
print("Final 10 Recommended Songs for User", user_id, ":", final_10_songs)

Final 10 Recommended Songs for User 2 : [36083, 154381, 154382, 165112, 154340, 108432, 154365, 36077, 36074, 154383]


RETRAIN

In [None]:
import pandas as pd
import numpy as np
from pymongo import MongoClient
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Rebinds the module-level `client` and `db` names to the buffer database.
# NOTE(review): 'MONGODB-URL' and 'BUFFER-DATABASE-NAME' look like placeholders — replace with real values.
client = MongoClient('MONGODB-URL')
db = client['BUFFER-DATABASE-NAME']

In [None]:
def retrieve_all_new_data():
    """Load the buffered collections from MongoDB and build the rating triples.

    Reads ``buffer_ratings``, ``buffer_songs``, ``buffer_user_activity`` and
    ``buffer_users`` through the module-level ``db`` handle.

    Returns:
        tuple: ``(user_ids, track_ids, ratings, songs_data, merged_data)``.
        ``user_ids`` / ``track_ids`` are the row / column *positions* of every
        non-zero cell of the user x track rating matrix, ``ratings`` holds the
        matching cell values, ``songs_data`` is the buffered songs collection
        as a DataFrame, and ``merged_data`` is the ratings table left-joined
        with the activity and user tables on ``user_id``.
    """
    ratings_data = pd.DataFrame(list(db["buffer_ratings"].find()))
    songs_data = pd.DataFrame(list(db["buffer_songs"].find()))
    activity_data = pd.DataFrame(list(db["buffer_user_activity"].find()))
    user_data = pd.DataFrame(list(db["buffer_users"].find()))

    # Cells without a rating are filled with 0 and therefore dropped by nonzero().
    user_song_ratings = pd.pivot_table(
        ratings_data, values='rating', index='user_id', columns='track_id'
    ).fillna(0)
    rating_matrix = user_song_ratings.to_numpy()

    user_ids, track_ids = np.nonzero(rating_matrix)
    # One vectorised lookup instead of a Python-level `.iloc` call per cell.
    ratings = rating_matrix[user_ids, track_ids]

    # NOTE(review): merged_data has one row per ratings document, while the
    # triples follow the pivot's row/column order — confirm that callers pairing
    # them row by row really get aligned rows.
    merged_data = ratings_data.merge(activity_data, on="user_id", how="left").merge(user_data, on="user_id", how="left")

    return user_ids, track_ids, ratings, songs_data, merged_data

In [None]:
class RatingDataset(torch.utils.data.Dataset):
    """Dataset that serves rating rows straight from the given columns.

    The columns are stored as passed in, without any conversion; the length is
    taken from ``user_ids``.
    """

    def __init__(self, user_ids, track_ids, ratings, genres, languages, ages, genders):
        self.user_ids, self.track_ids, self.ratings = user_ids, track_ids, ratings
        self.genres, self.languages = genres, languages
        self.ages, self.genders = ages, genders

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        """Return ``(user, track, rating, genre, language, age, gender)`` at ``idx``."""
        columns = (
            self.user_ids,
            self.track_ids,
            self.ratings,
            self.genres,
            self.languages,
            self.ages,
            self.genders,
        )
        return tuple(column[idx] for column in columns)

In [None]:
def _grow_embedding(embedding, min_rows):
    """Return an embedding with at least ``min_rows`` rows, keeping the learned rows."""
    known_rows = embedding.num_embeddings
    if min_rows <= known_rows:
        return embedding
    old_weight = embedding.weight
    # New rows keep nn.Embedding's default random initialisation.
    grown = nn.Embedding(min_rows, embedding.embedding_dim).to(
        device=old_weight.device, dtype=old_weight.dtype
    )
    with torch.no_grad():
        grown.weight[:known_rows] = old_weight
    grown.weight.requires_grad = old_weight.requires_grad
    return grown


def fine_tune_model(model, new_user_ids, new_track_ids, new_ratings, new_songs_data, new_merged_data):
    """Fine-tune only the output layer of ``model`` on newly buffered ratings.

    All parameters are frozen except ``model.output``. The user and song
    embedding tables (``model.user_embeddings`` / ``model.song_embeddings``)
    are enlarged with randomly initialised rows so that every new index has a
    row; rows that already exist are kept.

    Args:
        model: Module exposing ``user_embeddings``, ``song_embeddings`` and
            ``output``.
        new_user_ids: Integer user indices, one per rating.
        new_track_ids: Integer track indices, one per rating.
        new_ratings: Rating values, one per (user, track) pair.
        new_songs_data: DataFrame with a ``track_id`` column, or an iterable
            of dicts with a ``track_id`` key; may be empty or ``None``.
        new_merged_data: Mapping / DataFrame providing ``preferred_genre``,
            ``preferred_language``, ``user_age`` and ``user_gender``.

    Returns:
        The same ``model`` object, updated in place.
    """
    for param in model.parameters():
        param.requires_grad = False
    for param in model.output.parameters():
        param.requires_grad = True

    # Arrays and DataFrames have no unambiguous truth value, so test sizes
    # explicitly instead of writing `if new_user_ids:`.
    user_index = np.asarray(new_user_ids)
    track_index = np.asarray(new_track_ids)

    if user_index.size:
        model.user_embeddings = _grow_embedding(model.user_embeddings, int(user_index.max()) + 1)

    if isinstance(new_songs_data, pd.DataFrame):
        # Iterating a DataFrame yields column names, so read the column directly.
        has_ids = 'track_id' in new_songs_data.columns
        song_ids = new_songs_data['track_id'].tolist() if has_ids else []
    elif new_songs_data is None:
        song_ids = []
    else:
        song_ids = [song['track_id'] for song in new_songs_data]
    # NOTE(review): assumes track ids are integers usable as embedding rows — confirm.
    highest_track = max([int(t) for t in song_ids] + [int(t) for t in track_index.tolist()], default=-1)
    # Grown before training so that ratings on new tracks have a row to look up.
    model.song_embeddings = _grow_embedding(model.song_embeddings, highest_track + 1)

    if user_index.size == 0:
        # Nothing to train on; a shuffling DataLoader rejects an empty dataset.
        return model

    def column(name):
        # Positional array, so lookups do not depend on the DataFrame index.
        return np.asarray(new_merged_data.get(name, []))

    # NOTE(review): genre / language are passed through as stored; they must
    # already be in the numeric form the model's forward() expects — confirm.
    new_train_dataset = RatingDataset(
        user_index, track_index,
        # float32 so the targets match the dtype of the model's predictions.
        np.asarray(new_ratings, dtype=np.float32),
        column('preferred_genre'),
        column('preferred_language'),
        column('user_age'),
        column('user_gender')
    )
    new_train_loader = DataLoader(new_train_dataset, batch_size=256, shuffle=True)

    criterion = nn.MSELoss()
    # Only hand the optimizer the parameters that are still trainable.
    trainable = [param for param in model.parameters() if param.requires_grad]
    optimizer = optim.Adam(trainable, lr=0.001)

    model.train()
    for user_id, item_id, rating, genre_id, language_id, age, gender in new_train_loader:
        optimizer.zero_grad()
        prediction = model(user_id, item_id, genre_id, language_id, age, gender)
        # squeeze(-1) keeps the batch axis even for a final batch of one sample.
        loss = criterion(prediction.squeeze(-1), rating)
        loss.backward()
        optimizer.step()

    return model

In [None]:
def save_model(model, file_path):
    """Serialise the model's ``state_dict`` to ``file_path`` via ``torch.save``."""
    weights = model.state_dict()
    torch.save(weights, file_path)

def load_model(model, file_path):
    """Load a saved ``state_dict`` into ``model`` and switch it to eval mode.

    Returns:
        The same ``model`` instance that was passed in.
    """
    saved_state = torch.load(file_path)
    model.load_state_dict(saved_state)
    model.eval()
    return model

In [None]:
class RecommendationModel(nn.Module):
    """Single linear layer over user/song embeddings and side features.

    The input to the output layer is the concatenation of the user embedding,
    the song embedding, a one-hot genre block (``n_genres`` wide), a one-hot
    language block (``n_languages`` wide) and one column each for age and
    gender.

    NOTE(review): the original layer width omitted the age and gender columns
    and ``forward`` concatenated 1-D id tensors with 2-D embeddings, so it
    could not run. The one-hot / scalar-column layout here is the reading
    implied by the original width — confirm it matches the intended design.
    """

    def __init__(self, n_users, n_items, n_factors, n_genres, n_languages):
        super().__init__()
        self.user_embedding_dim = n_factors
        self.song_embedding_dim = n_factors
        self.n_genres = n_genres
        self.n_languages = n_languages

        self.user_embeddings = nn.Embedding(n_users, self.user_embedding_dim)
        self.song_embeddings = nn.Embedding(n_items, self.song_embedding_dim)

        # +2: age and gender each contribute one scalar column.
        feature_width = (
            self.user_embedding_dim + self.song_embedding_dim + n_genres + n_languages + 2
        )
        self.output = nn.Linear(feature_width, 1)

    @staticmethod
    def _one_hot(ids, num_classes):
        """Turn 1-D class ids into a float one-hot block; 2-D input passes through."""
        if ids.dim() == 1:
            return nn.functional.one_hot(ids.long(), num_classes=num_classes).float()
        return ids.float()

    @staticmethod
    def _as_column(values):
        """Reshape per-sample values to a float ``(batch, k)`` block."""
        return values.reshape(len(values), -1).float()

    def forward(self, user_id, item_id, genre_id, language_id, age, gender):
        """Return a ``(batch, 1)`` tensor of unbounded scores."""
        user_embed = self.user_embeddings(user_id)
        song_embed = self.song_embeddings(item_id)

        features = torch.cat(
            [
                user_embed,
                song_embed,
                self._one_hot(genre_id, self.n_genres),
                self._one_hot(language_id, self.n_languages),
                self._as_column(age),
                self._as_column(gender),
            ],
            dim=1,
        )

        return self.output(features)


In [None]:
# Load the buffered data and derive the model dimensions from it.
new_user_ids, new_track_ids, new_ratings, new_songs_data, new_merged_data = retrieve_all_new_data()

# NOTE(review): these sizes are distinct-value counts in the buffer data only —
# confirm they match the shapes stored in the checkpoint that is loaded afterwards.
n_users = len(new_merged_data['user_id'].unique())
n_items = len(new_songs_data['track_id'].unique())
n_factors = 20
n_genres = len(new_merged_data['preferred_genre'].unique())
n_languages = len(new_merged_data['preferred_language'].unique())

In [None]:
# Rebuild the network, restore the saved weights, fine-tune on the buffered
# data and write the result back to the same checkpoint file.
# Raw strings keep the backslash literal; in an ordinary literal the "\n" in
# 'Files\ncf_model.pth' is a newline character.
# NOTE(review): save_model_and_encoders writes this path from an
# NCFWithDemographics instance, whose parameter names differ from
# RecommendationModel's — confirm load_state_dict accepts the checkpoint.
model = RecommendationModel(n_users, n_items, n_factors, n_genres, n_languages)
model = load_model(model, r'Files\ncf_model.pth')

model = fine_tune_model(model, new_user_ids, new_track_ids, new_ratings, new_songs_data, new_merged_data)
save_model(model, r'Files\ncf_model.pth')
