In [9]:
# TODO: move this dataset class into a separate file (module).
import numpy as np
import torch
from torch.utils.data import Dataset


class RatingsTrainDataset(Dataset):
    """Implicit-feedback training set built with random negative sampling.

    Every distinct (user, item) pair found in ``ratings`` becomes one positive
    sample (label 1). For each positive, ``num_negatives`` items that the user
    has no recorded interaction with are drawn at random (with replacement)
    from ``all_product_ids`` and added as negative samples (label 0).

    Args:
        ratings: Mapping-like table (e.g. a pandas DataFrame) exposing the
            ``'user_id_int'`` and ``'product_id_int'`` columns.
        all_product_ids: 1-D sequence of candidate item ids for negatives.
        num_negatives: Negatives drawn per positive pair (default 4).
    """

    def __init__(self, ratings, all_product_ids, num_negatives=4):
        self.users, self.items, self.labels = self.get_dataset(
            ratings, all_product_ids, num_negatives
        )

    def __len__(self):
        """Return the total number of samples (positives plus negatives)."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` tensors stored at ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_dataset(self, ratings, all_product_ids, num_negatives=4):
        """Build the user, item and label tensors.

        Args:
            ratings: Table exposing ``'user_id_int'`` and ``'product_id_int'``.
            all_product_ids: Candidate item ids used for negative sampling.
            num_negatives: Negatives drawn per positive pair.

        Returns:
            Three 1-D ``torch.long`` tensors ``(users, items, labels)`` of
            equal length; ``labels`` is 1 for observed pairs and 0 for
            sampled negatives.
        """
        users, items, labels = [], [], []
        user_item_set = set(zip(ratings['user_id_int'], ratings['product_id_int']))

        candidates = np.asarray(all_product_ids)
        candidate_ids = set(candidates.tolist())

        # Count, per user, how many of the candidates are already positives.
        # A user who has interacted with every candidate has no valid
        # negative, and rejection sampling for them would never terminate.
        seen_candidates = {}
        for u, i in user_item_set:
            if i in candidate_ids:
                seen_candidates[u] = seen_candidates.get(u, 0) + 1

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)

            if seen_candidates.get(u, 0) >= len(candidate_ids):
                # No un-interacted candidate exists for this user.
                continue

            for _ in range(num_negatives):
                negative_item = np.random.choice(candidates)
                # Re-draw until the item is one the user has not interacted with.
                while (u, negative_item) in user_item_set:
                    negative_item = np.random.choice(candidates)
                users.append(u)
                items.append(negative_item)
                labels.append(0)

        # Explicit integer dtype: these ids are used as embedding indices.
        return (
            torch.tensor(users, dtype=torch.long),
            torch.tensor(items, dtype=torch.long),
            torch.tensor(labels, dtype=torch.long),
        )

In [10]:
# TODO: move this model class into a separate file (module).
import torch.nn as nn
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class NCF(pl.LightningModule):
    """Neural collaborative filtering model for implicit feedback.

    A user id and an item id are each looked up in an embedding table; the two
    embeddings are concatenated and passed through two ReLU-activated linear
    layers and a final single-unit linear layer. The sigmoid of that output is
    the predicted interaction probability.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        ratings: Ratings table handed to ``RatingsTrainDataset`` when the
            training dataloader is built.
        all_product_ids: Candidate item ids handed to ``RatingsTrainDataset``.
        embedding_dim: Width of each embedding vector (default 8).
        batch_size: Batch size of the training dataloader (default 512).
    """

    def __init__(self, num_users, num_items, ratings, all_product_ids,
                 embedding_dim=8, batch_size=512):
        super().__init__()
        # int() so numpy integer sizes (e.g. from ``Series.max() + 1``) are accepted.
        self.user_embedding = nn.Embedding(num_embeddings=int(num_users),
                                           embedding_dim=embedding_dim)
        self.item_embedding = nn.Embedding(num_embeddings=int(num_items),
                                           embedding_dim=embedding_dim)
        # fc1 consumes the concatenated user and item embeddings.
        self.fc1 = nn.Linear(in_features=2 * embedding_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.output = nn.Linear(in_features=32, out_features=1)
        self.ratings = ratings
        self.all_product_ids = all_product_ids
        self.batch_size = batch_size

    def _logits(self, user_input, item_input):
        """Return the raw (pre-sigmoid) score for each (user, item) pair."""
        user_embedded = self.user_embedding(user_input)
        item_embedded = self.item_embedding(item_input)

        vector = torch.cat([user_embedded, item_embedded], dim=-1)
        vector = torch.relu(self.fc1(vector))
        vector = torch.relu(self.fc2(vector))

        return self.output(vector)

    def forward(self, user_input, item_input):
        """Predict interaction probabilities.

        Args:
            user_input: Integer tensor of user ids.
            item_input: Integer tensor of item ids, same shape as ``user_input``.

        Returns:
            Tensor of values in [0, 1] with a trailing dimension of size 1.
        """
        return torch.sigmoid(self._logits(user_input, item_input))

    def training_step(self, batch, batch_idx):
        """Compute the binary cross-entropy loss for one batch.

        The loss is taken on the logits rather than on the sigmoid output,
        which is the numerically stable form of the same objective.
        """
        user_input, item_input, labels = batch
        logits = self._logits(user_input, item_input)
        return nn.functional.binary_cross_entropy_with_logits(
            logits, labels.view(-1, 1).float()
        )

    def configure_optimizers(self):
        """Use Adam with its default hyper-parameters."""
        return torch.optim.Adam(self.parameters())

    def train_dataloader(self):
        """Build the training dataloader over a freshly sampled dataset.

        The dataset stores each positive immediately followed by its sampled
        negatives, so batches are shuffled to avoid feeding the optimiser
        that fixed ordering.
        """
        dataset = RatingsTrainDataset(self.ratings, self.all_product_ids)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=True,
                          num_workers=0)

In [11]:
# TODO: move this model-building function into a separate file (module).
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import pickle

def build_filtering_model(product_data_path, rating_data_path, company_id):
    """Train and persist the content-based and collaborative-filtering models.

    Reads the product and rating CSV files, encodes user and product ids as
    integers, builds a TF-IDF similarity matrix over product text, trains an
    ``NCF`` model on the ratings, and pickles the id mappings and both models
    into the ``company_id`` directory.

    Args:
        product_data_path: CSV with at least the ``product_id``, ``title`` and
            ``description`` columns.
        rating_data_path: CSV with at least the ``user_id``, ``product_id`` and
            ``rating`` columns.
        company_id: Name of the output directory (created if missing).

    Returns:
        None. Artifacts are written to ``{company_id}/*.pkl``.

    Raises:
        ValueError: If no rating refers to a product present in the product file.
    """
    # loading data
    product_data = pd.read_csv(product_data_path)
    rating_data = pd.read_csv(rating_data_path)

    # preprocessing data
    user_encoder = LabelEncoder()
    product_encoder = LabelEncoder()
    rating_data['user_id_int'] = user_encoder.fit_transform(rating_data['user_id'])
    product_data['product_id_int'] = product_encoder.fit_transform(product_data['product_id'])

    # unique() keeps first-appearance order for both columns, so the raw and
    # encoded values line up pairwise.
    user_mapping = dict(zip(rating_data['user_id'].unique(),
                            rating_data['user_id_int'].unique()))
    reverse_user_mapping = {mapped: original for original, mapped in user_mapping.items()}
    product_mapping = dict(zip(product_data['product_id'].unique(),
                               product_data['product_id_int'].unique()))
    reverse_product_mapping = {mapped: original for original, mapped in product_mapping.items()}

    rating_data['product_id_int'] = rating_data['product_id'].map(product_mapping)

    # Ratings for products missing from the product file have no integer id.
    # Left in place they make the id column non-integer, which breaks both
    # the embedding size computation and the training tensors.
    unknown_product = rating_data['product_id_int'].isna()
    if unknown_product.any():
        print(f'dropped {int(unknown_product.sum())} ratings for products missing from the product data')
        rating_data = rating_data.loc[~unknown_product].copy()
    if rating_data.empty:
        raise ValueError('no ratings refer to a product present in the product data')
    rating_data['product_id_int'] = rating_data['product_id_int'].astype('int64')

    print('preprocessing completed')

    # Building Content Based Filtering Model
    # Fill each text column before concatenating; otherwise a missing
    # description turns the whole string into NaN and the title is lost.
    product_data['full_description'] = (
        product_data['title'].fillna('') + " " + product_data['description'].fillna('')
    ).str.strip()
    # min_df=1 is the lowest valid integer document frequency; an integer 0
    # is rejected by scikit-learn's parameter validation in recent releases.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(product_data['full_description'])
    # NOTE(review): this is a dense (n_products x n_products) matrix; memory
    # grows quadratically with the catalogue size.
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    product_data = product_data.reset_index()
    indices = pd.Series(product_data.index, index=product_data['product_id_int'])

    print('content based modeling completed')

    # Building Collaborative Filtering Model
    # Negative sampling is done by RatingsTrainDataset, which
    # NCF.train_dataloader builds, so nothing is sampled here.
    train_ratings = rating_data[['user_id_int', 'product_id_int', 'rating']]
    all_product_ids = rating_data['product_id_int'].unique()

    # One embedding row per encoded user, including users whose ratings were
    # all dropped above, so every id in user_mapping stays in range.
    num_users = len(user_encoder.classes_)
    num_items = int(rating_data['product_id_int'].max()) + 1

    model = NCF(num_users, num_items, train_ratings, all_product_ids)
    trainer = pl.Trainer(max_epochs=5, logger=True, accelerator='auto')
    trainer.fit(model)
    user_interacted_items = rating_data.groupby('user_id_int')['product_id_int'].apply(list).to_dict()
    print('collaborative filtering modeling completed')

    # store mapping data and both models
    os.makedirs(company_id, exist_ok=True)

    # NOTE(review): the whole NCF module is pickled, so loading it requires
    # the same NCF class definition to be importable.
    artifacts = {
        'user_mapping': user_mapping,
        'reverse_user_mapping': reverse_user_mapping,
        'product_mapping': product_mapping,
        'reverse_product_mapping': reverse_product_mapping,
        'content_based_model': {'cosine_sim': cosine_sim, 'series': indices},
        'collaborative_filtering_model': {
            'product_ids': all_product_ids,
            'model': model,
            'interacted_items': user_interacted_items,
        },
    }
    for name, payload in artifacts.items():
        with open(f'{company_id}/{name}.pkl', 'wb') as file:
            pickle.dump(payload, file)
        print(f'Created {company_id}/{name}.pkl')

    print('Models Created Successfully')

In [12]:
# Build and persist both recommendation models for one company: reads
# 'products.csv' and 'ratings.csv' and writes the pickled artifacts into the
# 'xjkgkjshl/' directory (the third argument is the company id).
build_filtering_model('products.csv', 'ratings.csv', 'xjkgkjshl')

preprocessing completed
content based modeling completed


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: /Users/suyashnehete/Suyash/PCCOE/final year/project/Final/lightning_logs

  | Name           | Type      | Params
---------------------------------------------
0 | user_embedding | Embedding | 339 K 
1 | item_embedding | Embedding | 440 K 
2 | fc1            | Linear    | 1.1 K 
3 | fc2            | Linear    | 2.1 K 
4 | output         | Linear    | 33    
---------------------------------------------
783 K     Trainable params
0         Non-trainable params
783 K     Total params
3.136     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


collaborative filtering modeling completed
Created xjkgkjshl/user_mapping.pkl
Created xjkgkjshl/reverse_user_mapping.pkl
Created xjkgkjshl/product_mapping.pkl
Created xjkgkjshl/reverse_product_mapping.pkl
Created xjkgkjshl/content_based_model.pkl
Created xjkgkjshl/collaborative_filtering_model.pkl
Models Created Successfully
