In [1]:
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA , KernelPCA
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
import torch
import torch.optim as optim

In [56]:
def download_and_unzip(url, extract_to='.'):
    """
    Downloads a ZIP file from the specified URL and extracts its contents to a specified location.
    
    Args:
        url: string, the URL of the ZIP file to download.
        extract_to: string, the path to the directory where the contents of the ZIP file should be extracted.
                    Defaults to the current working directory.
                    
    Returns:
        None
    """
    http_response = urlopen(url)
    zipfile = ZipFile(BytesIO(http_response.read()))
    zipfile.extractall(path=extract_to)

url ="https://files.grouplens.org/datasets/movielens/ml-20m.zip"
download_and_unzip(url)

In [2]:
data = pd.read_csv("ml-20m/ratings.csv")

In [4]:
def prepare_dataset(df):
    '''
    Function for convert given dataset to our objective: Given a pandas DataFrame, 
    returns converted pandas DataFrame.
    Args:
        df: a Pandas DataFrame containing the dataset.
    
    Returns:
        The modified DataFrame.
    '''
    subtitude = {0.5:0,
             1.0:0,
             2.0:0,
             3.0:1,
             3.5:1,
             5.0:2,
             4.0:2,
             4.5:2,
             2.5:1,
             1.5:0
             }
    df = df.rename({"userId":"user_id","movieId":"sku","rating":"type"},axis=1)
    df = df.drop("timestamp",axis=1)
    df["type"] = df["type"].map(subtitude)
    df.sku = "MYO-" + df.sku.astype(str)
    return df

In [84]:
df = prepare_dataset(data)[:5000]

In [85]:
df.columns

Index(['user_id', 'sku', 'type'], dtype='object')

In [86]:
df["type"].value_counts()

2    2829
1    1596
0     575
Name: type, dtype: int64

In [87]:
print("unique users: ",df.user_id.nunique(),"unique products skus: ",df.sku.nunique())

unique users:  50 unique products skus:  2090


# Occurrence Matrix and Nearest Neighbor

In [9]:
def interactions_as_embeddings(df):
    """
    Transforms user-product interactions in a DataFrame into product embeddings.
    
    Args:
        df: a Pandas DataFrame containing user-product interactions, with columns "sku", "user_id" and  "type".
        
    Returns:
        A DataFrame containing product embeddings, with columns "sku" and "embedding".
    """
    table = pd.pivot_table(df, values='type', index=["sku"],columns=['user_id'])
    table = table.fillna(0)
    sparsity = sum((table == 0).astype(int).sum())/table.size
    table["embedding"]=table.apply(lambda x: x.values,axis=1)
    product_embeddings = pd.DataFrame(table.to_records())[["sku","embedding"]]
    return product_embeddings

In [10]:
embeddings = interactions_as_embeddings(df)

In [16]:
def map_indices_to_skus(index_series,index_dict):
    """
    Maps a series of indices to a series of SKUs using a provided mapping dictionary.
    
    Parameters:
        index_series (list): A series of indices to map to SKUs.
        index_dict (dict): A dictionary that maps indices to SKUs.
        
    Returns:
        List: A series of SKUs corresponding to the provided indices.
    """
    sku_series = []
    for i in index_series:
        sku_series.append(index_dict[i])
    return sku_series

In [17]:
def find_nearest_neighbors(embeddings):
    """
    Finds the nearest neighbors for a series of embeddings using k-nearest neighbors algorithm.
    
    Parameters:
        embeddings (pandas.DataFrame): A DataFrame containing the embeddings to find neighbors for.
        
    Returns:
        tuple: A tuple containing two NumPy arrays, the first containing distances to the nearest neighbors and the second containing the indices of the nearest neighbors.
    """
    index_dict = embeddings.reset_index()[["index","sku"]].set_index('index').T.to_dict("records")
    nbrs = NearestNeighbors(n_neighbors=11, algorithm='ball_tree').fit(embeddings.embedding.tolist())
    distances, indices = nbrs.kneighbors(embeddings.embedding.tolist())
    return indices, index_dict
indices, index_dict = find_nearest_neighbors(embeddings)


In [18]:
def generate_recommendations(indices,embeddings,index_dict):
    """
    Generates product recommendations based on nearest neighbors of a given set of embeddings.
    
    Parameters:
        indices (pandas.DataFrame): A DataFrame of nearest neighbor indices for the embeddings.
        embeddings (pandas.DataFrame): A DataFrame of the embeddings.
        index_dict (dict): A dictionary mapping indices to product SKUs.
        
    Returns:
        pandas.DataFrame: A DataFrame of the recommended product SKUs.
    """
    recommendations = pd.DataFrame(indices).reset_index()
    recommendation_df = embeddings.reset_index().merge(recommendations,how="left",on = "index")
    recommendation_df=recommendation_df[["sku",1,2,3,4,5,6,7,8,9,10]]
    recommendations_skus = recommendation_df.drop("sku",axis=1).apply(lambda x: map_indices_to_skus(x,index_dict[0]),axis=0)
    final_recommendatons = embeddings.reset_index().merge(recommendations_skus.reset_index(),how="left",on = "index")[["sku",1,2,3,4,5,6,7,8,9,10]]
    return final_recommendatons


In [19]:
indices, index_dict = find_nearest_neighbors(embeddings)
recommendatons = generate_recommendations(indices,embeddings,index_dict)

In [21]:
recommendatons.head()

Unnamed: 0,sku,1,2,3,4,5,6,7,8,9,10
0,MYO-1,MYO-442,MYO-2797,MYO-1242,MYO-610,MYO-527,MYO-1221,MYO-173,MYO-160,MYO-1230,MYO-2657
1,MYO-10,MYO-733,MYO-165,MYO-6,MYO-21,MYO-236,MYO-161,MYO-207,MYO-349,MYO-153,MYO-19
2,MYO-1009,MYO-1350,MYO-1262,MYO-1920,MYO-1243,MYO-1201,MYO-1994,MYO-1348,MYO-112,MYO-1217,MYO-1009
3,MYO-1017,MYO-2390,MYO-1357,MYO-2080,MYO-1449,MYO-1256,MYO-1253,MYO-1777,MYO-1265,MYO-1017,MYO-1271
4,MYO-1020,MYO-1597,MYO-1183,MYO-1517,MYO-1343,MYO-1101,MYO-1573,MYO-1466,MYO-1301,MYO-1408,MYO-1441


pca kernel embeddings

In [23]:
def transform_embeddings_with_kernel_pca(embeddings,n_components = 25,kernel = 'sigmoid'):
    """
    Transforms a series of embeddings using Kernel PCA.

    Parameters:
        embeddings (pandas.DataFrame): A DataFrame containing the embeddings to transform.
        n_components (int): The number of components to keep after transformation.
        kernel (str): The kernel to use for the transformation.

    Returns:
        pandas.DataFrame: A DataFrame containing the transformed embeddings.
    """
    transformer = KernelPCA(n_components = n_components, kernel = kernel)
    X_transformed = transformer.fit_transform(embeddings.embedding.tolist())
    df1 = pd.DataFrame()
    df1["embedding"]=X_transformed.tolist()
    df1.reset_index(inplace=True)
    df1 = df1.apply(np.array)
    embeddings = embeddings.reset_index().merge(df1,how="left",on = "index")[["sku","embedding_y"]]
    pca_embeddings = embeddings.rename(columns = {"embedding_y":"embedding"})
    return pca_embeddings

In [24]:
pca_embeddings = transform_embeddings_with_kernel_pca(embeddings)

In [25]:
indices, index_dict = find_nearest_neighbors(pca_embeddings)
recommendatons = generate_recommendations(indices,embeddings,index_dict)

In [26]:
recommendatons.head()

Unnamed: 0,sku,1,2,3,4,5,6,7,8,9,10
0,MYO-1,MYO-442,MYO-610,MYO-1242,MYO-1221,MYO-2797,MYO-2657,MYO-160,MYO-527,MYO-329,MYO-1230
1,MYO-10,MYO-165,MYO-6,MYO-733,MYO-344,MYO-153,MYO-592,MYO-372,MYO-550,MYO-551,MYO-48
2,MYO-1009,MYO-112,MYO-1246,MYO-1243,MYO-1217,MYO-1920,MYO-1994,MYO-1848,MYO-1750,MYO-1348,MYO-2253
3,MYO-1017,MYO-1256,MYO-1253,MYO-3699,MYO-3671,MYO-2080,MYO-2390,MYO-2405,MYO-2403,MYO-1777,MYO-348
4,MYO-1020,MYO-1183,MYO-1101,MYO-1301,MYO-1343,MYO-1894,MYO-1888,MYO-2013,MYO-1932,MYO-1466,MYO-1441


# Graph

In [88]:
import networkx as nx


In [89]:
def create_graph(df):
    """
    Creates a graph from a DataFrame of user-product interactions.
    
    Parameters:
        df (pandas.DataFrame): A DataFrame of user-product interactions, with columns "user_id", "sku", and "type".
        
    Returns:
        networkx.Graph: A graph of the user-product interactions.
    """

    return nx.convert_matrix.from_pandas_edgelist(df,'user_id','sku',edge_attr="type", create_using=nx.Graph())


In [90]:
def graph_relations(source,G_users):
    """
    Generates a list of recommended products based on the relations of a given user in a user-product interaction graph.
    
    Parameters:
        source (str): The product SKU to get recommendations for.
        G_users (networkx.Graph): The user-product interaction graph.
        
    Returns:
        list: A list of recommended product SKUs.
    """
    commons_dict = {}
    for e in G_users.neighbors(source):
        for e2 in G_users.edges(e):
            if e2==source:
                continue
            product = e2[1]
            weight = G_users[e2[0]][e2[1]]["type"]
            if product not in commons_dict.keys():
                commons_dict[product] = weight
            else:
                commons_dict[product] += weight
    return sorted(commons_dict, key=commons_dict.get, reverse=True)[:10]

In [91]:
def graph_recommendations(df):
    """
    Generates product recommendations based on a user-product interaction graph.
    
    Parameters:
        df (pandas.DataFrame): A DataFrame of user-product interactions.
        
    Returns:
        pandas.DataFrame: A DataFrame of product SKUs and their corresponding recommendations.
    """

    G_users = create_graph(df)
    product_sku = pd.DataFrame(list(set(df.sku))).rename(columns={0:"sku"})
    product_sku["recommendations"]=product_sku.sku.apply(lambda x: graph_relations(x,G_users))
    
    for i in range(1,11):
        product_sku[i] = product_sku.recommendations.apply(lambda x:x[i-1])
    
    product_sku.drop(columns = "recommendations",inplace=True)
    return product_sku

In [92]:
recommendations = graph_recommendations(df)

In [93]:
recommendations

Unnamed: 0,sku,1,2,3,4,5,6,7,8,9,10
0,MYO-6264,MYO-1,MYO-32,MYO-39,MYO-110,MYO-150,MYO-158,MYO-160,MYO-165,MYO-170,MYO-172
1,MYO-1632,MYO-1183,MYO-1407,MYO-1416,MYO-1446,MYO-1485,MYO-1608,MYO-1617,MYO-1641,MYO-1661,MYO-1673
2,MYO-33166,MYO-551,MYO-2291,MYO-1653,MYO-3897,MYO-4973,MYO-7153,MYO-33166,MYO-1,MYO-32,MYO-39
3,MYO-3147,MYO-589,MYO-3147,MYO-296,MYO-318,MYO-593,MYO-3996,MYO-50,MYO-1089,MYO-2858,MYO-1240
4,MYO-1214,MYO-260,MYO-1196,MYO-318,MYO-1270,MYO-589,MYO-593,MYO-1214,MYO-1210,MYO-1197,MYO-541
...,...,...,...,...,...,...,...,...,...,...,...
2085,MYO-85,MYO-58,MYO-85,MYO-154,MYO-527,MYO-538,MYO-593,MYO-608,MYO-898,MYO-912,MYO-913
2086,MYO-5349,MYO-1270,MYO-2571,MYO-6377,MYO-260,MYO-356,MYO-3793,MYO-6333,MYO-6539,MYO-60069,MYO-595
2087,MYO-70,MYO-110,MYO-589,MYO-70,MYO-266,MYO-480,MYO-1249,MYO-1580,MYO-47,MYO-50,MYO-293
2088,MYO-4848,MYO-11,MYO-260,MYO-348,MYO-356,MYO-364,MYO-440,MYO-480,MYO-539,MYO-553,MYO-587


In [74]:
for i in range(1,11):
    recommendations[i] = recommendations.recommendations.apply(lambda x:x[i-1])


In [75]:
recommendations

Unnamed: 0,sku,recommendations,1,2,3,4,5,6,7,8,9,10
0,MYO-1080,"[MYO-260, MYO-318, MYO-1036, MYO-1079, MYO-109...",MYO-260,MYO-318,MYO-1036,MYO-1079,MYO-1097,MYO-1196,MYO-1198,MYO-589,MYO-1080,MYO-1136
1,MYO-8482,"[MYO-151, MYO-223, MYO-253, MYO-260, MYO-293, ...",MYO-151,MYO-223,MYO-253,MYO-260,MYO-293,MYO-296,MYO-318,MYO-541,MYO-1036,MYO-1079
2,MYO-2615,"[MYO-1, MYO-32, MYO-50, MYO-175, MYO-223, MYO-...",MYO-1,MYO-32,MYO-50,MYO-175,MYO-223,MYO-260,MYO-316,MYO-318,MYO-329,MYO-457
3,MYO-631,"[MYO-11, MYO-62, MYO-110, MYO-141, MYO-150, MY...",MYO-11,MYO-62,MYO-110,MYO-141,MYO-150,MYO-260,MYO-282,MYO-316,MYO-318,MYO-350
4,MYO-6502,"[MYO-151, MYO-223, MYO-253, MYO-260, MYO-293, ...",MYO-151,MYO-223,MYO-253,MYO-260,MYO-293,MYO-296,MYO-318,MYO-541,MYO-1036,MYO-1079
...,...,...,...,...,...,...,...,...,...,...,...,...
693,MYO-70,"[MYO-3, MYO-62, MYO-70, MYO-110, MYO-260, MYO-...",MYO-3,MYO-62,MYO-70,MYO-110,MYO-260,MYO-266,MYO-480,MYO-541,MYO-589,MYO-908
694,MYO-140,"[MYO-62, MYO-141, MYO-260, MYO-17, MYO-648, MY...",MYO-62,MYO-141,MYO-260,MYO-17,MYO-648,MYO-708,MYO-780,MYO-788,MYO-11,MYO-110
695,MYO-7046,"[MYO-151, MYO-223, MYO-253, MYO-260, MYO-293, ...",MYO-151,MYO-223,MYO-253,MYO-260,MYO-293,MYO-296,MYO-318,MYO-541,MYO-1036,MYO-1079
696,MYO-4848,"[MYO-11, MYO-260, MYO-348, MYO-356, MYO-364, M...",MYO-11,MYO-260,MYO-348,MYO-356,MYO-364,MYO-440,MYO-480,MYO-539,MYO-553,MYO-587


# Deep embedding learning


In [8]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
import torch
import torch.optim as optim


In [42]:
class InteractionDataset(Dataset):
    """
    A dataset for user-product interactions, containing user IDs, product SKUs, and labels indicating the type of interaction.
    """

    def __init__(self,df):
        """
        Initializes the dataset with a DataFrame of interaction data.
        
        Parameters:
            df (pandas.DataFrame): The DataFrame containing the interaction data.
        """
        self.df = df
        self.vocab = pd.DataFrame(list(set(self.df.sku))).reset_index().rename(columns={0:"sku"})

    def __len__(self):
        """
        Returns the number of interaction in the dataset.
        """
        return self.df.shape[0]

    def __getitem__(self,idx):
        """
        Returns the user ID, product SKU, and label for a interaction at a given index.
        
        Parameters:
            idx (int): The index of the interaction to return.
            
        Returns:
            tuple: A tuple containing the user ID, product SKU, and label for the interaction.
        """
        user_id = self.df.iloc[idx].user_id
        product_sku =  self.df.iloc[idx].sku
        product_sku = self.vocab[self.vocab["sku"]==product_sku].index[0]
        label =  self.df.iloc[idx]["type"]
        return (user_id,product_sku),label


In [43]:
class embedding_learning_model(nn.Module):
    """
    A PyTorch model for learning embeddings from user-product pairs.
    """

    def __init__(self,num_user,num_product):
        """
        Initializes the model with the number of users and products.
        
        Parameters:
            num_user (int): The number of users in the dataset.
            num_product (int): The number of products in the dataset.
        """
        super(embedding_learning_model, self).__init__()
        self.user_embedding_layer = nn.Embedding(num_user, 128)
        self.sku_embedding_layer = nn.Embedding(num_product, 128)
        self.linear_relu_stack = nn.Sequential(
                nn.Linear(256, 64),
                nn.ReLU(),
                nn.Linear(64, 3)
            )
    def forward(self,x):
        """
        Performs a forward pass through the model.
        
        Parameters:
            x (tuple): A tuple containing the user ID and product SKU to process.
            
        Returns:
            torch.Tensor: The output class probablities of the model.
        """
        user_id,product_sku = x
        user_embedding = self.user_embedding_layer(user_id)
        sku_embedding = self.sku_embedding_layer(product_sku)
        concat = torch.cat([user_embedding, sku_embedding], dim=0).reshape(1,256)
        x = self.linear_relu_stack(concat)
        return x



In [44]:
def get_ann_loader(df,batch_size):
    """
    Creates a dataset and data loader for training an ANN model.
    
    Parameters:
        df (pandas.DataFrame): A DataFrame of user-product interactions.
        batch_size (int): The batch size for the data loader.
        
    Returns:
        tuple: A tuple containing the dataset and data loader.
    """

    dataset = InteractionDataset(df)
    train_dataloader = DataLoader(dataset, batch_size, shuffle=True)
    return dataset, train_dataloader

In [45]:
dataset, train_dataloader = get_ann_loader(df,1)

In [46]:
print("unique users: ",df.user_id.nunique(),"unique products skus: ",df.sku.nunique())
unique_user = df.user_id.nunique()
unique_product = df.sku.nunique()

unique users:  11 unique products skus:  698


In [47]:
model = embedding_learning_model(unique_user+1,unique_product+1) 


In [48]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [49]:
for epoch in range(5):
    for idx,(x,y) in enumerate(train_dataloader):
        y_hat = model(x)
        loss = criterion(y_hat,y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        break
    break
print(loss)

tensor(1.0820, grad_fn=<NllLossBackward0>)


In [50]:
def generate_ann_embeddings(model,dataset,sku):
    """
    Returns the embedding for a product in a trained model.
    
    Parameters:
        model (EmbeddingLearningModel): The trained model.
        dataset (InteractionDataset): The dataset the model was trained on.
        sku (str): The SKU of the product to get the embedding for.
        
    Returns:
        numpy.ndarray: The embedding for the product.
    """

    sku = dataset.vocab[dataset.vocab["sku"]==sku].index[0]
    embeddings = model.sku_embedding_layer(torch.tensor(sku)).cpu().detach().numpy()
    return embeddings

In [51]:
def get_ann_embeddings(model,df,dataset):
    """
    Generates embeddings for products using an ANN model.
    
    Parameters:
        model (EmbeddingLearningModel): The ANN model.
        df (pandas.DataFrame): A DataFrame of product data.
        dataset (InreractionDataset): The dataset the model was trained on.
        
    Returns:
        pandas.DataFrame: A DataFrame of product SKUs and their corresponding embeddings.
    """

    product_sku = pd.DataFrame(list(set(df.sku))).rename(columns={0:"sku"})
    product_sku["embedding"] = product_sku.sku.apply(lambda x:generate_ann_embeddings(model,dataset,x))
    return product_sku

In [52]:
embeddings = get_ann_embeddings(model,df,dataset)

In [21]:
def map_indices_to_skus(index_series,index_dict):
    """
    Maps a series of indices to a series of SKUs using a provided mapping dictionary.
    
    Parameters:
        index_series (list): A series of indices to map to SKUs.
        index_dict (dict): A dictionary that maps indices to SKUs.
        
    Returns:
        List: A series of SKUs corresponding to the provided indices.
    """
    sku_series = []
    for i in index_series:
        sku_series.append(index_dict[i])
    return sku_series

In [23]:
def find_nearest_neighbors(embeddings):
    """
    Finds the nearest neighbors for a series of embeddings using k-nearest neighbors algorithm.
    
    Parameters:
        embeddings (pandas.DataFrame): A DataFrame containing the embeddings to find neighbors for.
        
    Returns:
        tuple: A tuple containing two NumPy arrays, the first containing distances to the nearest neighbors and the second containing the indices of the nearest neighbors.
    """
    index_dict = embeddings.reset_index()[["index","sku"]].set_index('index').T.to_dict("records")
    nbrs = NearestNeighbors(n_neighbors=11, algorithm='ball_tree').fit(embeddings.embedding.tolist())
    distances, indices = nbrs.kneighbors(embeddings.embedding.tolist())
    return indices, index_dict
indices, index_dict = find_nearest_neighbors(embeddings)


In [28]:
def generate_recommendations(indices,embeddings,index_dict):
    """
    Generates product recommendations based on nearest neighbors of a given set of embeddings.
    
    Parameters:
        indices (pandas.DataFrame): A DataFrame of nearest neighbor indices for the embeddings.
        embeddings (pandas.DataFrame): A DataFrame of the embeddings.
        index_dict (dict): A dictionary mapping indices to product SKUs.
        
    Returns:
        pandas.DataFrame: A DataFrame of the recommended product SKUs.
    """
    recommendations = pd.DataFrame(indices).reset_index()
    recommendation_df = embeddings.reset_index().merge(recommendations,how="left",on = "index")
    recommendation_df=recommendation_df[["sku",1,2,3,4,5,6,7,8,9,10]]
    recommendations_skus = recommendation_df.drop("sku",axis=1).apply(lambda x: map_indices_to_skus(x,index_dict[0]),axis=0)
    final_recommendatons = embeddings.reset_index().merge(recommendations_skus.reset_index(),how="left",on = "index")[["sku",1,2,3,4,5,6,7,8,9,10]]
    return final_recommendatons


In [54]:
indices, index_dict = find_nearest_neighbors(embeddings)
recommendatons = generate_recommendations(indices,embeddings,index_dict)

In [55]:
recommendatons.head()

Unnamed: 0,sku,1,2,3,4,5,6,7,8,9,10
0,MYO-1080,MYO-286,MYO-4022,MYO-2951,MYO-7,MYO-2723,MYO-3578,MYO-1270,MYO-1682,MYO-2861,MYO-3524
1,MYO-8482,MYO-2694,MYO-541,MYO-1230,MYO-286,MYO-3469,MYO-2009,MYO-919,MYO-140,MYO-1973,MYO-2143
2,MYO-2615,MYO-2390,MYO-256,MYO-4155,MYO-3578,MYO-2676,MYO-2173,MYO-3997,MYO-65,MYO-4022,MYO-480
3,MYO-631,MYO-1544,MYO-1094,MYO-3074,MYO-1230,MYO-2791,MYO-2858,MYO-3235,MYO-434,MYO-3438,MYO-16
4,MYO-6502,MYO-207,MYO-5816,MYO-3578,MYO-4446,MYO-2009,MYO-32,MYO-480,MYO-3753,MYO-1674,MYO-7164
