In [16]:
#since the system is based on one data set and I wanted a system that does clustering
#I extracted a dataset with items and customers from the main task file 
#to run this make sure jupyter notebook/jupyter lab settings in EDIT is running on GPU
    #The files were extracted by using pgadmin


In [1]:
# import the dataset
import pandas as pd
task_file = pd.read_csv(r'c:\Output\task_file.csv')
items_df = pd.read_csv(r'c:\Output\Items.csv')
customers_df = pd.read_csv(r'c:\Output\Customers.csv')

In [2]:
items_df = pd.read_csv(r'c:\Output\Items.csv')
customers_df = pd.read_csv(r'c:\Output\Customers.csv')

In [3]:
print('The dimensions of items dataframe are:', items_df.shape,'\nThe dimensions of customers dataframe are:', customers_df.shape)

The dimensions of items dataframe are: (8634, 3) 
The dimensions of customers dataframe are: (200000, 4)


In [4]:
# Take a look at items_df
items_df.head()

Unnamed: 0,d_global_item_id,count,d_item_group_id
0,7748,34,0
1,108875,2,0
2,14837,2,0
3,10020,30,0
4,3251,4,0


In [5]:
# Take a look at customers_df
customers_df.head()

Unnamed: 0,d_person_id,d_global_item_id,sales_amount,d_date_id
0,1,8320,0.505051,2014-09-01
1,1,1,1.707071,2014-09-01
2,1,1113,2.727273,2014-09-01
3,1,3336,3.535354,2014-09-01
4,1,11707,0.959596,2014-09-01


In [6]:
# items ID to customer mapping
items_names = items_df.set_index('d_global_item_id')['count'].to_dict()
n_users = len(customers_df.d_person_id.unique())
n_items = len(customers_df.d_global_item_id.unique())
print("Number of unique users:", n_users)
print("Number of unique items:", n_items)
print("The full recommendation matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of transactions:", (len(customers_df))/2)
print("Therefore: ", (len(customers_df)/2) / (n_users*n_items) * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 21
Number of unique items: 8634
The full recommendation matrix will have: 181314 elements.
----------
Number of transactions: 100000.0
Therefore:  55.15293910012464 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [7]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # think of this as a lookup table for the input.
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # think of this as a lookup table for the input.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        # matrix multiplication
        users, items = data[:,0], data[:,1]
        return (self.user_factors(users)*self.item_factors(items)).sum(1)
    # def forward(self, user, item):
    # 	# matrix multiplication
    #     return (self.user_factors(user)*self.item_factors(item)).sum(1)
    
    def predict(self, user, item):
        return self.forward(user, item)

In [8]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    def __init__(self):
        self.customers = customers_df.copy()
        
        # Extract all user IDs and item IDs
        users = customers_df.d_global_item_id.unique()
        items = customers_df.d_person_id.unique()
        
        #--- Producing new continuous IDs for users and items ---
        
        # Unique values : index
        self.d_person_id2idx = {o:i for i,o in enumerate(users)}
        self.d_global_item_id2idx = {o:i for i,o in enumerate(items)}
        
        # Obtained continuous ID for users and items
        self.idx2d_person_id = {i:o for o,i in self.d_person_id2idx.items()}
        self.idx2d_global_item_id = {i:o for o,i in self.d_global_item_id2idx.items()}
        
        # return the id from the indexed values as noted in the lambda function down below.
        self.sales_amounts.d_global_item_id = customers_df.d_global_item_id.apply(lambda x: self.d_global_item_id2idx[x])
        self.sales_amounts.d_person_id = customers_df.d_person_id.apply(lambda x: self.d_person_id2idx[x])
        
        
        self.x = self.ratings.drop(['sales_amount', 'd_date_id'], axis=1).values
        self.y = self.ratings['sales_amount'].values
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def __len__(self):
        return len(self.sales_amount)

In [9]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# GPU enable if you have a GPU...
if cuda:
    model = model.cuda()

# MSE loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(21, 8)
  (item_factors): Embedding(8634, 8)
)
user_factors.weight tensor([[0.0238, 0.0101, 0.0212, 0.0135, 0.0093, 0.0362, 0.0274, 0.0292],
        [0.0423, 0.0039, 0.0221, 0.0090, 0.0452, 0.0236, 0.0330, 0.0085],
        [0.0262, 0.0104, 0.0357, 0.0398, 0.0314, 0.0295, 0.0113, 0.0360],
        [0.0393, 0.0010, 0.0049, 0.0147, 0.0114, 0.0441, 0.0274, 0.0110],
        [0.0370, 0.0392, 0.0070, 0.0118, 0.0020, 0.0143, 0.0230, 0.0357],
        [0.0456, 0.0456, 0.0263, 0.0332, 0.0364, 0.0494, 0.0301, 0.0234],
        [0.0222, 0.0053, 0.0127, 0.0095, 0.0184, 0.0112, 0.0037, 0.0311],
        [0.0415, 0.0462, 0.0472, 0.0124, 0.0229, 0.0332, 0.0447, 0.0372],
        [0.0371, 0.0109, 0.0312, 0.0452, 0.0084, 0.0271, 0.0413, 0.0142],
        [0.0115, 0.0377, 0.0214, 0.0043, 0.0275, 0.0275, 0.0019, 0.0358],
        [0.0327, 0.0405, 0.0027, 0.0116, 0.0325, 0.0240, 0.0131, 0.0243],
        [0.0365, 0.0422, 0.0009, 0.0412, 0.041

KeyError: 8320

In [10]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
         if cuda:
            x, y = x.cuda(), y.cuda()
            optimizer.zero_grad()
            outputs = model(x)
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

NameError: name 'train_loader' is not defined

In [11]:
# By training the model, we will have tuned latent factors for items and users.
c = 0
uw = 0
iw = 0 
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
        if c == 0:
          uw = param.data
          c +=1
        else:
          iw = param.data
        #print('param_data', param_data)

user_factors.weight tensor([[0.0238, 0.0101, 0.0212, 0.0135, 0.0093, 0.0362, 0.0274, 0.0292],
        [0.0423, 0.0039, 0.0221, 0.0090, 0.0452, 0.0236, 0.0330, 0.0085],
        [0.0262, 0.0104, 0.0357, 0.0398, 0.0314, 0.0295, 0.0113, 0.0360],
        [0.0393, 0.0010, 0.0049, 0.0147, 0.0114, 0.0441, 0.0274, 0.0110],
        [0.0370, 0.0392, 0.0070, 0.0118, 0.0020, 0.0143, 0.0230, 0.0357],
        [0.0456, 0.0456, 0.0263, 0.0332, 0.0364, 0.0494, 0.0301, 0.0234],
        [0.0222, 0.0053, 0.0127, 0.0095, 0.0184, 0.0112, 0.0037, 0.0311],
        [0.0415, 0.0462, 0.0472, 0.0124, 0.0229, 0.0332, 0.0447, 0.0372],
        [0.0371, 0.0109, 0.0312, 0.0452, 0.0084, 0.0271, 0.0413, 0.0142],
        [0.0115, 0.0377, 0.0214, 0.0043, 0.0275, 0.0275, 0.0019, 0.0358],
        [0.0327, 0.0405, 0.0027, 0.0116, 0.0325, 0.0240, 0.0131, 0.0243],
        [0.0365, 0.0422, 0.0009, 0.0412, 0.0415, 0.0237, 0.0037, 0.0228],
        [0.0327, 0.0469, 0.0066, 0.0144, 0.0415, 0.0214, 0.0249, 0.0051],
        [0.0275, 0

In [12]:
trained_item_embeddings = model.item_factors.weight.data.cpu().numpy()

In [13]:
len(trained_item_embeddings) # unique item factor weights

8634

In [14]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_item_embeddings)

In [15]:
'''It can be seen here that the items that are in the same cluster tend to be of
similar item groups. Also note the algorithm only obtained the relationships by looking at the numbers representing how
users have responded to the item selections.'''
for cluster in range(10):
  print("Cluster #{}".format(cluster))
  d_items = []
  for d_itemsidx in np.where(kmeans.labels_ == cluster)[0]:
    d_itemsid = train_set.idx2d_global_item_id[d_itemsidx]
    sales_count = customers_df.loc[customers_df['d_global_item_id']==d_itemsid].count()[0]
    d_items.append((user_names[d_itemsid], sales_count))
  for d_items in sorted(d_items, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", d_items[0])

Cluster #0


NameError: name 'train_set' is not defined