import dependencies

In [46]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import matthews_corrcoef

import numpy as np
import pandas as pd
import pymongo
from pymongo import MongoClient

Connect to database and extract data

In [47]:
# library to connect data base and preprocess data
#import util
DATABASE_ACCESS = "mongodb+srv://yelshall:yyForever-53611@auth-test.p4buu.mongodb.net/db?retryWrites=true&w=majority"

## connect to the mongo database and return target table as mongo collection
## load the database table into np array
## take mongo db as arguement
## return dataframe
def load_data(db,table):
    cluster = MongoClient(DATABASE_ACCESS)
    return pd.DataFrame(list(cluster[db][table].find()))

## return user data as dataframe
def get_users():
    return load_data("db","students")

## return events data as dataframe
def get_events():
    return load_data("db","events")

Utility Library

In [48]:
## get events id as numpy array
def get_events_id():
    return np.array(get_events()["_id"])

## get  users id as numpy array
def get_users_id(df):
    return np.array(df["_id"])
## check if a events inside a diction's list

def is_in_dict(obj_user,obj_events,D):
    return D[obj_user].count(obj_events)==1

## use the df to build dislike/like events dictionary
def build_liked_dict(df):
    return df.set_index("_id").to_dict()["interestedEvents"]

## use the df to build dislike events dictionary
def build_disliked_dict(df):
    return df.set_index("_id").to_dict()["unlikedEvents"]


In [66]:
events = df_train["interestedEvents"]

0      [618c67afffafd43022036124, 618c67afffafd430220...
1      [618c67acffafd4302203609f, 618c67acffafd430220...
2      [618c67acffafd43022036097, 618c67acffafd430220...
3      [618c67b0ffafd4302203614a, 618c67b1ffafd430220...
4      [618c67acffafd4302203609f, 618c67acffafd430220...
                             ...                        
613    [618c67abffafd43022036072, 618c67abffafd430220...
614    [618c67b2ffafd430220361a8, 618c67acffafd430220...
615                                                   []
616    [618c67aeffafd430220360f4, 618c67b0ffafd430220...
617                                                   []
Name: interestedEvents, Length: 618, dtype: object

In [49]:

df_train = get_users()
print(df_train.shape)
liked_dict = build_liked_dict(df_train)
disliked_dict = build_disliked_dict(df_train)

train_data = []

for user in liked_dict:
    for event in liked_dict[user]:
        train_data.append([user,event,1])

for user in disliked_dict:
    for event in disliked_dict[user]:
        train_data.append([user,event,0])


train_data = pd.DataFrame(np.array(train_data),columns = ["user_id","event_id","rating"])
users, unique_user_ids = pd.factorize(train_data['user_id'])
events, unique_event_ids = pd.factorize(train_data['event_id'])
ratings = train_data['rating'].values
n_1_labels = np.sum(ratings)
n_0_labels = ratings.shape[0] - n_1_labels
ratings = ratings.astype('float32')

(618, 18)


In [50]:
print(unique_user_ids.shape)

(616,)


In [51]:
## users and items here are 
class MatrixFactorization():
    def __init__(self, n_users, n_events, n_factors): ## This will also take care of initilizing the weights
        self.n_users = n_users
        self.n_events = n_events
        self.n_factors = n_factors
        self.user_factors = torch.rand(n_users, n_factors, dtype=torch.float32,requires_grad=False)/n_factors
        self.event_factors = torch.rand(n_events, n_factors, dtype=torch.float32,requires_grad=False)/n_factors

## these 2 function compute the gradient regard to U and V
## it uses MSE
    def gradient_U(self, users, events, ratings, lambda_):
        # users is a list of user ids
        # events is a list of item ids
        y_hat = torch.sigmoid((self.user_factors[users,:] * self.event_factors[events,:]).sum(dim=1))
        # print("user_factors[users,:]",self.user_factors[users,:])
        # print("user_factors[1,2,3]: ", self.user_factors[[0,0,0],:])
        # print("users",users)
        return ((lambda_*self.user_factors[users,:].T - self.event_factors[events,:].T * (ratings - y_hat) * (ratings*n_0_labels + (1-ratings)*n_1_labels)/(n_0_labels+n_1_labels)).T)

    def gradient_V(self, users, events, ratings, lambda_):
        # users is a list of user ids
        # items is a list of item ids
        y_hat = torch.sigmoid( (self.user_factors[users,:] * self.event_factors[events,:]).sum(dim=1) )
        return ((lambda_*self.event_factors[events,:].T - self.user_factors[users,:].T * (ratings - y_hat) * (ratings*n_0_labels + (1-ratings)*n_1_labels)/(n_0_labels+n_1_labels)).T) 




test on gradient mf class

In [52]:
# Turn training data into torch tensors
torch_users = torch.tensor(users,dtype=torch.long)
torch_events = torch.tensor(events,dtype=torch.long)
torch_ratings = torch.tensor(ratings,dtype=torch.float32)

In [53]:
n_users = len(unique_user_ids)
n_events = len(unique_event_ids)
print(f'n_users = {n_users} n_events = {n_events}', flush=True)

n_users = 616 n_events = 77


In [63]:
from sklearn.metrics import accuracy_score
from sklearn import metrics

model = MatrixFactorization(n_users, n_events, n_factors=5)

epochs = 10000
learning_rate = 0.001
lambda_ = 0.1

for epoch in range(epochs):
    model.user_factors[torch_users,:] -= learning_rate * model.gradient_U(torch_users,torch_events,torch_ratings,lambda_)
    model.event_factors[torch_events,:] -= learning_rate * model.gradient_V(torch_users,torch_events,torch_ratings,lambda_)

    if epoch % 2000 == 0:
        with torch.no_grad():
            y_hat = torch.sigmoid((model.user_factors[torch_users,:] * model.event_factors[torch_events,:]).sum(dim=1))
            print(f'Loss: {(torch_ratings * torch.log(y_hat) + (1 - torch_ratings) * torch.log((1.-y_hat))).sum()}')
            print(y_hat)
            y_hat = y_hat.gt(0.5).numpy()
            
            print(f'Epoch {epoch+1} last Matthew\'s correlation coefficient {matthews_corrcoef(y_hat,ratings)}', flush=True)
            print("y_hat",y_hat)
            print("rating",ratings)
            print("accuracy",accuracy_score(y_hat,ratings))

Loss: -33748.0703125
tensor([0.5117, 0.5117, 0.5109,  ..., 0.5161, 0.5093, 0.5187])
Epoch 1 last Matthew's correlation coefficient 0.0
y_hat [ True  True  True ...  True  True  True]
rating [1. 1. 1. ... 0. 0. 0.]
accuracy 0.2594560669456067
Loss: -33475.81640625
tensor([0.5041, 0.5047, 0.5040,  ..., 0.5066, 0.5016, 0.5055])
Epoch 2001 last Matthew's correlation coefficient -0.0062001672664986065
y_hat [ True  True  True ...  True  True  True]
rating [1. 1. 1. ... 0. 0. 0.]
accuracy 0.2631380753138075
Loss: -33323.9453125
tensor([0.5013, 0.5019, 0.5015,  ..., 0.5026, 0.4984, 0.5002])
Epoch 4001 last Matthew's correlation coefficient -0.003595082074207757
y_hat [ True  True  True ...  True False  True]
rating [1. 1. 1. ... 0. 0. 0.]
accuracy 0.29721757322175735
Loss: -33242.66015625
tensor([0.5002, 0.5007, 0.5004,  ..., 0.5009, 0.4970, 0.4979])
Epoch 6001 last Matthew's correlation coefficient 0.003580553180268431
y_hat [ True  True  True ...  True False False]
rating [1. 1. 1. ... 0. 0

In [55]:
(y_hat==True).sum()

19195

In [56]:
event_factors = model.event_factors
user_factors = model.user_factors

In [57]:
user_factors

tensor([[ 0.2465, -1.0289,  0.6850, -1.2479,  0.5317],
        [ 0.2355, -0.9114,  0.6447, -1.3151,  0.6602],
        [ 0.2317, -0.9624,  0.7140, -1.2971,  0.5197],
        ...,
        [ 0.4113,  1.7490, -1.2034,  1.7808,  0.2786],
        [ 0.8162,  0.2718,  0.7131, -1.4473,  0.3165],
        [ 0.3146,  2.5616, -0.8140,  0.9565,  0.9984]])

In [58]:
event_id = get_events_id()
user_id = get_users_id(df_train)

In [59]:
## event 
df_event_factors = pd.DataFrame(np.array(event_factors),columns=["e1","e2","e3","e4","e5"])
df_event_factors["event_id"] = event_id
dict_event_factors = df_event_factors.to_dict("record")

# ## user
df_user_factors = pd.DataFrame(np.array(user_factors),columns=["e1","e2","e3","e4","e5"])
df_user_factors["user_id"] = user_id
dict_user_factors = df_user_factors.to_dict("record")

  after removing the cwd from sys.path.


ValueError: Length of values (618) does not match length of index (616)

Save model to data base

In [None]:
DATABASE_ACCESS = "mongodb+srv://yelshall:yyForever-53611@auth-test.p4buu.mongodb.net/db?retryWrites=true&w=majority"
cluster = MongoClient(DATABASE_ACCESS)
db_event_factors = cluster["db"]["event_factors"]
db_user_factors = cluster["db"]["user_factors"]
db_event_factors.remove({})
db_user_factors.remove({})
db_event_factors.insert_many(dict_event_factors)
db_user_factors.insert_many(dict_user_factors)

  """
  


<pymongo.results.InsertManyResult at 0x7f8e28da51e0>

In [None]:
## make prediction
