In [1]:
from model.MF.MF import MatrixFactorization


from model.MF.preprocessing import ids_encoder, preprocess_mind_data, format_data


import pandas as pd
import movielens
import splitter
import mind

In [2]:
# Load the MovieLens "100K" ratings through the project's `movielens` loader.
df = movielens.load_pandas_df("100K")


In [23]:
# Display the loaded frame (columns in the output below: userID, itemID, rating, timestamp).
df

Unnamed: 0,userID,itemID,rating,timestamp
0,186,302,3.0,891717742
1,22,377,1.0,878887116
2,244,51,2.0,880606923
3,166,346,1.0,886397596
4,298,474,4.0,884182806
...,...,...,...,...
99994,880,476,3.0,880175444
99995,716,204,5.0,879795543
99996,276,1090,1.0,874795795
99997,13,225,2.0,882399156


In [24]:
# Keep only the three columns used for matrix factorization; the timestamp is dropped.
ratings = pd.DataFrame(df.loc[:, ["userID", "itemID", "rating"]])

In [25]:

# ids_encoder returns the frame plus one encoder per ID column
# (presumably mapping raw IDs to contiguous indices; confirm in model.MF.preprocessing).
ratings, uencoder, iencoder = ids_encoder(ratings)

# m = number of distinct users, n = number of distinct items
m, n = (ratings[column].nunique() for column in ("userID", "itemID"))


In [6]:
# Split the ratings with the project's `splitter.interactive_split`.
# The result is indexed below as split_df[0] and split_df[1] (presumably train and test; confirm).
split_df = splitter.interactive_split(ratings)



In [7]:
# Pass the first two parts of the split straight to format_data to get model inputs/targets.
x_train, y_train, x_test, y_test = format_data(*split_df[:2], 'movielens')

In [8]:


# Matrix factorization on the explicit ratings, trained with the model's default loss.
# k, alpha and lamb are passed to the constructor as-is
# (presumably latent dimension, learning rate and regularization weight; confirm in model.MF.MF).
# NOTE(review): the logged loss plateaus around 1.49 after a few epochs; lamb=1.5 may be too strong. Confirm.
MF = MatrixFactorization(m, n, k=10, alpha=0.01, lamb=1.5)

# `history` holds whatever fit() returns; validation_data is evaluated alongside training (see the log below).
history = MF.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test))



Training Matrix Factorization Model ...
k=10 	 alpha=0.01 	 lambda=1.5
epoch 1/10 - loss : 1.926 - val_loss : 1.934
epoch 2/10 - loss : 1.533 - val_loss : 1.537
epoch 3/10 - loss : 1.503 - val_loss : 1.506
epoch 4/10 - loss : 1.494 - val_loss : 1.497
epoch 5/10 - loss : 1.491 - val_loss : 1.493
epoch 6/10 - loss : 1.489 - val_loss : 1.491
epoch 7/10 - loss : 1.488 - val_loss : 1.49
epoch 8/10 - loss : 1.487 - val_loss : 1.49
epoch 9/10 - loss : 1.487 - val_loss : 1.489
epoch 10/10 - loss : 1.487 - val_loss : 1.489


In [59]:
# Score the trained model on the held-out split; the call prints the error and returns it (see output below).
MF.evaluate(x_test, y_test)

validation error : 1.489


1.4891639952848525

# BCE LOSS #

In [36]:
# Alternative labelling (not applied here): treat only ratings above this threshold
# as positive, i.e. (ratings["rating"] > threshold).astype(int).
threshold = 3.5

# Implicit feedback: every observed interaction (rating > 0) becomes a positive label.
# The labels go into a NEW frame so the explicit `ratings` stay intact and the
# MSE cells above can still be re-run after this cell has executed.
implicit_ratings = ratings.assign(rating=(ratings["rating"] > 0).astype(int))

m = implicit_ratings["userID"].nunique()   # total number of users
n = implicit_ratings["itemID"].nunique()   # total number of items
split_df = splitter.interactive_split(implicit_ratings)

x_train, y_train, x_test, y_test = format_data(split_df[0], split_df[1], 'movielens')



In [37]:
# Inspect both parts of the split; every rating in the output below is 1 after binarization.
split_df

[       userID  itemID  rating
 0           0      30       1
 1           0      38       1
 2           0     162       1
 3           0     225       1
 4           0     168       1
 ...       ...     ...     ...
 79994     942       1       1
 79995     942    1187       1
 79996     942     940       1
 79997     942      37       1
 79998     942     228       1
 
 [79999 rows x 3 columns],
        userID  itemID  rating
 0           0     241       1
 1           0     154       1
 2           0     112       1
 3           0     232       1
 4           0      57       1
 ...       ...     ...     ...
 19995     942     823       1
 19996     942     731       1
 19997     942     273       1
 19998     942      63       1
 19999     942     420       1
 
 [20000 rows x 3 columns]]

In [38]:

# Same model with a smaller alpha, trained with the 'bce' loss on the binarized labels.
# NOTE(review): at this point every label is 1 (no negatives), so this run only
# serves as a baseline before negative sampling is introduced below.
MF = MatrixFactorization(m, n, k=10, alpha=0.001, lamb=1.5)

history = MF.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test), loss='bce')



Training Matrix Factorization Model ...
k=10 	 alpha=0.001 	 lambda=1.5
epoch 1/10 - loss : 1.033 - val_loss : 1.053
epoch 2/10 - loss : 0.852 - val_loss : 0.876
epoch 3/10 - loss : 0.772 - val_loss : 0.795
epoch 4/10 - loss : 0.734 - val_loss : 0.755
epoch 5/10 - loss : 0.715 - val_loss : 0.734
epoch 6/10 - loss : 0.705 - val_loss : 0.721
epoch 7/10 - loss : 0.699 - val_loss : 0.713
epoch 8/10 - loss : 0.695 - val_loss : 0.708
epoch 9/10 - loss : 0.693 - val_loss : 0.705
epoch 10/10 - loss : 0.692 - val_loss : 0.702


In [39]:
# Score on the held-out split.
# NOTE(review): the printed error (1.011) differs from the last logged val_loss (0.702),
# so evaluate() seemingly reports a different metric than the 'bce' training loss. Confirm.
MF.evaluate(x_test, y_test)


validation error : 1.011


1.0109938552713944

### Sample Negative Data ###

In [40]:
import numpy as np
import pandas as pd

def generate_negative_samples(df, num_users, num_items, neg_ratio=1, seed=None):
    """
    Augment observed (positive) user-item pairs with randomly sampled negatives.

    For every distinct positive (userID, itemID) pair, `neg_ratio` items that the
    same user has NOT interacted with are drawn uniformly at random. Draws are
    made with replacement, so the same negative pair can occur more than once.

    Args:
        df: DataFrame with 'userID' and 'itemID' columns holding the positive
            samples. Any 'rating' column is overwritten with 1 in the returned
            copy; `df` itself is not modified.
        num_users: Total number of users. Not used by the sampling; kept so
            existing callers keep working.
        num_items: Total number of items. Negative item IDs are drawn from
            range(num_items).
        neg_ratio: Number of negatives generated per distinct positive pair.
        seed: Optional integer seed that makes both the sampling and the final
            shuffle reproducible. With the default (None) the global NumPy
            random state is used.

    Returns:
        DataFrame containing the positive rows (rating 1) and the sampled
        negative rows (rating 0), shuffled, with a fresh 0..N-1 index.

    Warns:
        UserWarning: for each user who has already interacted with every item
            in range(num_items); no negatives can be generated for that user.
    """
    import warnings

    # The np.random module and a RandomState expose the same `choice` API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    all_items = np.arange(num_items)

    # One row per distinct pair, so repeated positives yield negatives only once.
    positives = df[['userID', 'itemID']].drop_duplicates()

    negative_samples = []
    for user, seen_items in positives.groupby('userID', sort=False)['itemID']:
        n_draws = len(seen_items) * neg_ratio
        if n_draws <= 0:
            continue

        # Sample directly from the items this user never interacted with. A
        # reject-and-retry loop would spin forever once a user has seen every item.
        candidates = np.setdiff1d(all_items, seen_items.to_numpy())
        if candidates.size == 0:
            warnings.warn(
                f"user {user!r} has already interacted with all {num_items} items; "
                "no negative samples generated for this user"
            )
            continue

        drawn = rng.choice(candidates, size=n_draws)
        negative_samples.extend((user, item, 0) for item in drawn.tolist())

    positive_df = df.copy()
    positive_df['rating'] = 1

    frames = [positive_df]
    if negative_samples:
        # Only concatenate a non-empty negative frame so column dtypes stay numeric.
        frames.append(
            pd.DataFrame(negative_samples, columns=['userID', 'itemID', 'rating'])
        )

    return pd.concat(frames).sample(frac=1, random_state=seed).reset_index(drop=True)


In [41]:
# Augment the positives with 3 sampled negatives per positive pair (1:3 positive/negative ratio).
# Argument order is (num_users, num_items): m counts users, n counts items.
dataset = generate_negative_samples(ratings, m, n, neg_ratio=3)


In [42]:
# Split the positive+negative dataset with the project's `splitter.random_split`,
# then convert both parts into model inputs/targets.
split_df = splitter.random_split(dataset)
x_train, y_train, x_test, y_test = format_data(split_df[0], split_df[1], 'movielens')


  return bound(*args, **kwds)


In [43]:


# BCE training on the dataset that now contains sampled negatives.
# NOTE(review): the logged loss settles at 0.693 (about ln 2), the BCE of a constant 0.5
# prediction. This suggests the model is not separating positives from negatives; verify
# (lamb=1.5 may be too strong).
MF = MatrixFactorization(m, n, k=10, alpha=0.001, lamb=1.5)

history = MF.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test), loss='bce')



Training Matrix Factorization Model ...
k=10 	 alpha=0.001 	 lambda=1.5
epoch 1/10 - loss : 0.797 - val_loss : 0.792
epoch 2/10 - loss : 0.714 - val_loss : 0.712
epoch 3/10 - loss : 0.699 - val_loss : 0.697
epoch 4/10 - loss : 0.695 - val_loss : 0.694
epoch 5/10 - loss : 0.694 - val_loss : 0.693
epoch 6/10 - loss : 0.693 - val_loss : 0.693
epoch 7/10 - loss : 0.693 - val_loss : 0.693
epoch 8/10 - loss : 0.693 - val_loss : 0.693
epoch 9/10 - loss : 0.693 - val_loss : 0.693
epoch 10/10 - loss : 0.693 - val_loss : 0.693


In [44]:
# Score on the held-out split; the printed error (0.249) is again not the logged 'bce' val_loss (0.693).
MF.evaluate(x_test, y_test)


validation error : 0.249


0.2488126468168498

# MIND #

In [3]:
# Load the 'small' MIND variant through the project's `mind` loader; it returns four frames
# (train/dev behaviors and train/dev news, going by the names they are unpacked into).
behaviors_train_df, behaviors_dev_df, news_train_df, news_dev_df = mind.load_pandas_df('small')


100%|██████████| 51.8k/51.8k [04:15<00:00, 203KB/s]   
100%|██████████| 30.2k/30.2k [03:51<00:00, 131KB/s]  


/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/MINDsmall_train/behaviors.tsv
/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/MINDsmall_dev/behaviors.tsv
/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/MINDsmall_train/news.tsv
/home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/MINDsmall_dev/news.tsv



In [4]:
# Turn the raw training behaviors into (userID, itemID, rating) rows (see the display of train_df below).
# This rebinds `uencoder` / `iencoder`, which the MovieLens section also assigns.
train_df, uencoder, iencoder = preprocess_mind_data(behaviors_train_df)


In [5]:
# Same preprocessing for the dev behaviors; the two encoders returned by this call are discarded.
# NOTE(review): if preprocess_mind_data fits fresh encoders on each call, dev user/item indices
# will not line up with the train encoding (and may exceed m/n). Confirm whether the train
# encoders can be reused here.
dev_df, _, _ = preprocess_mind_data(behaviors_dev_df)


In [6]:
# Display the preprocessed training interactions (columns: userID, itemID, rating).
train_df

Unnamed: 0,userID,itemID,rating
0,2246,30687,1
1,2246,22278,1
2,2246,16757,1
3,2246,24335,1
4,2246,5680,1
...,...,...,...
9506457,20362,18956,1
9506458,20362,7229,1
9506459,32251,8857,1
9506460,32251,26331,1


In [None]:

# Model dimensions are taken from the TRAIN interactions only.
# NOTE(review): IDs that occur only in dev_df are not counted here; verify they stay within [0, m) / [0, n).
m = train_df["userID"].nunique()   # total number of users
n = train_df["itemID"].nunique()   # total number of items


50000

In [9]:
# Convert the train and dev frames into model inputs/targets using the 'mind' format option.
x_train, y_train, x_test, y_test = format_data(train_df, dev_df, 'mind')


In [11]:
# Instantiate and train the model on MIND with the default loss (no `loss` argument is passed).
# NOTE(review): the log below shows loss = nan from the first epoch, so this run produces no usable
# model. Possible causes to check: dev IDs outside the train-derived m/n, or divergence
# with alpha=0.01 and k=50.
MF = MatrixFactorization(m, n, k=50, alpha=0.01, lamb=1.5)
history = MF.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test))



Training Matrix Factorization Model ...
k=50 	 alpha=0.01 	 lambda=1.5
epoch 1/10 - loss : nan - val_loss : nan
epoch 2/10 - loss : nan - val_loss : nan
epoch 3/10 - loss : nan - val_loss : nan
epoch 4/10 - loss : nan - val_loss : nan
epoch 5/10 - loss : nan - val_loss : nan
epoch 6/10 - loss : nan - val_loss : nan
epoch 7/10 - loss : nan - val_loss : nan
epoch 8/10 - loss : nan - val_loss : nan
epoch 9/10 - loss : nan - val_loss : nan
epoch 10/10 - loss : nan - val_loss : nan


In [None]:
# Evaluate the model on the dev split.
# NOTE(review): the training log above is all nan, so this score is not meaningful until that is fixed.
MF.evaluate(x_test, y_test)