In [6]:
from cornac.data import Reader
from cornac.datasets import movielens
from cornac.data import Dataset, FeatureModality
from cornac.eval_methods import RatioSplit, StratifiedSplit
from cornac.metrics import RMSE
from cornac.models import MF, ItemKNN, UserKNN, NMF, BPR
import pandas as pd
import numpy as np
import cornac
import math
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
reader = Reader()
rating_data_pd = pd.read_csv(
    "./data/ml-100k/indexed_interactions.csv",
    sep="\t",
    header=None,
    names=["userID", "itemID", "Rating", "Timestamp"],
)
rating_data = rating_data_pd.to_numpy()
rating_data.__len__()
rating_data_pd

Unnamed: 0,userID,itemID,Rating,Timestamp
0,0,0,3,881250949
1,1,1,3,891717742
2,2,2,1,878887116
3,3,3,2,880606923
4,4,4,1,886397596
...,...,...,...,...
99282,875,173,3,880175444
99283,708,247,5,879795543
99284,37,982,1,874795795
99285,58,442,2,882399156


In [3]:
df_m = pd.read_csv(
    "./data/ml-100K/u.item",
    sep="|",
    names=[
        "movieID",
        "Name",
        "Date",
        "Video_Date",
        "IMDB_URL",
        "unknown",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    ],
    header=None,
    encoding="latin-1",
)
print(df_m.shape)
df_m = df_m[
    [
        "movieID",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western",
    ]
]

df_movies_mapped = pd.read_csv(
    "./data/ml-100K/i_id_mapping.csv",
    sep="\t",
    names=["movieID", "itemID"],
    header=None,
    encoding="latin-1",
)
movies = pd.merge(df_m, df_movies_mapped, how="inner", on="movieID")
movies

(1682, 24)


Unnamed: 0,movieID,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,itemID
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,24
1,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,147
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,233
3,4,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,47
4,5,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1344,1592,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1305
1345,1597,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1324
1346,1598,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1319
1347,1615,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1341


In [4]:
movies = movies.drop(columns=["movieID"])
movies = movies.sort_values(by="itemID")

In [5]:
movies

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,itemID
240,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
300,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1
375,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2
50,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,3
344,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1344
1192,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1345
1176,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1346
1261,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1347


In [6]:

unique_genres = [
    "Action",
    "Thriller",
    "Romance",
    "Western",
    "Children's",
    "Mystery",
    "Fantasy",
    "Film-Noir",
    "Documentary",
    "Comedy",
    "Adventure",
    "Sci-Fi",
    "Horror",
    "Crime",
    "Musical",
    "War",
    "Animation",
    "Drama",
]
genre = movies[unique_genres]
item_features_numpy = genre.to_numpy()

users = pd.read_csv("./data/ml-100k/u_id_mapping.csv", sep="\t")

users = users.sort_values(by="userID")

users = users.drop(columns=users.columns[0])
gender_map = {"M": 0, "F": 1}
users["Gender"] = users["Gender"].map(gender_map)
user_features_numpy = users.to_numpy()
users

Unnamed: 0,Gender,userID
0,0,0
1,1,1
2,0,2
3,0,3
4,0,4
...,...,...
938,1,938
939,0,939
940,1,940
941,1,941


In [7]:
def create_genre_column(r):
    all_genres = [g for g in unique_genres if r[g] == 1]
    return "|".join(all_genres)


movies["genres"] = movies.apply(create_genre_column, axis=1)
movies

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,itemID,genres
240,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Comedy
300,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,Thriller|Mystery|Film-Noir|Crime
375,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,Children's|Comedy
50,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,3,Romance|Western|War|Drama
344,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,4,Crime|Drama
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1344,Drama
1192,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1345,Comedy
1176,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1346,Drama
1261,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1347,Drama


In [8]:
dataset = rating_data
unique_genres.__len__()

18

In [28]:
ratio_split = StratifiedSplit(
    data=dataset, test_size=0.2, rating_threshold=0.0, seed=123, verbose=True
)
hr_10 = cornac.metrics.HitRatio(k=10)
# user_feature_modality.build()
# item_feature_modality.build()
# ratio_split.add_modalities(
#     user_feature=user_feature_modality, item_feature=item_feature_modality
# )
model = MF(
    k=10, max_iter=50, learning_rate=0.01, lambda_reg=0.02, seed=123, name="lmd0.02"
)
model2 = MF(
    k=10, max_iter=50, learning_rate=0.01, lambda_reg=0.01, seed=123, name="lmd0.01"
)
models = [model, model2]
cornac.Experiment(
    ratio_split, models=models, metrics=[hr_10]
).run()

rating_threshold = 0.0
exclude_unknowns = True
---
Training data:
Number of users = 943
Number of items = 1349
Number of ratings = 79045
Max rating = 5.0
Min rating = 1.0
Global mean = 3.5
---
Test data:
Number of users = 943
Number of items = 1349
Number of ratings = 20242
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 943
Total items = 1349

[lmd0.02] Training started!

[lmd0.02] Evaluation started!


Ranking: 100%|██████████| 943/943 [00:00<00:00, 10066.50it/s]



[lmd0.01] Training started!

[lmd0.01] Evaluation started!


Ranking: 100%|██████████| 943/943 [00:00<00:00, 10558.37it/s]


TEST:
...
        | HitRatio@10 | Train (s) | Test (s)
------- + ----------- + --------- + --------
lmd0.02 |      0.4263 |    0.1306 |   0.0995
lmd0.01 |      0.4252 |    0.0843 |   0.0911






In [32]:
hr_10.compute([464, 431, 179, 291, 928, 321, 1018, 377],[1192,   96,  619,  430,  157,  329,  994 , 566,  239,  321]
)

1.0

In [22]:
user_ids = users.to_numpy()[:, 1]
item_ids = movies["itemID"].to_numpy()
item_ids.__len__()

1349

In [23]:
# get the top_k ratings for all users:
top_k = 10
reco_matrix = np.zeros((len(models), len(user_ids), top_k), dtype=int)
reco_matrix_mapped_items = np.zeros(
    (len(models), len(user_ids), len(item_ids)), dtype=int
)
reco_matrix_mapped_scores = np.zeros(
    (len(models), len(user_ids), len(item_ids)), dtype=float
)
reco_matrix_all = np.zeros((len(models), len(user_ids), len(item_ids)), dtype=int)


for u in user_ids:
    for i in range(len(models)):
        reco_items = models[i].recommend(u)
        items_mapped, mapped_scores = models[i].rank(
            user_idx=u, item_indices=list(item_ids)
        )
        reco_matrix_mapped_items[i][u] = items_mapped
        reco_matrix_mapped_scores[i][u] = mapped_scores
        reco_matrix_all[i][u] = reco_items
        reco_matrix[i][u] = reco_items[:top_k]

        # print(reco_matrix[0][3])

In [24]:
reco_matrix[0][1]

array([110, 319, 220, 374, 918, 745,  99, 607,  57, 191])

In [30]:
models[1].rank(user_idx=1,item_indices=[0,2,3])

(array([2, 0, 3]), array([3.9977236, 4.1879973, 2.9608684], dtype=float32))

In [39]:
models[1].score(1,2)

4.187997146865186

In [41]:
reco_matrix_mapped_scores[1][1][3]

2.9608683586120605

In [37]:
item_ids

array([   0,    1,    2, ..., 1346, 1347, 1348])

In [44]:
####Import Metrics####
from mymetrics.GenrePrecision import GenrePrecision

top_k = 10
###intialize them
gp = GenrePrecision(users, unique_genres, top_k)

In [45]:
####IR Metrics####
model_gps = []
model_grs = []
model_gms = []
model_gdcgs = []
model_gmrrs = []

for i in range(len(models)):
    ####Precision####
    model_gps.append(gp.compute(reco_matrix[i], movies))




In [48]:
for i in range(len(models)):
    print(sum(abs(model_gps[i])))

0.15616970094582036
0.1405879029759627


In [50]:
reco_matrix_mapped_scores.shape

(2, 943, 1349)

In [53]:
np.save("reco_matrix_mapped_scores",reco_matrix_mapped_scores)
np.save("reco_matrix",reco_matrix)

In [63]:
reco_matrix[0][0]

array([1192,   96,  619,  430,  157,  329,  994,  566,  239,  321])

In [62]:
models[0].rank(0)

(array([ 939,  163,  481, ..., 1123, 1273, 1147]),
 array([3.871816 , 2.750449 , 4.069462 , ..., 3.0672047, 2.8950624,
        3.0377712], dtype=float32))

In [72]:
models[0].iid_map[619]

481

In [73]:
models[0].recommend(0)[:10]

[1192, 96, 619, 430, 157, 329, 994, 566, 239, 321]

In [84]:
models[0].iid_map[96]


163

In [None]:
models[0].uid_map[33]

33

In [85]:
models[0].rank(0, [939,163])

(array([939, 163]), array([4.918234 , 4.8813343], dtype=float32))

In [87]:
x,y=models[0].rank(0)

In [None]:
max(y)

4.918234

In [90]:
models[0].iid_map

OrderedDict([(380, 0),
             (626, 1),
             (289, 2),
             (489, 3),
             (822, 4),
             (86, 5),
             (742, 6),
             (526, 7),
             (363, 8),
             (10, 9),
             (547, 10),
             (365, 11),
             (485, 12),
             (329, 13),
             (422, 14),
             (519, 15),
             (520, 16),
             (221, 17),
             (984, 18),
             (643, 19),
             (658, 20),
             (430, 21),
             (92, 22),
             (302, 23),
             (665, 24),
             (510, 25),
             (437, 26),
             (0, 27),
             (83, 28),
             (389, 29),
             (649, 30),
             (621, 31),
             (305, 32),
             (151, 33),
             (838, 34),
             (157, 35),
             (739, 36),
             (85, 37),
             (197, 38),
             (48, 39),
             (172, 40),
             (204, 41),
          

In [None]:
score_dicts = []
for i in range(len(user_ids)):
    iids = 
    score = reco_items_scores_all[i]
    score_dicts.append(OrderedDict(zip(iids, score)))

In [100]:
reco_matrix_mapped_scores[0]

array([[3.87181592, 2.75044894, 4.06946182, ..., 3.06720471, 2.89506245,
        3.03777122],
       [3.95757627, 3.57638502, 3.90918159, ..., 2.97954321, 2.98584437,
        2.92918181],
       [4.37335157, 2.85819197, 5.26001549, ..., 2.67650414, 2.61237574,
        2.84262347],
       ...,
       [3.5454576 , 2.7566402 , 3.70174885, ..., 2.5051918 , 2.07629728,
        2.4400425 ],
       [3.67441225, 3.25669909, 2.98512125, ..., 2.71628594, 2.8474226 ,
        2.69638228],
       [4.15575552, 3.9533999 , 3.84968376, ..., 3.40439463, 3.40589929,
        3.26646209]])

In [106]:
from collections import OrderedDict

sorted_by_values = OrderedDict(sorted(models[0].iid_map.items(), key=lambda item: item[1]))
keys_sorted_by_values = list(sorted_by_values.keys())


In [111]:
reco_items_scores_all = [OrderedDict() for _ in range(len(user_ids))]

for u in user_ids:
    actual_index_u = u
    mapped_index_u = models[0].uid_map[actual_index_u]
    # for i in item_ids:
    mapped_scores = reco_matrix_mapped_scores[0][actual_index_u]
    ordered_dict = OrderedDict(zip(keys_sorted_by_values, mapped_scores))
    reco_items_scores_all[mapped_index_u] = ordered_dict
    
    
    

In [146]:
import pickle
with open('score_dicts.pkl', 'wb') as file:
    pickle.dump(reco_items_scores_all, file)

print("List of OrderedDicts saved to 'score_dicts.pkl'.")


List of OrderedDicts saved to 'score_dicts.pkl'.


In [118]:
reco_matrix_mapped_scores[0][0]

array([3.87181592, 2.75044894, 4.06946182, ..., 3.06720471, 2.89506245,
       3.03777122])

In [119]:
models[0].rank(0)[:10]

(array([ 939,  163,  481, ..., 1123, 1273, 1147]),
 array([3.871816 , 2.750449 , 4.069462 , ..., 3.0672047, 2.8950624,
        3.0377712], dtype=float32))

In [120]:
models[0].iid_map[1192]

939

In [134]:
models[0].recommend(11)[:10]

[289, 586, 1040, 1063, 110, 45, 200, 681, 191, 716]

In [148]:
x,y=models[0].rank(models[0].uid_map[0])

In [161]:
models[0].iid_map[1192]

939

In [149]:
x[:10]

array([939, 163, 481,  21,  35,  13, 306, 442, 255, 108])

In [142]:
models[0].score(99,2)

4.843582003732976

In [147]:
reco_items_scores_all[0][1192]

4.918233871459961

In [150]:
y[:10]

array([3.871816 , 2.750449 , 4.069462 , 3.3604167, 3.5519223, 3.9583373,
       2.646601 , 3.7401524, 4.0020523, 3.6479642], dtype=float32)

In [156]:
max(x)

1348

In [152]:
idxmax(x)

NameError: name 'idxmax' is not defined

In [157]:
index_min = max(range(len(y)), key=y.__getitem__)


In [158]:
index_min

939

In [12]:
models[0].recommend(1)[:10]

[110, 319, 220, 374, 918, 745, 99, 607, 57, 191]

In [14]:
models[0].iid_map

OrderedDict([(380, 0),
             (626, 1),
             (289, 2),
             (489, 3),
             (822, 4),
             (86, 5),
             (742, 6),
             (526, 7),
             (363, 8),
             (10, 9),
             (547, 10),
             (365, 11),
             (485, 12),
             (329, 13),
             (422, 14),
             (519, 15),
             (520, 16),
             (221, 17),
             (984, 18),
             (643, 19),
             (658, 20),
             (430, 21),
             (92, 22),
             (302, 23),
             (665, 24),
             (510, 25),
             (437, 26),
             (0, 27),
             (83, 28),
             (389, 29),
             (649, 30),
             (621, 31),
             (305, 32),
             (151, 33),
             (838, 34),
             (157, 35),
             (739, 36),
             (85, 37),
             (197, 38),
             (48, 39),
             (172, 40),
             (204, 41),
          

In [18]:
alist=[]
for i in range(10):
    alist.append(models[0].iid_map[i])

In [19]:
alist

[27, 46, 118, 243, 966, 380, 141, 342, 200, 458]

In [20]:
models[0].rank(0, alist)

(array([ 46, 380, 141, 458, 342, 243,  27, 200, 966, 118]),
 array([3.34516  , 4.431407 , 2.2533798, 3.3473527, 3.0156531, 4.2838416,
        3.8198872, 3.3991296, 3.2275043, 3.4782987], dtype=float32))

In [11]:
test_set=ratio_split.test_set

In [12]:
ratio_split.global_uid_map

OrderedDict([(0, 0),
             (1, 1),
             (2, 2),
             (3, 3),
             (4, 4),
             (5, 5),
             (6, 6),
             (7, 7),
             (8, 8),
             (9, 9),
             (10, 10),
             (11, 11),
             (12, 12),
             (13, 13),
             (14, 14),
             (15, 15),
             (16, 16),
             (17, 17),
             (18, 18),
             (19, 19),
             (20, 20),
             (21, 21),
             (22, 22),
             (23, 23),
             (24, 24),
             (25, 25),
             (26, 26),
             (27, 27),
             (28, 28),
             (29, 29),
             (30, 30),
             (31, 31),
             (32, 32),
             (33, 33),
             (34, 34),
             (35, 35),
             (36, 36),
             (37, 37),
             (38, 38),
             (39, 39),
             (40, 40),
             (41, 41),
             (42, 42),
             (43, 43),
        

In [14]:
x = pd.DataFrame(test_set.uir_tuple).transpose()
x.columns = ["uid", "iid", "rating"]
x = x.astype({"uid": "int", "iid": "int", "rating": "int"})
r_global_uid_map = {v: k for k, v in ratio_split.global_uid_map.items()}
r_global_iid_map = {v: k for k, v in ratio_split.global_iid_map.items()}

x["uid"] = x["uid"].map(r_global_uid_map)
x["iid"] = x["iid"].map(r_global_iid_map)

In [15]:
x

Unnamed: 0,uid,iid,rating
0,0,464,4
1,0,431,2
2,0,179,3
3,0,291,3
4,0,928,4
...,...,...,...
20237,942,450,4
20238,942,858,5
20239,942,157,4
20240,942,24,5


In [20]:
x.to_csv("testing_set_seed123_ml100k.csv", index=False, header=False)


In [19]:
models[0].iid_map[180]

455

In [35]:
rating_data_pd[rating_data_pd["userID"]==0].__len__()

39

In [36]:
39-8

31

In [37]:
1349-31

1318

In [5]:
rating_data_pd[rating_data_pd["itemID"]==793]

Unnamed: 0,userID,itemID,Rating,Timestamp
2161,116,793,5,875801239
3759,111,793,4,875635064
7194,110,793,3,875742077
12397,144,793,3,875409139
14657,299,793,4,875707690
15189,118,793,4,875692955
48931,522,793,3,875441348
92873,825,793,1,875655669
99012,416,793,4,875655986
