In [13]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by later cells,
# so code that is about to break on a library upgrade gives no advance signal.
warnings.filterwarnings("ignore")

# Estimate clusters’ ratings to create cluster-items matrix CI:

In [14]:
# Load the ratings table; later cells read its 'UID', 'MID' and 'rate' columns.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by this project.
ratings = pd.read_pickle('datasets/seen/ratings_rest.dat')

In [15]:
# Load the movie catalogue: a '::'-separated file without a header row, so the
# column names are supplied here.
movies = pd.read_csv(
    "datasets/ml-1m/movies.dat",
    # A separator longer than one character is treated as a regular expression
    # by the python engine; '::' needs no escaping (the former '\::' relied on
    # the invalid escape sequence '\:', which newer Python versions warn about).
    sep='::',
    engine='python',
    names=['MID', 'name', 'genre'],
    encoding='latin-1',
)

# Split the pipe-delimited genre string of each movie into a list of genres.
movies['genre'] = movies['genre'].apply(lambda genres: genres.split('|'))

In [16]:
# Two independent, empty frames that share the same initial schema
# (MID, name, cluster): one for the cluster currently being processed and one
# for the combined table over all clusters.
cluster_items, all_cluster_items = (
    pd.DataFrame(columns=["MID", "name", "cluster"]) for _ in range(2)
)

In [17]:
'''
estimate ratings for all items in each cluster
if any users in cluster c have already rated item i,
then the rating of item i in cluster c is the average of the ratings of item i by users in cluster c
else if users in cluster c have rated items similar to item i,
then the rating of item i in cluster c is the average of the ratings of those similar items by users in cluster c
(similar items are items that share at least one genre with item i)
else the rating of item i in cluster c is the average of the ratings of item i by all users
'''

'\nestimate ratings for all items in each cluster\nif any users in cluster c have already rated item i,\nthen the rating of item i in cluster c is the average of the ratings of item i by users in cluster c\nelse if users in cluster c have rated items similar to item i,\nthen the rating of item i in cluster c is the average of the ratings of those similar items by users in cluster c\n(similar items are items that share at least one genre with item i)\nelse the rating of item i in cluster c is the average of the ratings of item i by all users\n'

In [18]:
# Estimate a rating for every movie in every user cluster and save the results.
#
# For each cluster the estimate for a movie is, in order of preference:
#   1. the mean of the ratings the cluster's users gave that movie;
#   2. otherwise the mean of all ratings the cluster's users gave to movies that
#      share at least one genre with it;
#   3. otherwise the mean of the ratings the movie received from all users
#      (NaN when `ratings` holds no row for the movie).
#
# Rows are collected in a list and turned into a DataFrame once per cluster:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. Per-movie means come from one groupby per cluster instead of one
# boolean filter per movie, and the genre fallback is computed once per
# distinct genre set. Both result frames are rebuilt from scratch, so
# re-running this cell no longer duplicates rows in all_cluster_items.
# NOTE(review): groupby means can differ from Series.mean in the last
# floating-point digit when ratings are not whole numbers.
N_CLUSTERS = 12  # one "datasets/combined-u/cluster<k>.pkl" file per cluster
ITEM_COLUMNS = ["MID", "name", "cluster", "rate"]

# Cluster-independent data, computed once.
global_item_mean = ratings.groupby('MID')['rate'].mean().to_dict()
all_mids = list(movies['MID'])
all_genre_sets = [frozenset(genres) for genres in movies['genre']]
# Assumes MID is unique in `movies`, so each row carries its own name and genres.
movie_rows = list(zip(all_mids, movies['name'], all_genre_sets))

# genre set -> MIDs of every movie sharing at least one genre with it
# (depends only on `movies`, so it is shared by all clusters).
similar_items_by_genres = {}

per_cluster_frames = []

for i in range(N_CLUSTERS):
    print("Cluster", i, ":")

    # get the users in cluster i
    # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
    cluster_users = pd.read_pickle("datasets/combined-u/cluster" + str(i) + ".pkl")

    # get the ratings of the users in cluster i
    cluster_ratings = ratings[ratings['UID'].isin(cluster_users['user_id'])]
    print("Number of ratings:", len(cluster_ratings))
    print("Number of users:", len(cluster_users))
    print("Number of rated items:", len(cluster_ratings['MID'].unique()))

    # mean rating of every movie rated at least once inside this cluster
    cluster_item_mean = cluster_ratings.groupby('MID')['rate'].mean().to_dict()

    # genre set -> mean rating of similar movies in this cluster,
    # or None when the cluster rated none of them
    genre_fallback = {}

    records = []
    for mid, name, genres in movie_rows:
        if mid in cluster_item_mean:
            # rule 1: the cluster rated this movie directly
            avg_rating = cluster_item_mean[mid]
        else:
            if genres not in genre_fallback:
                if genres not in similar_items_by_genres:
                    similar_items_by_genres[genres] = [
                        other_mid
                        for other_mid, other_genres in zip(all_mids, all_genre_sets)
                        if not genres.isdisjoint(other_genres)
                    ]
                similar_ratings = cluster_ratings.loc[
                    cluster_ratings['MID'].isin(similar_items_by_genres[genres]), 'rate'
                ]
                genre_fallback[genres] = (
                    similar_ratings.mean() if len(similar_ratings) > 0 else None
                )
            # rule 2: mean over similar movies rated inside the cluster
            avg_rating = genre_fallback[genres]
            if avg_rating is None:
                # rule 3: mean rating of this movie over all users
                avg_rating = global_item_mean.get(mid, np.nan)
        records.append((mid, name, i, avg_rating))

    cluster_items = pd.DataFrame(records, columns=ITEM_COLUMNS)
    per_cluster_frames.append(cluster_items)

    print("Average rating of cluster", i, ":", cluster_items['rate'].mean())
    print("Estimated ratings for all items (", len(movies['MID']), "):")
    cluster_items.to_pickle("datasets/combined-u/cluster" + str(i) + "_items.pkl")

# combine the per-cluster tables and save the result to a pickle file
all_cluster_items = pd.concat(per_cluster_frames, ignore_index=True)
all_cluster_items.to_pickle("datasets/combined-u/all_clusters_items.pkl")


Cluster 0 :
Number of ratings: 199319
Number of users: 1329
Number of rated items: 3500
Average rating of cluster 0 : 3.276547307977412
Estimated ratings for all items ( 3883 ):
Cluster 1 :
Number of ratings: 70237
Number of users: 429
Number of rated items: 3173
Average rating of cluster 1 : 3.3300546897805163
Estimated ratings for all items ( 3883 ):
Cluster 2 :
Number of ratings: 117370
Number of users: 690
Number of rated items: 3413
Average rating of cluster 2 : 3.3325159625875767
Estimated ratings for all items ( 3883 ):
Cluster 3 :
Number of ratings: 92224
Number of users: 549
Number of rated items: 3242
Average rating of cluster 3 : 3.361793828875295
Estimated ratings for all items ( 3883 ):
Cluster 4 :
Number of ratings: 53671
Number of users: 331
Number of rated items: 3163
Average rating of cluster 4 : 3.3686857528745904
Estimated ratings for all items ( 3883 ):
Cluster 5 :
Number of ratings: 135893
Number of users: 778
Number of rated items: 3390
Average rating of cluster 5

In [19]:
# Plain-text preview of the combined cluster-item table; pandas elides the
# middle rows of long frames when printing.
print(all_cluster_items)

        MID                                name cluster      rate
0         1                    Toy Story (1995)       0  4.172414
1         2                      Jumanji (1995)       0  3.268116
2         3             Grumpier Old Men (1995)       0  2.802632
3         4            Waiting to Exhale (1995)       0  2.571429
4         5  Father of the Bride Part II (1995)       0  3.018868
...     ...                                 ...     ...       ...
46591  3948             Meet the Parents (2000)      11  3.608696
46592  3949          Requiem for a Dream (2000)      11  3.666667
46593  3950                    Tigerland (2000)      11  5.000000
46594  3951             Two Family House (2000)      11  3.834861
46595  3952               Contender, The (2000)      11  3.666667

[46596 rows x 4 columns]
