In [3]:
import pandas as pd
import numpy as np
import warnings

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices raised by pandas and by the
# interpreter itself; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Estimate clusters’ ratings to create cluster-items matrix CI:

In [4]:
# Load the ratings file: header-less "UID::MID::rate::time" records
# (presumably the MovieLens 1M dump, judging by the path — TODO confirm).
# A separator longer than one character is interpreted by pandas as a regular
# expression and requires the python engine; '::' needs no escaping (the former
# '\::' was an invalid string escape sequence).
# The timestamp column is not used by this notebook, so it is dropped right away.
ratings = pd.read_csv(
    'datasets/ml-1m/ratings.dat',
    sep='::',
    engine='python',
    names=['UID', 'MID', 'rate', 'time'],
).drop(columns='time')

In [5]:
# Load the pickled object stored in all_users_cluster.pkl
# (presumably the user-cluster matrix — TODO confirm its layout).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
UC_matrix = pd.read_pickle('datasets/matrices/all_users_cluster.pkl')

In [6]:
# Sanity check of the loaded ratings; pandas truncates the printed frame to its
# first and last rows and reports the overall shape.
print(ratings)

          UID   MID  rate
0           1  1193     5
1           1   661     3
2           1   914     3
3           1  3408     4
4           1  2355     5
...       ...   ...   ...
1000204  6040  1091     1
1000205  6040  1094     5
1000206  6040   562     5
1000207  6040  1096     4
1000208  6040  1097     4

[1000209 rows x 3 columns]


In [7]:
# Load the movie catalogue: header-less "MID::name::genre" records, latin-1 encoded.
# A separator longer than one character is interpreted by pandas as a regular
# expression and requires the python engine; '::' needs no escaping (the former
# '\::' was an invalid string escape sequence).
# The pipe-delimited genre string is turned into a list of genre names per movie.
movies = pd.read_csv(
    "datasets/ml-1m/movies.dat",
    sep='::',
    engine='python',
    names=['MID', 'name', 'genre'],
    encoding='latin-1',
).assign(genre=lambda frame: frame['genre'].str.split('|'))

In [8]:
# Initialise `all_cluster_items` as an empty frame with `cluster` and `MID` columns.
all_cluster_items = pd.DataFrame(columns=["cluster", "MID"])

In [17]:
# Estimate one rating per (cluster, movie) pair and pivot the result into the
# cluster-items matrix (rows: MID, columns: cluster). A plain-text trace of how
# each estimate was obtained is written, one file per cluster, under
# "reports/cluster movies ratings/".
#
# Estimation order for movie j in cluster i:
#   1. mean of the ratings given to j by the users of cluster i;
#   2. otherwise, mean of the cluster's ratings on movies that share at least
#      one genre with j;
#   3. otherwise, mean of every rating in `ratings` (all users, all movies).

num_clusters = 12  # cluster_0.pkl .. cluster_11.pkl are read below

# Loop-invariant fallback for step 3, computed once instead of per movie.
global_avg_rating = ratings['rate'].mean()

# Long-form (cluster, MID, name, rate) rows. Collected in a plain list and
# rebuilt from scratch on every execution: DataFrame.append no longer exists in
# pandas >= 2.0, row-wise appends are quadratic, and a fresh list keeps this
# cell safe to re-run.
cluster_item_rows = []

# iterate over clusters
for i in range(num_clusters):
    # open the report file of cluster i
    with open("reports/cluster movies ratings/cluster" + str(i) + "_items.txt", "w") as f:
        print('\n\n', file=f)
        print('*'*50, file=f)
        print('\n\n', file=f)
        print("Cluster", i, ":\n", file=f)

        # get the users in cluster i
        # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
        cluster_users = pd.read_pickle("datasets/clusters/cluster_" + str(i) + ".pkl")

        # get the ratings of the users in cluster i
        cluster_ratings = ratings[ratings['UID'].isin(cluster_users['UID'])]

        print("Cluster", i, ":", len(cluster_ratings), "ratings\n", file=f)

        # Aggregate the cluster's ratings per movie once, instead of re-filtering
        # the whole cluster for each of the movies.
        ratings_by_item = cluster_ratings.groupby('MID')['rate']
        item_rating_counts = ratings_by_item.size()
        item_rating_means = ratings_by_item.mean()

        cluster_rows = []

        # iterate over items
        for j, item_name, item_genre in zip(movies['MID'], movies['name'], movies['genre']):
            # number of ratings of item j by users in cluster i
            n_item_ratings = int(item_rating_counts.get(j, 0))
            print("\n***************************\nItem", j, item_name, ":", n_item_ratings, "ratings", file=f)

            # if there are users rated item j before in cluster i
            if n_item_ratings > 0:
                # average rating of item j in cluster i
                avg_rating = item_rating_means.loc[j]
                print("Average rating of item", j, "in cluster", i, ":", avg_rating, file=f)
            else:
                # similar items = movies sharing at least one genre with item j
                shares_genre = movies['genre'].apply(lambda x: len(set(x).intersection(item_genre)) > 0)
                similar_items = movies.loc[shares_genre, 'MID'].values
                # get the ratings of the similar items by users in cluster i
                similar_items_ratings = cluster_ratings[cluster_ratings['MID'].isin(similar_items)]
                # if there are similar items to item j rated by users in cluster i
                if len(similar_items_ratings) > 0:
                    print("Some of similar items to item", j, "in cluster", i, "are:", movies[movies['MID'].isin(similar_items_ratings['MID'])]['name'].values, file=f)
                    # calculate the average rating of the similar items in cluster i
                    avg_rating = similar_items_ratings['rate'].mean()
                    print("Average rating of similar items of item", j, "in cluster", i, ":", avg_rating, file=f)
                else:
                    print("No similar items to item", j, "in cluster", i, file=f)
                    # fall back to the average rating of all items by all users
                    # NOTE(review): the report line below says "in cluster", but the
                    # value is the global mean over `ratings` — confirm the wording.
                    avg_rating = global_avg_rating
                    print("Average rating of all items in cluster", i, ":", avg_rating, file=f)

            cluster_rows.append({'cluster': i, 'MID': j, 'name': item_name, 'rate': avg_rating})

        # Per-cluster frame, kept for the optional export below.
        cluster_items = pd.DataFrame(cluster_rows, columns=["cluster", "MID", "name", "rate"])
        cluster_item_rows.extend(cluster_rows)

        # cluster_items.to_pickle("datasets/clusters/cluster" + str(i) + "_items.pkl")

# Pivot the long-form estimates into the cluster-items matrix. Each (MID, cluster)
# pair is expected to occur exactly once, so 'sum' just carries the value over.
cluster_item_ratings = pd.DataFrame(cluster_item_rows, columns=["cluster", "MID", "name", "rate"])
all_cluster_items = pd.pivot_table(cluster_item_ratings, values='rate', index=['MID'], columns=['cluster'], aggfunc='sum', fill_value=0)
# all_cluster_items.to_pickle("datasets/matrices/all_cluster_items.pkl")
