In [1]:
import numpy as np
import pandas as pd
from scipy.stats import entropy


In [2]:
dataset_users = pd.read_csv('./ratings.csv', header=None)
dataset_features = pd.read_csv('./items.csv',header=None)

In [3]:
# Set header names
dataset_users.columns = ['user_id', 'movie_id', 'rating']
dataset_features.columns = ['movie_id', 'feature_id', 'feature_value']

In [4]:
# Merge the two datasets on movie_id
dataset_merged = pd.merge(dataset_users, dataset_features, on='movie_id')

unique_features = dataset_merged['feature_id'].unique()

In [5]:
print(unique_features)

['Crime' 'Drama' 'Mystery' 'Thriller' 'women_writer_director' 'en'
 'Comedy' 'de' 'non-en' 'Romance' 'it' 'Action' 'sci_fi' 'Horror'
 'Adventure' 'Family' 'ja' 'mi' 'zh' 'fr' 'Animation' 'Fantasy' 'ru'
 'History' 'Music' 'War' 'Western' 'es' 'da' 'nl' 'yi' 'Documentary' 'fi'
 'mn' 'ar' 'la' 'xx' 'ko' 'eo' 'Foreign' 'tr' 'qu' 'fa' 'sq' 'sv' 'is'
 'hi' 'th' 'af' 'el' 'zu' 'cs' 'sk' 'pl' 'pt' 'TV' 'cy' 'ln' 'ur' 'ga'
 'he' 'hu' 'sc' 'nv' 'lb' 'no' 'sw' 'bs' 'sr' 'cn' 'bg' 'tl' 'ka' 'ro'
 'pa' 'sh' 'id' 'gd' 'lt' 'mt' 'sl' 'hr' 'si' 'km' 'sa' 'ku' 'bo' 'gl'
 'ne' 'am' 'uk' 'et' 'co' 'vi' 'bn' 'kk' 'xh' 'eu' 'ta' 'ca' 'Adult' 'mr'
 'ms' 'tt' 'kw' 'te' 'gn' 'ps' 'so' 'se' 'wo']


In [6]:
# Initialize an empty list to store the results
results = []
no_entropy = []

for user in dataset_merged['user_id'].unique():
    user_data = dataset_merged[dataset_merged['user_id'] == user]
    count_movies = np.zeros(len(unique_features))
    
    for idx, feature in enumerate(unique_features):
        count_movies_with_feature = len(user_data[user_data['feature_id'] == feature])
        count_movies[:count_movies_with_feature] = 1
        count_movies[:len(count_movies)+1] = 1
        
        entropy_val = entropy(count_movies)
        results.append([user, feature, entropy_val])

In [7]:
# Create a new DataFrame from the results list
results_df = pd.DataFrame(results, columns=['user_id', 'feature_id', 'entropy'])

print(results_df.head())


   user_id             feature_id  entropy
0   158052                  Crime  4.70953
1   158052                  Drama  4.70953
2   158052                Mystery  4.70953
3   158052               Thriller  4.70953
4   158052  women_writer_director  4.70953


In [8]:
# Save the DataFrame to a CSV file
results_df.to_csv('compat.csv', index=False, header=False)