This notebook is used to add synthetic data to datasets.\
The currently implemented methods are reversing the rating profiles of target users and adding random ratings to target users in target clusters.

We can take target users from a certain cluster based on the result of the previous notebook (clustering.ipynb) and use this code to generate random ratings for a few users inside a given cluster. This simulates an opt-out scenario (or any other data manipulation scenario).

In [1]:
import os

# change dir for custom imports; guarded so that re-running this cell does not
# climb one more directory level each time (the project root is recognised by
# the `helpers` package that later cells import from)
if not os.path.isdir('helpers'):
    os.chdir('../')

In [2]:
import pandas as pd

dataset_name = 'ml-latest-small'

# Load the cleaned ratings, the movie metadata and the cluster assignments
# (paths are relative to the project root set by the first cell).
ratings = pd.read_csv('datasets/' + dataset_name + '/clean/ratings.csv')
movies = pd.read_csv('datasets/' + dataset_name + '/movies.csv', encoding='latin-1')
clusters = pd.read_csv('output/' + dataset_name + '/clusters.csv')

# Attach the movie columns to every rating, then bring in the remaining
# columns of `clusters` (e.g. `cluster`) by matching on the full rating key.
ratings_detailed = pd.merge(ratings, movies, how='inner', on='movieId').sort_values(by='userId')
ratings_detailed = pd.merge(ratings_detailed, clusters, how='inner', on=['userId', 'movieId', 'rating', 'timestamp'])

# Summary statistics computed with pandas directly instead of round-tripping
# through Python lists/sets: nunique()/max()/min() ignore NaN and do not raise
# on an empty column, whereas set()/max() on a list count every NaN separately
# and give order-dependent results.
total_movies = ratings.movieId.nunique()
max_rating = ratings.rating.max()
min_rating = ratings.rating.min()
total_clusters = clusters.cluster.nunique()
total_ratings = len(ratings)

print(
    "total_movies:\t%f" % total_movies,
    "total_ratings:\t%f" % total_ratings,
    "max_rating:\t%f" % max_rating,
    "min_rating:\t%f" % min_rating,
    "clusters\t%f" % total_clusters, sep='\n'
)

total_movies:	9633.000000
total_ratings:	93812.000000
max_rating:	5.000000
min_rating:	0.500000
clusters	50.000000


# Reverse profiles
This code allows us to reverse user profiles in datasets for one or more target clusters. The idea is equivalent to opt-out, where a user masks their identity behind false feedback that is not similar to their initial ratings.

In [None]:
from helpers.dataset_helpers import flip_pofile_ratings

# --- experiment parameters ---
# clusters whose user profiles are reversed
target_clusters = [20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 31]
# number of items to add per genre, for every selected user
N = 150
# number of users taken into account in each target cluster
users_thresh = 15

flipped_ratings = flip_pofile_ratings(ratings_detailed, movies, target_clusters, N, users_thresh)
# keep one row per (user, movie) pair so the saved dataset has no synthetic duplicates
new_ratings = flipped_ratings.drop_duplicates(subset=['userId', 'movieId'])
new_ratings.to_csv('datasets/%s/modified/ratings_genre_experiment.csv' % dataset_name, index=False)

# Random ratings
This code allows us to add random ratings for users in datasets.

In [3]:
from helpers.dataset_helpers import add_random_ratings

# --- experiment parameters ---
# clusters whose users receive random ratings
target_clusters = [4,5,6,7,8,9,10,20,30,35]
# number of random ratings to add for each target user
N = 450
# number of users taken into account in each target cluster
users_thresh = 15

augmented_ratings = add_random_ratings(ratings, clusters, target_clusters, N, users_thresh)
# keep one row per (user, movie) pair so the saved dataset has no synthetic duplicates
new_ratings = augmented_ratings.drop_duplicates(subset=['userId', 'movieId'])
new_ratings.to_csv('datasets/%s/modified/ratings_random_experiment.csv' % dataset_name, index=False)

In [4]:
# display the ratings frame written by the previous cell (original + synthetic rows)
new_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
158659,584,8357,4.5,1212603770
158660,584,5727,3.0,1212603770
158661,584,6158,2.0,1212603770
158662,584,3519,1.5,1212603770
