This notebook is used to add synthetic data to datasets.\
The currently implemented method adds random ratings for a target user in a target cluster.

We can take target users from a certain cluster, based on the result of the previous notebook (clustering.ipynb), and use this code to generate random ratings for a few users inside that cluster. This simulates an opt-out scenario (or any other data-manipulation scenario).

In [None]:
import os

# change dir for custom imports
# The later cells also read 'datasets/...' and 'output/...' relative to this
# directory — presumably the project root; TODO confirm.
# NOTE(review): the path is relative to the current directory, so re-running
# this cell without restarting the kernel moves up one more level each time.
os.chdir('../')

In [None]:
import pandas as pd

dataset_name = 'ml-latest-small'

# Raw inputs: ratings and movie metadata come from the dataset folder, the
# per-rating cluster table comes from the output folder.
ratings = pd.read_csv('datasets/' + dataset_name + '/ratings.csv')
movies = pd.read_csv('datasets/' + dataset_name + '/movies.csv')
clusters = pd.read_csv('output/' + dataset_name + '/clusters.csv')

# Attach movie metadata to every rating, order the rows by user, then keep only
# the ratings that also appear in the cluster table (inner join on the full
# rating key).
ratings_detailed = (
    ratings
    .merge(movies, how='inner', on='movieId')
    .sort_values(by='userId')
    .merge(clusters, how='inner', on=['userId', 'movieId', 'rating', 'timestamp'])
)

# Summary figures: number of distinct rated movies and the bounds of the
# rating values observed in the data.
total_movies = len(set(ratings.movieId))
observed_rating_values = set(ratings.rating)
max_ratings = max(observed_rating_values)
min_ratings = min(observed_rating_values)

In [None]:
# Display the merged ratings / movies / clusters frame to sanity-check the joins.
ratings_detailed

In [None]:
import numpy as np

# add random ratings to first user in cluster 0
target_user = 95
N = 400  # number of random draws; fewer rows are added when draws are rejected
SEED = 42  # fixed so that Restart & Run All regenerates the same synthetic rows
SYNTHETIC_TIMESTAMP = 1212603770  # constant timestamp stamped on every synthetic row

rng = np.random.default_rng(SEED)

# Draw movie ids from the ids that actually occur in the data. The ids are not
# guaranteed to form the contiguous range 1..total_movies, so sampling that
# range can yield ids that match no movie and never reaches ids above the count.
candidate_movie_ids = ratings['movieId'].unique()

# Cover the whole observed rating scale in half-point steps, both bounds
# included (np.arange excludes its stop value, hence the quarter-step padding).
candidate_ratings = np.arange(min_ratings, max_ratings + 0.25, 0.5)

# One row per draw; the scalar columns are broadcast to the length of the draws.
drawn_ratings = pd.DataFrame({
    'userId': target_user,
    'movieId': rng.choice(candidate_movie_ids, size=N),
    'rating': rng.choice(candidate_ratings, size=N),
    'timestamp': SYNTHETIC_TIMESTAMP,
})

# Reject draws for movies the user has already rated, and keep only the first
# draw of each movie, so the user never ends up with two ratings for one movie.
rated_by_target = ratings.loc[ratings['userId'] == target_user, 'movieId']
synthetic_ratings = (
    drawn_ratings[~drawn_ratings['movieId'].isin(rated_by_target)]
    .drop_duplicates(subset='movieId')
)

# Add all new rows with a single concat: DataFrame.append was removed in
# pandas 2.0, and appending row by row re-copies the whole frame every time.
ratings = pd.concat([ratings, synthetic_ratings], ignore_index=True)

In [None]:
# Row count of the ratings frame after the synthetic rows have been added.
len(ratings)

In [None]:
# Destination of the modified ratings table.
output_path = 'datasets/' + dataset_name + '/modified/ratings_random_experiment.csv'

# to_csv does not create missing parent directories, so make sure the
# 'modified' folder exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False keeps the positional index out of the file.
ratings.to_csv(output_path, index=False)