In [None]:
import os
import codecs
import pandas as pd
import numpy as np
import warnings
import collections

# change dir for custom imports
# Guarded: an unconditional chdir would climb one more directory every time
# this cell is re-run, which breaks the project imports and the relative
# dataset/output paths used further down.
if not os.path.isdir('scripts'):
    os.chdir('../')
from scripts.dataset_downloader import initialize_dataset
from scripts.k_means import create_clsuters, clustering_errors
from scripts.helpers import get_most_rated_movies


# Dataset name and its folder, relative to the working directory set above
dataset = 'ml-100k'
dataset_path = os.path.join('datasets', dataset)

In [None]:
# If dataset hasn't been previously initialized, it can be done with this function
# NOTE(review): presumably this fetches/prepares the files under `datasets/<dataset>`
# that the loading cell reads - confirm in scripts/dataset_downloader.
initialize_dataset(dataset)

In [None]:
ratings_path = os.path.join(dataset_path, 'u.data')
items_path = os.path.join(dataset_path, 'u.item')
genres_path = os.path.join(dataset_path, 'u.genre')
users_path = os.path.join(dataset_path, 'u.user')

# load data
# Paths and encodings are handed straight to pandas, which opens and closes
# the files itself. The previous codecs.open(..., 'rU', ...) handles were never
# closed, and the 'U' mode flag is rejected by Python 3.11+.
ratings_df = pd.read_csv(
    ratings_path,
    sep='\t',
    names=('user_id', 'item_id', 'rating', 'timestamp'),
    encoding='utf-8',
)
genres_df = pd.read_csv(genres_path, sep='|', names=('title', 'id'))

# Five leading item columns followed by one flag column per genre title
cols_names = ('id', 'title', 'year', 'nan', 'link') + tuple(genres_df.title.to_list())
items_df = pd.read_csv(
    items_path,
    sep='|',
    usecols=list(range(0, 24)),
    names=cols_names,
    encoding='latin-1',
).drop(columns=['nan', 'link'])

# Only the first four columns of the users file are kept
users_df = pd.read_csv(
    users_path,
    sep='|',
    usecols=list(range(0, 4)),
    names=('user_id', 'age', 'gender', 'occupation'),
    encoding='latin-1',
)

# dataset stats
print(f"Total dataset users: {ratings_df.user_id.nunique()}")
print(f"Total dataset ratings: {len(ratings_df)}")

In [None]:
# Number of ratings per user: all, positive (rating > 3) and negative (rating <= 3)
user_total_ratings = ratings_df.groupby(by="user_id")["rating"].count()
user_positive_ratings = ratings_df[ratings_df['rating'] > 3].groupby('user_id')['rating'].count().reset_index()
user_negative_ratings = ratings_df[ratings_df['rating'] <= 3].groupby('user_id')['rating'].count().reset_index()

# Update main df
# Counts are aligned on user_id rather than by row position, so the result does
# not depend on users_df being sorted like the groupby output or on every user
# having at least one rating. Plain column assignment also keeps this cell
# idempotent: re-running it overwrites the columns instead of merging duplicates.
users_df["ratings"] = users_df["user_id"].map(user_total_ratings)
users_df["positive_ratings"] = users_df["user_id"].map(
    user_positive_ratings.set_index("user_id")["rating"]
)
users_df["negative_ratings"] = users_df["user_id"].map(
    user_negative_ratings.set_index("user_id")["rating"]
)

# Clean none values (users without ratings of a given kind get a count of 0)
users_df = users_df.fillna(0)

In [None]:
# Preview the first rows of the users frame after the rating-count columns were added
users_df.head()

In [None]:
import gower

# Pairwise Gower distances between users, computed on every column except the identifier
clustering_features = users_df.drop(columns=['user_id'])
distance_matrix = gower.gower_matrix(clustering_features)

In [None]:
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram

# Hierarchical (Ward) clustering of the users, then a flat cut of the tree at height `t`.
# NOTE(review): `distance_matrix` is handed to linkage() as a 2-D array. SciPy documents
# 2-D input as observation vectors and expects precomputed distances in condensed 1-D
# form, so the ClusterWarning silenced below may be reporting exactly that. Confirm this
# is intended - the thresholds used in this notebook are tuned to the current behaviour.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=shc.ClusterWarning)
    # Compute linkage using the distance matrix
    linkage = shc.linkage(
        distance_matrix,
        method='ward'
    )

# Use fcluster to get the cluster labels
# `t` is the threshold to use to cut the dendrogram - higher `t` means fewer clusters / more data points within individual clusters
t = 8
clusters = shc.fcluster(linkage, t, criterion='distance')

# get unique cluster labels
unique_labels = np.unique(clusters)

# Adding the results to a new column in the dataframe
# (one label per row of users_df, in the same row order as the distance matrix)
users_df["cluster_shc"] = clusters

print(f'Generated {len(unique_labels)} clusters.')

In [None]:
# Number of users assigned to each hierarchical cluster label
counter = collections.Counter(users_df["cluster_shc"].tolist())
counter

## Visualize clusters (theoretical)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Create a t-SNE object
# random_state is pinned because t-SNE is stochastic: without it the layout
# (and therefore both scatter plots) changes on every run.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the version this project pins.
# NOTE(review): metric="euclidean" treats each row of the distance matrix as a
# feature vector; metric="precomputed" would use the distances directly -
# confirm which one is intended.
tsne = TSNE(
    n_components=2,
    perplexity=30, # [5, 50] default is 30
    learning_rate=200, # [10.0, 1000.0] , def=200
    n_iter=1000, # >250, def=1000
    metric="euclidean",
    random_state=42,
)

# Perform t-SNE on the distance matrix
tsne_data = tsne.fit_transform(distance_matrix)

# Plot the t-SNE data using a scatter plot
plt.scatter(tsne_data[:, 0], tsne_data[:, 1])
plt.title("t-SNE projection of users")
plt.xlabel("t-SNE dimension 1")
plt.ylabel("t-SNE dimension 2")
plt.show()

In [None]:
# Append the cluster label of every user as a third column next to its 2-D embedding
tsne_data_clusters = np.column_stack((tsne_data, clusters))

# Same scatter as before, now coloured by cluster label
tsne_x, tsne_y, tsne_label = tsne_data_clusters.T
plt.scatter(tsne_x, tsne_y, c=tsne_label, cmap='Spectral')
plt.show()

In [None]:
# Second Gower distance matrix, computed from the current users_df
# (every column except the identifier)
group_features = users_df.drop(columns=['user_id'])
distance_matrix_2 = gower.gower_matrix(group_features)

In [None]:
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram

# Second hierarchical (Ward) clustering, this time on `distance_matrix_2`; the
# resulting `linkage` replaces the one computed earlier and is reused by the sweep below.
# NOTE(review): `distance_matrix_2` is handed to linkage() as a 2-D array. SciPy documents
# 2-D input as observation vectors and expects precomputed distances in condensed 1-D
# form, so the ClusterWarning silenced below may be reporting exactly that. Confirm this
# is intended - the thresholds used in this notebook are tuned to the current behaviour.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=shc.ClusterWarning)
    # Compute linkage using the distance matrix
    linkage = shc.linkage(
        distance_matrix_2,
        method='ward'
    )

# Use fcluster to get the cluster labels
# `t` is the threshold to use to cut the dendrogram - higher `t` means fewer clusters / more data points within individual clusters
t = 4.2
clusters = shc.fcluster(linkage, t, criterion='distance')

# get unique cluster labels
unique_labels = np.unique(clusters)

# Adding the results to a new column in the dataframe
# (one label per row of users_df, in the same row order as the distance matrix)
users_df["group_clusters"] = clusters

print(f'Generated {len(unique_labels)} clusters.')
# Number of users per group label
counter = collections.Counter(users_df.group_clusters.to_list())
print(counter)

In [None]:
import numpy as np
from models.lightgcn.train_clusters_script import train_on_groups

# Baseline serendipity calculated from a normal train in a previous experiment on the dataset
baseline_serendipity = pd.read_csv('./output/exp-3/baseline_serendipity.csv')

# Make sure the output folder exists before any training starts, so a missing
# directory cannot throw away the result of a finished training run.
group_iterations_dir = './output/exp-3/group_iterations/'
os.makedirs(group_iterations_dir, exist_ok=True)

# Sweep the dendrogram cut height over 20 evenly spaced values between 4 and 6
iterations = np.linspace(4, 6, 20)
for i in iterations:
    clusters = shc.fcluster(linkage, i, criterion='distance')
    # get unique cluster labels
    unique_labels = np.unique(clusters)

    # Total groups obtained (groups of clusters)
    print(len(unique_labels))

    # Save clusters in df
    users_df["group_clusters"] = clusters

    # Train model and check serendipity per group
    new_user_serendipity = train_on_groups(users_df)

    # Compare against the baseline per user: after the merge the `_x` column is
    # the baseline value and the `_y` column is the value from this iteration.
    serendipity_df = baseline_serendipity.merge(new_user_serendipity, on='userID')
    serendipity_df['comparison'] = (
        serendipity_df['user_serendipity_y'] > serendipity_df['user_serendipity_x']
    ).astype(int)
    total_users = len(serendipity_df)
    total_hits = int(serendipity_df['comparison'].sum())

    # Percentage of users whose serendipity improved over the baseline.
    # An empty merge (no shared userID) counts as 0% instead of dividing by zero.
    threshold = total_hits / total_users * 100 if total_users else 0.0
    print(threshold)
    serendipity_df.to_csv(group_iterations_dir + str(i) + '.csv', index=False)

    # Report every cut height whose improvement rate exceeds 80%.
    # NOTE(review): the original comment spoke of stopping at 90% of groups, but
    # the code only reports at > 80 and keeps iterating - confirm which is intended.
    if threshold > 80:
        print("Threshold achieved at", i)

In [None]:
# Display the users frame; `group_clusters` holds the labels written by the last sweep iteration
users_df