In [None]:
import os
import codecs
import pandas as pd
import numpy as np
import warnings
import collections

# Move to the project root so the relative dataset/output paths and the custom
# `models` imports used by later cells resolve.
# The root is resolved only once per kernel session: a bare `os.chdir('../')`
# would climb one directory higher every time this cell is re-run.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('../')
os.chdir(PROJECT_ROOT)

dataset = 'epinions'
dataset_path = os.path.join('datasets', dataset, 'v1')

## Load Data (If previously computed, skip this and load the file)

In [None]:
# Load trust data: one tab-separated record per line of `user_rating.txt`.
# The context manager closes the handle even if reading/parsing fails part-way.
with open(os.path.join(dataset_path, 'user_rating.txt'), 'r') as f:
    raw_data = [line.strip().split('\t') for line in f]
trust_data = np.array(raw_data)

# Name the positional columns
trust_raw_df = pd.DataFrame(trust_data).rename(columns={
    0: 'truster',
    1: 'trusted',
    2: 'value',
    3: 'date'
})

# Fix datatypes (every field is read as a string)
trust_raw_df = trust_raw_df.astype({'truster':'int64', 'trusted':'int64', 'value':'int'})

# Select only positive trust to build the communities
trust_raw_df = trust_raw_df[trust_raw_df['value'] == 1]

In [None]:
# Read the data from the original epinions txt file.
# Reading stops once the row with index 6,000,000 has been stored, i.e. at most
# 6,000,001 rows are kept.
MAX_RATING_ROWS = 6_000_001

# The context manager closes the handle even if reading/parsing fails part-way.
with open(os.path.join(dataset_path, 'rating.txt'), 'r') as f:
    data = []
    for line in f:
        data.append(line.strip().split('\t'))
        if len(data) >= MAX_RATING_ROWS:
            break
rating_data = np.array(data)

# Name the first four positional columns; any further columns keep their
# integer labels
ratings_df = pd.DataFrame(rating_data).rename(columns={
    0: 'itemID',
    1: 'userID',
    2: 'rating',
    3: 'status'
})
# Fix datatypes (every field is read as a string)
ratings_df = ratings_df.astype({'itemID':'int64', 'userID':'int64', 'status':'int', 'rating':'int'})

# Dataset stats
print(f"Total dataset users: {ratings_df['userID'].nunique()}")
print(f"Total dataset ratings: {len(ratings_df)}")

# Save the original dataset (to match the users selected with the social aspect in the upcoming experiment)
# ratings_df.to_csv('./datasets/epinions/v1/ratings_raw.csv', index=False)

In [None]:
# Keep the dense part of the rating data:
#   1. items rated more than I times,
#   2. then users with more than U ratings on those remaining items.
I = 150
U = 100

# Items filter
grouped_item_ratings = ratings_df.groupby("itemID")["rating"].count()
selected_items = grouped_item_ratings.loc[grouped_item_ratings.gt(I)].index.tolist()
item_is_selected = ratings_df["itemID"].isin(selected_items)
df = ratings_df.loc[item_is_selected]

# Users filter (counts are taken after the items filter)
grouped_user_ratings = df.groupby("userID")["rating"].count()
selected_users = grouped_user_ratings.loc[grouped_user_ratings.gt(U)].index.tolist()
user_is_selected = df["userID"].isin(selected_users)
df = df.loc[user_is_selected]

In [None]:
# Users that survived the rating filters
users_list = list(set(df['userID'].to_list()))

# Keep a trust edge only when both of its endpoints are selected users
truster_is_selected = trust_raw_df['truster'].isin(users_list)
trusted_is_selected = trust_raw_df['trusted'].isin(users_list)
df_trust = trust_raw_df[truster_is_selected & trusted_is_selected]

# Inner-join the unique selected users with the edges they issue as truster
users_df = df[['userID']].drop_duplicates()
df_trust = pd.merge(users_df, df_trust, left_on='userID', right_on='truster')
df_trust.head()

In [None]:
# Build the adjacency matrix used for community detection: rows are trusters,
# columns are trusted users, cells count the trust edges between them.
adj_matrix = pd.crosstab(index=df_trust['truster'], columns=df_trust['trusted'])

# Make it square: every user appearing on either side gets both a row and a
# column, with 0 where no edge exists.
idx = adj_matrix.columns.union(adj_matrix.index)
adj_matrix = adj_matrix.reindex(index=idx, columns=idx, fill_value=0)

In [None]:
from communities.algorithms import girvan_newman

# Run Girvan-Newman community detection on the trust adjacency matrix (passed
# as a plain numpy array). Only the first element of the returned pair is kept.
# NOTE(review): adj_matrix is built from directed truster -> trusted edges, so
# it is not necessarily symmetric — confirm the algorithm accepts that.
# NOTE(review): presumably `communities` is a list of node-index collections
# referring to positions in `adj_matrix`; verify against the package docs.
communities, _ = girvan_newman(adj_matrix.to_numpy())

In [None]:
# Size of the third entry of `communities` (needs at least three entries)
len(communities[2])

In [None]:
# Reload the raw ratings from disk. This is the path that the commented-out
# `ratings_df.to_csv(...)` line in the rating-loading cell writes to, so the
# file must have been produced by a previous run.
ratings_df = pd.read_csv('./datasets/epinions/v1/ratings_raw.csv')

In [None]:
# Preview the first rows of the raw ratings
ratings_df.head()

### Apply dataset filtering and create the features for Clustering (If pre-computed, just load the file)

In [None]:
# Keep items with more than I ratings, then users with more than U ratings on
# those remaining items.
I = 150
U = 100

# Items filter
grouped_item_ratings = ratings_df.groupby(by="itemID")["rating"].count()
selected_items = grouped_item_ratings[grouped_item_ratings > I].index.tolist()
df = ratings_df[ratings_df['itemID'].isin(selected_items)]

# Users filter
grouped_user_ratings = df.groupby(by="userID")["rating"].count()
selected_users = grouped_user_ratings[grouped_user_ratings > U].index.tolist()
df = df[df['userID'].isin(selected_users)]

# Reset numbers to avoid long values and keep only the three needed columns:
# users and items are re-coded as consecutive integers in order of first
# appearance.
# The result is built as a fresh frame instead of adding/dropping columns on
# the filtered slice. Dropping the unused columns by label (`4`, `5`, ...) is
# unnecessary and raises a KeyError when `ratings_df` was reloaded from CSV,
# because the labels are then the strings '4', '5', ...
df = pd.DataFrame({
    'userID': pd.factorize(df['userID'])[0],
    'itemID': pd.factorize(df['itemID'])[0],
    'rating': df['rating'].to_numpy(),
})

# Dataset stats
print(f"Total dataset users: {df['userID'].nunique()}")
print(f"Total dataset ratings: {len(df)}")

# Save the dataset for training in the second experiment
# df.to_csv('./datasets/epinions/v1/ratings.csv', index=False)

In [None]:
# Load the filtered, re-indexed ratings from disk. This is the path that the
# commented-out `df.to_csv(...)` line in the filtering cell writes to.
df = pd.read_csv('./datasets/epinions/v1/ratings.csv')

In [None]:
# Preview the first rows of the filtered ratings dataframe
df.head()

In [None]:
# Rating threshold for +ve and -ve ratings features:
# ratings strictly above r_th count as positive, all others as negative
r_th = 4

# Get all the unique users (in order of first appearance in df)
users_df = df[['userID']].drop_duplicates()

# Create the possible features (per-user rating counts)
user_total_ratings = df.groupby(by="userID")["rating"].count()
user_positive_ratings = df[df['rating'] > r_th].groupby('userID')['rating'].count().reset_index()
user_negative_ratings = df[df['rating'] <= r_th].groupby('userID')['rating'].count().reset_index()

# Update main df
# Totals are looked up by userID rather than assigned positionally: the groupby
# result is sorted by userID while users_df is in first-appearance order, so a
# positional assignment is only correct when those two orders coincide.
users_df["ratings"] = users_df["userID"].map(user_total_ratings)
users_df = pd.merge(users_df, user_positive_ratings, on=['userID'], how='left').rename(columns={'rating':'positive_ratings'})
users_df = pd.merge(users_df, user_negative_ratings, on=['userID'], how='left').rename(columns={'rating':'negative_ratings'})

# Users without any positive (or negative) rating get NaN from the left merges;
# count those as 0
users_df = users_df.fillna(0)

In [None]:
import gower

# Feature columns the user clustering is based on.
# They are listed explicitly instead of "everything except userID": later cells
# add label columns (`cluster_shc`, `group_clusters`) to users_df, and
# re-running this cell would otherwise fold those labels into the distances.
feature_columns = ['ratings', 'positive_ratings', 'negative_ratings']

# Get the gower distance matrix
distance_matrix = gower.gower_matrix(users_df[feature_columns])

In [None]:
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram

# Ward linkage over `distance_matrix`, with scipy's ClusterWarning silenced
# for the duration of the call.
# NOTE(review): the square matrix is passed as-is, not in condensed form.
# Presumably scipy then treats each row as an observation vector rather than
# as precomputed distances (which would be what the silenced warning is
# about) — confirm this is intended before touching `t`, since the cut
# distances are tuned to the current behaviour.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=shc.ClusterWarning)
    linkage = shc.linkage(distance_matrix, method='ward')

# Cut the dendrogram at distance `t` to obtain flat cluster labels.
# A higher `t` gives fewer clusters with more points in each.
t = 6
clusters = shc.fcluster(linkage, t=t, criterion='distance')

# Distinct labels produced by the cut
unique_labels = np.unique(clusters)

# Store each user's label next to its features
users_df["cluster_shc"] = clusters

print(f'Generated {len(unique_labels)} clusters.')

In [None]:
# Number of users assigned to each cluster label
cluster_labels = users_df['cluster_shc'].tolist()
counter = collections.Counter(cluster_labels)
print(counter)

## Visualize clusters (theoretical - Optional)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Create a t-SNE object
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = TSNE(
    n_components=2,
    perplexity=30, # [5, 50] default is 30
    learning_rate=200, # [10.0, 1000.0] , def=200
    n_iter=1000, # >250, def=1000
    metric="euclidean",
    random_state=42 # fixed seed so the embedding (and the plots) are reproducible
)

# Perform t-SNE on the distance matrix (each row is used as a feature vector,
# since the metric is euclidean rather than precomputed)
tsne_data = tsne.fit_transform(distance_matrix)

# Plot the t-SNE data using a scatter plot
plt.scatter(tsne_data[:, 0], tsne_data[:, 1])
plt.show()

In [None]:
# Add the cluster labels to the t-SNE data.
# Labels are read from the stored `cluster_shc` column rather than from the
# `clusters` variable: later cells reassign `clusters` with other cuts of the
# dendrogram, which would silently recolour this plot when the cell is re-run.
tsne_data_clusters = np.column_stack((tsne_data, users_df["cluster_shc"].to_numpy()))

# Plot the t-SNE data using a scatter plot, coloured by cluster label
plt.scatter(tsne_data_clusters[:, 0], tsne_data_clusters[:, 1], c=tsne_data_clusters[:, 2], cmap='Spectral')
plt.show()

# Create groups of the user clusters formed in the previous step

In [None]:
# Columns used to group the user clusters: the rating-count features plus the
# cluster label from the previous step.
# They are listed explicitly instead of "everything except userID": the next
# cell adds `group_clusters` to users_df, and re-running this cell would
# otherwise fold that label into the distances as well.
group_feature_columns = ['ratings', 'positive_ratings', 'negative_ratings', 'cluster_shc']

# Get the gower distance matrix
distance_matrix_2 = gower.gower_matrix(users_df[group_feature_columns])

In [None]:
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram

# Ward linkage over `distance_matrix_2`, with scipy's ClusterWarning silenced
# for the duration of the call. The resulting `linkage` is also the one the
# iterations experiment further down cuts at several distances.
# NOTE(review): the square matrix is passed as-is, not in condensed form —
# confirm that treating its rows as observation vectors is intended.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=shc.ClusterWarning)
    linkage = shc.linkage(distance_matrix_2, method='ward')

# Cut the dendrogram at distance `t` to obtain flat group labels.
# A higher `t` gives fewer groups with more points in each.
t = 4
clusters = shc.fcluster(linkage, t=t, criterion='distance')

# Distinct labels produced by the cut
unique_labels = np.unique(clusters)

# Store each user's group label next to its features
users_df["group_clusters"] = clusters

print(f'Generated {len(unique_labels)} clusters.')

# Number of users per group label
group_labels = users_df['group_clusters'].tolist()
counter = collections.Counter(group_labels)
print(counter)

In [None]:
# Save the clusters created
clusters_csv_path = './output/exp-3-epinions/clusters/clusters.csv'
# to_csv does not create missing parent directories, so make sure they exist
os.makedirs(os.path.dirname(clusters_csv_path), exist_ok=True)
users_df.to_csv(clusters_csv_path, index=False)

# Clusters-Serendipity iterations experiment

In [None]:
import numpy as np
import scipy.cluster.hierarchy as shc
from models.lightgcn.train_clusters_script_epinions import train_on_groups

# Baseline serendipity calculated from a normal train in a previous experiment on the dataset
baseline_serendipity = pd.read_csv('./output/exp-3-epinions/baseline_serendipity.csv')

# One CSV per cut distance is written here; to_csv does not create missing
# directories, so make sure the folder exists before the (long) loop starts.
iterations_dir = './output/exp-3-epinions/group_iterations_500k/'
os.makedirs(iterations_dir, exist_ok=True)

# Cut distances to try on the existing `linkage`
iterations = np.linspace(4, 5, 2)
for i in iterations:
    clusters = shc.fcluster(linkage, i, criterion='distance')
    # get unique cluster labels
    unique_labels = np.unique(clusters)

    # Total groups obtained (groups of clusters)
    print(len(unique_labels))

    # Save clusters in df
    users_df["group_clusters"] = clusters

    # Train model and check serendipity per group
    # NOTE(review): presumably returns one row per user with `userID` and
    # `user_serendipity` columns (inferred from the merge below) — confirm.
    new_user_serendipity = train_on_groups(users_df)

    # Compare against the baseline per user: after the merge, `_x` is the
    # baseline value and `_y` the value from this run. `comparison` is 1 when
    # the user's serendipity increased, else 0.
    serendipity_df = baseline_serendipity.merge(new_user_serendipity, on='userID')
    serendipity_df['comparison'] = (
        serendipity_df['user_serendipity_y'] > serendipity_df['user_serendipity_x']
    ).astype(int)
    total_users = len(serendipity_df)
    total_hits = int(serendipity_df['comparison'].sum())

    # Percentage of users whose serendipity improved; 0.0 when the merge
    # matched no users (avoids a division by zero)
    threshold = total_hits / total_users * 100 if total_users else 0.0
    print(threshold)
    serendipity_df.to_csv(iterations_dir + str(i) + '.csv', index=False)

    # Report cut distances where more than 80% of users improved; the loop
    # still evaluates the remaining cut distances.
    if threshold > 80:
        print("Threshold achieved at", i)

In [None]:
# Display users_df; `group_clusters` holds the labels from the last cut
# distance tried in the loop above
users_df