In [None]:
import os
import codecs
import pandas as pd

# Change dir so the project-level `scripts` package and the relative
# `datasets/...` paths below resolve.
# Guarded: an unconditional chdir climbs one more directory every time this
# cell is re-run, which then breaks the imports and every relative path.
# Only move up when `scripts` is not already visible from the working directory.
if not os.path.isdir('scripts'):
    os.chdir('../')
from scripts.dataset_downloader import initialize_dataset
from scripts.k_means import create_clsuters, clustering_errors

# Name of the dataset to analyse and the folder its files are read from
dataset = 'ml-100k'
dataset_path = os.path.join('datasets', dataset)

In [None]:
# One-time setup: if the dataset hasn't been initialized yet, this call does it.
# NOTE(review): presumably this places the dataset files under `dataset_path`
# and is safe to re-run — confirm in scripts.dataset_downloader.
initialize_dataset(dataset)

In [None]:
# Locations of the three dataset files read below
ratings_path = os.path.join(dataset_path, 'u.data')
items_path = os.path.join(dataset_path, 'u.item')
genres_path = os.path.join(dataset_path, 'u.genre')

# Load dfs
# Paths go straight to pandas with an explicit encoding: pandas opens and
# closes the files itself, so no handle is leaked, and the 'U' open-mode flag
# (removed in Python 3.11) is no longer needed.
ratings_df = pd.read_csv(
    ratings_path,
    sep='\t',
    encoding='UTF-8',
    names=('user_id', 'item_id', 'rating', 'timestamp'),
)
genres_df = pd.read_csv(genres_path, sep='|', names=('title', 'id'))

# Item columns: five leading fields followed by one column per genre title.
# 'nan' and 'link' only label columns that are dropped right after loading.
cols_names = ('id', 'title', 'year', 'nan', 'link') + tuple(genres_df.title.to_list())
items_df = pd.read_csv(
    items_path,
    sep='|',
    encoding='latin-1',
    usecols=list(range(0, 24)),  # keep only the first 24 fields of each row
    names=cols_names,
).drop(columns=['nan', 'link'])

# dataset stats
print(f"Total dataset users: {ratings_df.user_id.nunique()}")
print(f"Total dataset ratings: {len(ratings_df)}")

In [None]:
# Join every rating with its item row, order the result by user, and discard
# the identifier/metadata columns listed in `drop`.
main_df = (
    ratings_df
    .merge(items_df, left_on='item_id', right_on='id')
    .sort_values(by='user_id')
    .drop(columns=['id', 'year', 'unknown', 'title', 'timestamp', 'item_id'])
)
main_df.head()

In [None]:
# Number of clusters passed to the clustering helper below.
k = 30  # total clusters
# `create_clsuters` (spelled this way in scripts.k_means) receives k and the merged frame.
# NOTE(review): the next cell groups the result by 'user_id' and 'cluster', so it
# presumably returns the input rows with a 'cluster' label added — confirm.
clusters_df = create_clsuters(k, main_df)
clusters_df.head()

In [None]:
# Count the rows in each (user_id, cluster) pair and expose the count as a
# regular column named 'total_ratings'.
clusters = (
    clusters_df
    .groupby(['user_id', 'cluster'])
    .size()
    .reset_index(name='total_ratings')
)
clusters.head()

In [None]:
# OPTIONAL - Elbow method for choosing k
# Candidate k values: start at 2, stop before 10, in steps of 5. The stride
# keeps this cheap, since the error is not needed for every single k.
# NOTE(review): as written this range yields only k=2 and k=7 — widen the
# upper bound if more points on the curve are wanted.
possible_k_values = range(2, 10, 5)

# One clustering error per candidate. The comprehension variable is local to
# the comprehension, so the notebook-level `k` is left untouched.
errors_per_k = [
    clustering_errors(candidate_k, main_df)
    for candidate_k in possible_k_values
]