In [1]:
import os
import codecs
import pandas as pd

# Change dir for custom imports: step up one level so that the `scripts`
# package (and, later, the relative `datasets/` folder) resolve from the
# working directory.
# The guard makes the cell idempotent: once `scripts/` is visible from the
# current directory, re-running the cell no longer climbs a further level.
# NOTE(review): assumes the notebook's own folder has no `scripts/` directory.
if not os.path.isdir('scripts'):
    os.chdir('../')
from scripts.dataset_downloader import initialize_dataset

In [2]:
# Identifier of the dataset; it doubles as the folder name under `datasets/`.
dataset = 'ml-100k'
# Relative path, so it depends on the current working directory.
dataset_path = os.path.join('datasets', dataset)
# Project helper from scripts.dataset_downloader — presumably makes the dataset
# files available under `dataset_path` (TODO confirm against the helper).
initialize_dataset(dataset)

In [3]:
# Locations of the three files read from the dataset folder.
ratings_path = os.path.join(dataset_path, 'u.data')
items_path = os.path.join(dataset_path, 'u.item')
genres_path = os.path.join(dataset_path, 'u.genre')

# Load dfs
# The files are opened with the built-in `open` inside `with` blocks so the
# handles are closed as soon as pandas has parsed them. Text mode already
# normalises newlines, so the legacy 'U' flag (rejected by Python 3.11+) is
# not needed.
with open(ratings_path, 'r', encoding='UTF-8') as ratings_file:
    ratings_df = pd.read_csv(ratings_file, sep='\t', names=('user_id', 'item_id', 'rating', 'timestamp'))

# Genre names, in file order, become the trailing column names of the items frame.
genres_df = pd.read_csv(genres_path, sep='|', names=('name', 'id'))
# Bracket access avoids any clash between the 'name' column and a DataFrame attribute.
cols_names = ('id', 'name', 'year', 'nan', 'link') + tuple(genres_df['name'].to_list())

# The items file is decoded as latin-1; the 'nan' and 'link' columns are
# discarded right after parsing.
with open(items_path, 'r', encoding='latin-1') as items_file:
    items_df = pd.read_csv(items_file, sep='|', usecols=list(range(0,24)), names=cols_names).drop(columns=['nan', 'link'])

In [9]:
# Join every rating with the row of the item it refers to, order the result
# by user, and discard the item metadata columns that are not genre flags.
main_df = (
    ratings_df
    .merge(items_df, left_on='item_id', right_on='id')
    .sort_values(by='user_id')
    .drop(columns=['id', 'year', 'unknown', 'name'])
)
main_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
43606,1,12,5,878542960,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
79998,1,254,1,878541392,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
35906,1,189,3,888732928,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
69127,1,87,5,878543541,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
51383,1,187,4,874965678,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Project helper; `create_clsuters` (sic) is the name imported from scripts.k_means.
from scripts.k_means import create_clsuters

# Number of clusters passed to the helper.
k = 10
# NOTE(review): judging by the displayed result, the helper returns the rating
# columns plus a per-row `cluster` label — confirm in scripts.k_means.
main_df_clustered = create_clsuters(k, main_df)
main_df_clustered.head()

Unnamed: 0,user_id,item_id,rating,timestamp,cluster
43606,1,12,5,878542960,5
79998,1,254,1,878541392,5
35906,1,189,3,888732928,0
69127,1,87,5,878543541,5
51383,1,187,4,874965678,2


In [11]:
# Number of rows in the merged ratings/items frame.
len(main_df)

100000

In [15]:
# Row count divided by 500.
# NOTE(review): the meaning of 500 is not stated anywhere nearby — name the constant or explain it.
len(main_df)/500

200.0

In [22]:
# Choose the range of k values to test.
# A stride of 5 keeps the run short: we don't need a score for every k value.
# With these bounds the range yields only k = 2 and k = 7.
possible_k_values = range(2, 10, 5)

In [24]:
# Import Libraries
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Define function to score a k-means clustering of the data
def clustering_errors(k, data, random_state=None, sample_size=None):
    """Fit k-means with ``k`` clusters on ``data`` and return its mean silhouette score.

    Despite the historical name, the returned value is a quality score
    (higher is better), not an error.

    Parameters
    ----------
    k : int
        Number of clusters to fit.
    data : array-like or DataFrame
        Samples to cluster; passed unchanged to KMeans and silhouette_score.
    random_state : int or None, optional
        Seed forwarded to KMeans (and to the silhouette sampling when
        ``sample_size`` is given). ``None`` keeps the unseeded behaviour.
    sample_size : int or None, optional
        If given, the silhouette score is estimated on a random subset of
        this many samples instead of on all of ``data``. ``None`` scores
        every sample.

    Returns
    -------
    float
        Mean silhouette coefficient of the fitted clustering.
    """
    # fit_predict returns the labels of the training samples directly, so no
    # separate predict pass over the same data is needed.
    predictions = KMeans(n_clusters=k, random_state=random_state).fit_predict(data)
    return silhouette_score(data, predictions, sample_size=sample_size, random_state=random_state)

# Calculate the score for all k values we're interested in.
# A fixed seed makes the recorded values reproducible across kernel restarts.
# NOTE(review): main_df still contains user_id, item_id and timestamp next to
# the genre flags; unscaled, such columns are likely to dominate the distance
# computation — confirm that clustering on them is intended.
errors_per_k = [clustering_errors(k, main_df, random_state=42) for k in possible_k_values]

In [25]:
# One entry per k in possible_k_values. Despite the variable name these are
# mean silhouette scores (higher is better), not error values.
errors_per_k

[0.6387230637197846, 0.6386839225786864]