In [1]:
# Train test split
import pandas as pd
from sklearn.model_selection import train_test_split

col = ['userId', 'movieId', 'rating', 'timestamp']

data = pd.read_csv('ml-1m/ratings.dat', sep='::', names=col)
# data = pd.read_csv('ml-25m/ratings.csv')

train, test = train_test_split(data, test_size=0.2)
print(train)
train.sort_values('userId').to_csv('./ml-1m/train.csv', index=False)
test.sort_values('userId').to_csv('./ml-1m/test.csv', index=False)

        userId  movieId  rating  timestamp
884596    5342     2194       4  960685341
261606    1599     1633       4  974735720
941978    5683     1721       5  958608557
864275    5208     3916       4  975560107
520044    3208     1513       3  968555572
...        ...      ...     ...        ...
375166    2186     2399       2  974609566
965580    5824     1282       4  957967655
555818    3416     1515       5  967405224
358581    2098     1756       3  974654599
841379    5054      759       3  962487218

[800167 rows x 4 columns]


In [2]:
print(len(train), len(test))

800167 200042


## Ploting

In [5]:
import time
import numpy as np
from CF import CF
from clustering import bisecting_kmeans

ratings_base = pd.read_csv('ml-1m/train.csv')
ratings_test = pd.read_csv('ml-1m/test.csv')

rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

rs = CF(rate_train, None, k=30, uuCF=1)

rs.normalize_Y()
rs.similarity()


K_CLUSTERS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
throughput_clustering = []
RMSE_clustering = []
MAE_clustering = []


for k in K_CLUSTERS:
    print('k = ', k)
    clusters, user_mapping = bisecting_kmeans(rs.Ybar.transpose().tocsr(), k=1)
    cf_clusters = {}
    for i in clusters:
        row, col = clusters[i].nonzero()
        data = np.array(clusters[i][row, col]).flatten()
        pd_data = pd.DataFrame()
        pd_data['userId'] = row
        pd_data['movieId'] = col
        pd_data['rating'] = data
        cf_clusters[i] = CF(pd_data.values, clusters[i].transpose())
        cf_clusters[i].similarity()

    n_tests = rate_test.shape[0]
    SE = 0 # squared error
    AE = 0 # absolute error

    total_time = 0
    for n in range(n_tests):
        cf_cluster = cf_clusters[user_mapping[rate_test[n, 0]][0]]
        user_cluster_id = user_mapping[rate_test[n, 0]][1]
        start = time.time()
        pred = cf_cluster.pred(user_cluster_id, rate_test[n, 1], normalized=1) + rs.mu[rate_test[n, 0]]
        total_time += time.time() - start
        # print(pred, rate_test[n, 2])
        SE += (pred - rate_test[n, 2])**2
        AE += np.abs(pred - rate_test[n, 2])

    throughput = n_tests / total_time
    RMSE = np.sqrt(SE/n_tests)
    MAE = AE / n_tests
    throughput_clustering.append(throughput)
    RMSE_clustering.append((RMSE))
    MAE_clustering.append((MAE))
    print('throughput:', throughput)
    print('RMSE:', RMSE)
    print('MAE:', MAE)


print('throughput', throughput_clustering)
print('RMSE: ', RMSE_clustering)
print('MAE: ', MAE_clustering)

k =  1
throughput: 1010.078695205124
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  2
throughput: 1053.457235667426
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  3
throughput: 1077.4617239922345
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  4
throughput: 1079.2139083712611
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  5
throughput: 873.797332549229
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  6
throughput: 953.9348982937756
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  7
throughput: 1106.3608064737025
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  8
throughput: 1113.095901255056
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  9
throughput: 1109.1020441396822
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
k =  10
throughput: 1003.8328403468995
RMSE: 0.9518830436356711
MAE: 0.7541191867456717
throughput [1010.078695205124, 1053.457235667426, 1077.4617239922345, 1079.2139083712611, 873.797332549229, 953.9348982937756, 1106.360