In [78]:
import pandas as pd
import numpy as np
import heapq
from sklearn import cluster
from sklearn.metrics.pairwise import cosine_distances
from sklearn.feature_extraction import DictVectorizer
from bottleneck import argpartition, partition
from math import ceil, isnan

In [79]:
class Rec():
    
    def __init__(self):
        self.anime_tag_vector = pd.read_csv('anime_tag_vector.csv')
        self.anime_id = self.anime_tag_vector['anime_id'].as_matrix().tolist()
        self.anime_rating = self.anime_tag_vector['rating']
        del self.anime_tag_vector['anime_id']
        del self.anime_tag_vector['rating']
        self.anime_tag_vector = self.anime_tag_vector.as_matrix()

        # user instance: calculated rating to each tag
        self.instance = pd.read_csv('user_instances.csv')
        del self.instance['user_id']
        self.instance = self.instance[1:].as_matrix()
        
        
        # user actual rating to each watched anime
        self.rate = pd.read_csv('rating.csv')
        self.rate = self.rate.as_matrix().tolist()
        
        # cluster user to c groups
        self.c = 50
        self.kmeans = cluster.KMeans(n_clusters=self.c, max_iter=300, init='k-means++',n_init=10, verbose=True, n_jobs=-1).fit(self.anime_tag_vector)
        self.clus = self.kmeans.predict(self.anime_tag_vector)
        
        # user watched anime list    
        self.watch = np.array([[] for _ in self.instance]).tolist() ## TODO
        for r in self.rate:
            self.watch[r[0]-1].append(r[1])
        
        # tag
        self.animes = pd.read_csv("anime.csv")
        tags = []
        for genres in self.animes["genre"].fillna(""):
            for genre in str.split(genres, ", "):
                tags.append(genre)
        tags = sorted(list(set(tags))[1:]) # Remove ''
        v = DictVectorizer(sparse="False")
        genre = v.fit_transform(self.animes["genre"].fillna(", ".join(tags)).apply(lambda x: {i: 1 for i in str.split(x, ", ")}))
        self.genre = np.array(genre.todense())
        
        self.inverse_anime_id = {}
        for index, anime in enumerate(self.anime_id):
            self.inverse_anime_id[anime] = index
        
        # anime_candidates for each cluster (sorted)
        self.anime_candidates = []
        for i in range(self.c):
             
            print("Cluster ", i)
            animes_index_in_clus = np.where(self.clus==i)[0]
            animes_index_in_clus = [i for i in animes_index_in_clus if not isnan(self.anime_rating[i])]
            #self.anime_candidates.append(sorted(animes_in_clus, key=lambda x: self.anime_rating[self.anime_id.index(x)], reverse=True))
            self.anime_candidates.append([anime_id[anime_index] for anime_index in sorted(animes_index_in_clus, key=lambda x: self.anime_rating[x], reverse=True)])
            
            #self.anime_candidates.append(list(set(np.where(self.clus==i)[0])))

            
        self.anime_id_to_idx = {}
        for i in range(0, 12294):
            self.anime_id_to_idx.update({self.anime_id[i]:i})
        
        
    def avg_rating(self, train):
        
        rating_times = 0
        
        ratings = 0
        
        for r in train:
            anime_id = r[0]
            rating = r[1]
            if rating != -1:
                ratings += rating
                rating_times += 1
                

        np.seterr(divide='ignore', invalid='ignore')        
        if rating_times == 0:
            ratings_vector = 5
        else:
            ratings_vector = np.nan_to_num(ratings / rating_times)
        
        return ratings_vector
    
    def get_ranked_ids(self, datadata, k):
        
        # generate user instance from data
        data = dict(datadata)
        
        rating_times = 0
        rating_appeartime = np.zeros(43)
        ratings = np.zeros(43)
        avg_rate = self.avg_rating(datadata)
        
        for obj in datadata:
            
            anime_id = obj[0]
            r = obj[1]
        
            if r == -1:
                ratings += self.anime_tag_vector[self.anime_id_to_idx[anime_id]] * (10 - avg_rate)
            else:
                ratings += self.anime_tag_vector[self.anime_id_to_idx[anime_id]] * r
                
            rating_appeartime += self.anime_tag_vector[self.anime_id_to_idx[anime_id]]
        
        
        np.seterr(divide='ignore', invalid='ignore')        
        #ratings = np.nan_to_num(ratings / rating_appeartime)
        ins = np.nan_to_num(ratings / rating_appeartime)
        
        
        

        ins_watch = []
        for r in data.keys():
            ins_watch.append(r)
        ins_watch = set(ins_watch)
        #print(ins_watch)
        

        # predict!!!!
        pred = self.kmeans.predict([ins])[0]
        #index = [i for i, x in enumerate(self.clus.tolist()) if x == pred]
        #anime_candidates = [w for i in index for w in self.watch[i] if w in self.anime_id]
        #anime_candidates = set(anime_candidates) - set(ins_watch)

        #k = min(k, len(self.anime_candidates))
        
        #return heapq.nlargest(k,anime_candidates[pred], key=lambda x: self.anime_rating[self.anime_id.index(x)])

        
        ans = []
        count = 0
        for cand in self.anime_candidates[pred]:
            if cand in ins_watch:
                continue
            if count == k:
                break
            ans.append(cand)
            count += 1
        
        return ans


In [80]:
rec = Rec()

Initialization complete
start iteration
done sorting
end inner loop
Initialization complete
Iteration 0, inertia 14368.6537239
start iteration
done sorting
start iteration
end inner loop
Initialization complete
done sorting
end inner loop
Iteration 1, inertia 13919.5755002
Initialization complete
Iteration 0, inertia 14585.6958405
Initialization complete
Initialization complete
start iteration
start iteration
done sorting
end inner loop
start iteration
start iteration
done sorting
done sorting
Initialization complete
end inner loop
done sorting
start iteration
end inner loop
Iteration 2, inertia 13728.9211429
start iteration
end inner loop
done sorting
Iteration 0, inertia 14566.8021693
Iteration 1, inertia 14061.837741
start iteration
end inner loop
done sorting
Iteration 0, inertia 14988.097367
start iteration
end inner loop
Iteration 3, inertia 13619.5500061
Initialization complete
start iteration
Iteration 0, inertia 14387.007412
done sorting
done sorting
start iteration
end inner 

done sorting
start iteration
end inner loop
start iteration
start iteration
end inner loop
start iteration
done sorting
done sorting
done sorting
done sorting
Iteration 12, inertia 13311.7220148
Iteration 12, inertia 13474.1293918
end inner loop
end inner loop
done sorting
Iteration 15, inertia 13357.6499207
start iteration
end inner loop
start iteration
end inner loop
end inner loop
Iteration 10, inertia 13614.4390319
Iteration 11, inertia 13362.2145429
start iteration
start iteration
Iteration 14, inertia 13364.6549459
done sorting
done sorting
done sorting
start iteration
Iteration 13, inertia 13403.4551591
end inner loop
start iteration
done sorting
Iteration 13, inertia 13423.8399614
start iteration
done sorting
end inner loop
end inner loop
Iteration 12, inertia 13361.0285927
start iteration
end inner loop
done sorting
end inner loop
done sorting
Iteration 13, inertia 13310.8123304
done sorting
Iteration 11, inertia 13611.6526036
end inner loop
Iteration 16, inertia 13356.2327945

end inner loop
Iteration 10, inertia 13660.7915102
Iteration 13, inertia 13633.1977205
start iteration
start iteration
done sorting
done sorting
end inner loop
end inner loop
Iteration 14, inertia 13631.5127459
start iteration
done sorting
Iteration 11, inertia 13660.5591501
end inner loop
Iteration 15, inertia 13630.4876368
start iteration
start iteration
done sorting
end inner loop
done sorting
Iteration 16, inertia 13630.4458538
end inner loop
start iteration
done sorting
end inner loop
Iteration 17, inertia 13630.4458538
Iteration 12, inertia 13660.3204886
center shift 0.000000e+00 within tolerance 6.254264e-06
start iteration
done sorting
end inner loop
Iteration 13, inertia 13659.9323114
start iteration
done sorting
end inner loop
Iteration 14, inertia 13659.7471334
start iteration
done sorting
end inner loop
Iteration 15, inertia 13659.6111669
start iteration
done sorting
end inner loop
Iteration 16, inertia 13659.6111669
center shift 0.000000e+00 within tolerance 6.254264e-06
C

In [81]:
i = 7
print([(i, rec.anime_rating[rec.anime_id.index(i)]) for i in rec.anime_candidates[i]])

[(245, 8.7699999999999996), (22219, 8.6699999999999999), (32182, 8.5500000000000007), (10165, 8.5199999999999996), (22789, 8.5), (57, 8.4000000000000004), (19363, 8.3599999999999994), (11843, 8.3499999999999996), (9617, 8.3399999999999999), (8129, 8.2400000000000002), (26213, 8.2300000000000004), (17739, 8.1899999999999995), (16918, 8.1799999999999997), (4772, 8.1600000000000001), (2004, 8.1600000000000001), (7791, 8.1400000000000006), (29831, 8.0999999999999996), (32093, 8.0999999999999996), (66, 8.0600000000000005), (26123, 8.0500000000000007), (30279, 8.0399999999999991), (9289, 8.0299999999999994), (29787, 8.0099999999999998), (7062, 8.0099999999999998), (12815, 8.0099999999999998), (14175, 7.9900000000000002), (11239, 7.9900000000000002), (10521, 7.9800000000000004), (253, 7.9699999999999998), (15771, 7.9699999999999998), (254, 7.9500000000000002), (23225, 7.9500000000000002), (12893, 7.9400000000000004), (22265, 7.9400000000000004), (9563, 7.9400000000000004), (12403, 7.940000000

In [66]:
[i for i in rec.kmeans.labels_]

[7,
 49,
 42,
 20,
 42,
 23,
 25,
 34,
 42,
 42,
 7,
 44,
 42,
 40,
 23,
 44,
 7,
 47,
 5,
 40,
 23,
 19,
 34,
 45,
 15,
 38,
 31,
 47,
 47,
 13,
 26,
 5,
 23,
 47,
 5,
 47,
 32,
 31,
 43,
 7,
 31,
 36,
 18,
 23,
 23,
 31,
 5,
 48,
 47,
 31,
 31,
 5,
 21,
 19,
 44,
 16,
 5,
 5,
 23,
 20,
 7,
 24,
 1,
 42,
 25,
 42,
 43,
 48,
 19,
 35,
 12,
 19,
 23,
 7,
 33,
 40,
 33,
 31,
 48,
 23,
 5,
 37,
 46,
 7,
 31,
 47,
 29,
 7,
 7,
 26,
 13,
 43,
 34,
 5,
 32,
 25,
 44,
 26,
 4,
 46,
 23,
 49,
 44,
 44,
 33,
 24,
 25,
 44,
 44,
 43,
 44,
 44,
 25,
 25,
 7,
 1,
 40,
 47,
 30,
 19,
 29,
 25,
 23,
 7,
 47,
 32,
 20,
 49,
 44,
 31,
 10,
 26,
 7,
 15,
 31,
 22,
 25,
 0,
 49,
 23,
 40,
 15,
 5,
 33,
 31,
 25,
 25,
 32,
 23,
 48,
 46,
 32,
 34,
 24,
 34,
 5,
 7,
 7,
 26,
 22,
 31,
 31,
 23,
 33,
 38,
 18,
 31,
 1,
 34,
 9,
 47,
 42,
 7,
 49,
 28,
 29,
 32,
 47,
 29,
 26,
 26,
 44,
 29,
 23,
 23,
 26,
 7,
 5,
 23,
 31,
 46,
 13,
 46,
 18,
 22,
 15,
 20,
 31,
 32,
 31,
 49,
 48,
 25,
 4,
 5,
 18,
 29,
 

In [67]:
[i for i in rec.clus]

[7,
 49,
 42,
 20,
 42,
 23,
 25,
 34,
 42,
 42,
 7,
 44,
 42,
 40,
 23,
 44,
 7,
 47,
 5,
 40,
 23,
 19,
 34,
 45,
 15,
 38,
 31,
 47,
 47,
 13,
 26,
 5,
 23,
 47,
 5,
 47,
 32,
 31,
 43,
 7,
 31,
 36,
 18,
 23,
 23,
 31,
 5,
 48,
 47,
 31,
 31,
 5,
 21,
 19,
 44,
 16,
 5,
 5,
 23,
 20,
 7,
 24,
 1,
 42,
 25,
 42,
 43,
 48,
 19,
 35,
 12,
 19,
 23,
 7,
 33,
 40,
 33,
 31,
 48,
 23,
 5,
 37,
 46,
 7,
 31,
 47,
 29,
 7,
 7,
 26,
 13,
 43,
 34,
 5,
 32,
 25,
 44,
 26,
 4,
 46,
 23,
 49,
 44,
 44,
 33,
 24,
 25,
 44,
 44,
 43,
 44,
 44,
 25,
 25,
 7,
 1,
 40,
 47,
 30,
 19,
 29,
 25,
 23,
 7,
 47,
 32,
 20,
 49,
 44,
 31,
 10,
 26,
 7,
 15,
 31,
 22,
 25,
 0,
 49,
 23,
 40,
 15,
 5,
 33,
 31,
 25,
 25,
 32,
 23,
 48,
 46,
 32,
 34,
 24,
 34,
 5,
 7,
 7,
 26,
 22,
 31,
 31,
 23,
 33,
 38,
 18,
 31,
 1,
 34,
 9,
 47,
 42,
 7,
 49,
 28,
 29,
 32,
 47,
 29,
 26,
 26,
 44,
 29,
 23,
 23,
 26,
 7,
 5,
 23,
 31,
 46,
 13,
 46,
 18,
 22,
 15,
 20,
 31,
 32,
 31,
 49,
 48,
 25,
 4,
 5,
 18,
 29,
 

In [68]:
np.histogram(rec.kmeans.labels_, bins=[i for i in range(50)])

(array([  55,  920,   62,  187,  386,  378,  459,  359, 1028,  577,  151,
         213,  295,  255,  161,  182,  125,  184,  193,  288,  307,  165,
         239,  174,  107,  193,  218,  351,  329,  123,  290,  163,  180,
         154,  123,  149,  260,  238,  205,   82,  120,  178,   44,  150,
         376,   87,  344,  238,  249]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]))

In [62]:
rate = pd.read_csv('rating.csv')
rate = rate.as_matrix().tolist()
rr = [r for r in rate if r[0]==11]
data = []
for rrr in rr:
    data.append((rrr[1],rrr[2]))

In [63]:
recommend = rec.get_ranked_ids(data, 10)
[rec.anime_rating[rec.anime_id.index(i)] for i in recommend]

[8.1699999999999999,
 8.0,
 7.3300000000000001,
 7.1699999999999999,
 7.1500000000000004,
 7.0,
 6.96,
 6.5599999999999996,
 6.29,
 6.2699999999999996]

In [31]:
anime_tag_vector = pd.read_csv('anime_tag_vector.csv')
anime_id = anime_tag_vector['anime_id'].as_matrix().tolist()
anime_rating = anime_tag_vector['rating']
del anime_tag_vector['anime_id']
del anime_tag_vector['rating']
anime_tag_vector = anime_tag_vector.as_matrix()

# user instance: calculated rating to each tag
instance = pd.read_csv('user_instances.csv')
del instance['user_id']
instance = instance[1:].as_matrix()


# user actual rating to each watched anime
rate = pd.read_csv('rating.csv')
rate = rate.as_matrix().tolist()


In [32]:
# cluster user to c groups
c = 50
kmeans = cluster.KMeans(n_clusters=c, max_iter=300, init='k-means++',n_init=10, verbose=True, n_jobs=-1).fit(anime_tag_vector)
clus = kmeans.predict(anime_tag_vector)


Initialization complete
Initialization complete
Initialization complete
Initialization complete
start iteration
start iteration
done sorting
done sorting
start iteration
end inner loop
Initialization complete
end inner loop
done sorting
end inner loop
Iteration 0, inertia 14529.6536261
Iteration 0, inertia 14516.1565757
start iteration
Initialization complete
done sorting
Initialization complete
start iteration
end inner loop
Iteration 0, inertia 14659.918598
Iteration 1, inertia 13899.8292653
start iteration
done sorting
start iteration
done sorting
end inner loop
done sorting
start iteration
done sorting
start iteration
Initialization complete
Iteration 0, inertia 14674.4127802
start iteration
start iteration
end inner loop
end inner loop
done sorting
done sorting
done sorting
Iteration 1, inertia 14154.9372975
start iteration
end inner loop
done sorting
Iteration 2, inertia 13712.0812435
end inner loop
end inner loop
start iteration
end inner loop
start iteration
Iteration 0, inerti

center shift 0.000000e+00 within tolerance 6.254264e-06
start iteration
Iteration 10, inertia 13641.9560011
start iteration
start iteration
done sorting
done sorting
end inner loop
done sorting
done sorting
Iteration 15, inertia 13329.3777301
end inner loop
end inner loop
end inner loop
start iteration
done sorting
end inner loop
Iteration 11, inertia 13560.3738236
Iteration 12, inertia 13638.8039286
Iteration 13, inertia 13476.4251088
start iteration
start iteration
Iteration 16, inertia 13327.6401682
Iteration 11, inertia 13223.0869914
done sorting
start iteration
end inner loop
start iteration
done sorting
done sorting
end inner loop
done sorting
Iteration 11, inertia 13640.981968
end inner loop
end inner loop
end inner loop
Iteration 13, inertia 13638.7675994
start iteration
Iteration 12, inertia 13222.4614222
done sorting
end inner loop
start iteration
start iteration
done sorting
done sorting
end inner loop
Iteration 17, inertia 13326.2633187
Iteration 14, inertia 13463.7911403
e

In [56]:

# user watched anime list    
watch = np.array([[] for _ in instance]).tolist() ## TODO
for r in rate:
    watch[r[0]-1].append(r[1])

# tag
animes = pd.read_csv("anime.csv")
tags = []
for genres in animes["genre"].fillna(""):
    for genre in str.split(genres, ", "):
        tags.append(genre)
tags = sorted(list(set(tags))[1:]) # Remove ''
v = DictVectorizer(sparse="False")
genre = v.fit_transform(animes["genre"].fillna(", ".join(tags)).apply(lambda x: {i: 1 for i in str.split(x, ", ")}))
genre = np.array(genre.todense())

inverse_anime_id = {}
for index, anime in enumerate(anime_id):
    inverse_anime_id[anime] = index

# anime_candidates for each cluster (sorted)
anime_candidates = []
for i in range(c):

    print("Cluster ", i)
    animes_in_clus = np.where(clus==i)[0]
    #anime_candidates.append(sorted(animes_in_clus, key=lambda x: anime_rating[anime_id.index(x)], reverse=True))
    anime_candidates.append([anime_id[anime_index] for anime_index in sorted(animes_in_clus, key=lambda x: anime_rating[x], reverse=True)])

    #anime_candidates.append(list(set(np.where(clus==i)[0])))


anime_id_to_idx = {}
for i in range(0, 12294):
    anime_id_to_idx.update({anime_id[i]:i})


Cluster  0
Cluster  1
Cluster  2
Cluster  3
Cluster  4
Cluster  5
Cluster  6
Cluster  7
Cluster  8
Cluster  9
Cluster  10
Cluster  11
Cluster  12
Cluster  13
Cluster  14
Cluster  15
Cluster  16
Cluster  17
Cluster  18
Cluster  19
Cluster  20
Cluster  21
Cluster  22
Cluster  23
Cluster  24
Cluster  25
Cluster  26
Cluster  27
Cluster  28
Cluster  29
Cluster  30
Cluster  31
Cluster  32
Cluster  33
Cluster  34
Cluster  35
Cluster  36
Cluster  37
Cluster  38
Cluster  39
Cluster  40
Cluster  41
Cluster  42
Cluster  43
Cluster  44
Cluster  45
Cluster  46
Cluster  47
Cluster  48
Cluster  49


In [55]:
animes_in_clus = np.where(clus==5)[0]
[anime_id[i] for i in animes_in_clus]

[11879,
 29575,
 15843,
 21097,
 10779,
 10380,
 22069,
 12375,
 3559,
 32587,
 21829,
 15537,
 24641,
 6893,
 2798,
 7411,
 32355,
 5959,
 22429,
 7748,
 9322,
 4502,
 20377,
 20801,
 33322,
 31886,
 18655,
 8110,
 14991,
 3918,
 5097,
 3102,
 3220,
 31789,
 16474,
 15097,
 28779,
 29083,
 12143,
 30614,
 30243,
 18691,
 8291,
 30891,
 30702,
 32667,
 10419,
 10683,
 3939,
 2372,
 29261,
 28157,
 2135,
 7053,
 19051,
 25667,
 21521,
 32423,
 13057,
 2867,
 14127,
 16472,
 9311,
 29808,
 32872,
 18525,
 15841,
 1787,
 16638,
 33231,
 28961,
 6465,
 32620,
 17867,
 9435,
 5315,
 31788,
 18693,
 4763,
 21925,
 23479,
 10106,
 5194,
 10694,
 3771,
 2446,
 4358,
 16914,
 29807,
 11321,
 20849,
 32518,
 6194,
 8039,
 10280,
 32063,
 13595,
 19859,
 29809,
 13219,
 12959,
 31810,
 21001,
 2432,
 17745,
 2433,
 3970,
 30898,
 2852,
 10488,
 21363,
 5464,
 2136,
 19631,
 5762,
 16642,
 4600,
 31118,
 12997,
 2873,
 7178,
 2866,
 1401,
 2396,
 3941,
 4360,
 21069,
 16189,
 9308,
 28309,
 17537,