# KNN (K-Nearest-Neighbors)

Predict rating of a movie

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the first three tab-separated columns of the u.data ratings file and
# label them; any remaining columns in the file are not loaded.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'data/ml-100k/u.data',
    sep='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


Group the ratings by movie ID and compute, for every movie, the total number of ratings (its popularity) and the average rating.

In [3]:
# Per-movie aggregates of the `rating` column: `size` is the number of ratings
# a movie received and `mean` is its average rating.
# The aggregations are given as string aliases instead of np.size / np.mean:
# newer pandas releases emit a FutureWarning when NumPy callables are passed
# to agg(), while the resulting column labels ('rating', 'size') and
# ('rating', 'mean') stay exactly the same.
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
print(movieProperties.shape)
movieProperties.head()

(1682, 2)


Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


movie 들간의 거리를 계산하기 위해 size를 0 ~ 1 사이의 값으로 scaling. <br>
0 = the least-rated movie<br>
1 = the most popular movie

In [4]:
# Pull the per-movie rating count out as a one-column DataFrame.
movieNumRatings = movieProperties['rating']['size'].to_frame()
print(movieNumRatings.shape)

# Min-max scaling: the smallest rating count maps to 0 and the largest to 1.
movieNormalizedNumRatings = (
    (movieNumRatings - movieNumRatings.min())
    / (movieNumRatings.max() - movieNumRatings.min())
)
movieNormalizedNumRatings.head()

(1682, 1)


Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


19개의 genre 정보 가져오기

In [5]:
# Build movieDict, mapping each movie ID to a 4-tuple:
#   (title, genre flag array, normalized rating count, mean rating)
movieDict = {}
with open(r'data/ml-100k/u.item', encoding="ISO-8859-1") as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            # Tolerate blank lines (e.g. a stray newline at the end of the
            # file) instead of failing on int('').
            continue
        fields = line.split('|')
        movieID = int(fields[0])
        name = fields[1]
        # The 19 genre flags (0/1) occupy fields 5 through 23.
        genres = np.array([int(flag) for flag in fields[5:24]])
        movieDict[movieID] = (
            name,
            genres,
            # Popularity: rating count scaled into the 0..1 range.
            movieNormalizedNumRatings.loc[movieID, 'size'],
            # Average rating of the movie.
            movieProperties.loc[movieID, ('rating', 'mean')],
        )


In [6]:
# Inspect one entry: (title, genre flags, normalized rating count, mean rating).
print(movieDict[1])

('Toy Story (1995)', array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.7735849056603774, 3.8783185840707963)


In [7]:
# Index 2 of an entry is the movie's min-max-normalized rating count.
movieDict[1][2]

0.7735849056603774

두 영화의 장르 거리와 인기도 거리 계산 - 거리가 가까울수록 유사한 영화

In [8]:
from scipy import spatial

def ComputeDistance(a, b):
    """Return the combined genre and popularity distance between two movies.

    Both arguments are indexable records where index 1 holds the genre
    vector and index 2 holds the popularity value. The result is the cosine
    distance between the genre vectors plus the absolute difference of the
    popularity values; smaller means more similar.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    return genre_gap + popularity_gap
    
# Example: combined genre + popularity distance between movies 2 and 4.
ComputeDistance(movieDict[2], movieDict[4])

0.8004574042309892

In [9]:
# Print the entries for movies 2 and 4 so their genre flags and popularity
# values can be compared by eye.
print(movieDict[2])
print(movieDict[4])

('GoldenEye (1995)', array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 0.22298456260720412, 3.2061068702290076)
('Get Shorty (1995)', array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.3567753001715266, 3.550239234449761)


테스트 영화(토이 스토리)와 데이터 세트에 있는 모든 영화 사이의 거리를 계산. <br>
거리별로 정렬하고 가장 가까운 K개 영화의 rating score를 인쇄

In [10]:
import operator

def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to `movieID`.

    Every other entry of the module-level `movieDict` is scored against the
    target movie with `ComputeDistance`, and the IDs are returned nearest
    first.

    Args:
        movieID: Key of the reference movie in `movieDict`.
        K: Maximum number of neighbours to return.

    Returns:
        A list of at most K movie IDs, closest first. Fewer than K IDs are
        returned when `movieDict` holds fewer than K other movies, and an
        empty list is returned when K is zero or negative.

    Raises:
        KeyError: If `movieID` is not a key of `movieDict`.
    """
    # Look the target up once rather than on every loop iteration.
    target = movieDict[movieID]
    distances = [
        (movie, ComputeDistance(target, candidate))
        for movie, candidate in movieDict.items()
        if movie != movieID
    ]
    distances.sort(key=operator.itemgetter(1))
    # Slicing cannot run past the end of the list the way indexing over
    # range(K) did; max() stops a negative K from becoming a
    # "drop the last items" slice.
    return [movie for movie, _ in distances[:max(K, 0)]]

# Sum the mean ratings of the K movies nearest to movie 1, printing each
# neighbour's title and mean rating along the way.
K = 10
neighbors = getNeighbors(1, K)
avgRating = 0
for neighbor in neighbors:
    record = movieDict[neighbor]  # (title, genres, popularity, mean rating)
    avgRating += record[3]
    print(f"{record[0]} {record[3]!s}")

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463


Compute the average rating of the 10 nearest neighbors to Toy Story:

In [11]:
# Turn the accumulated sum of neighbour ratings into their mean.
# NOTE: this divides in place, so re-running this cell without first
# re-running the accumulation cell divides the value by K again.
avgRating /= K
avgRating

3.3445905900235564

How does this compare to Toy Story's actual average rating?

In [12]:
# Full entry for movie 1; its last element is the movie's actual mean rating.
movieDict[1]

('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 0.7735849056603774,
 3.8783185840707963)

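Not too bad! The neighbor-based estimate (about 3.34) is roughly half a point below Toy Story's actual average rating (about 3.88).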
