In [31]:
import zipfile
import requests
import io
import pandas as pd
# Location of the MovieLens 100k archive.
url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

# Bound the request time and fail on an HTTP error status here, rather than
# letting an error page reach ZipFile and surface as an obscure BadZipFile.
response = requests.get(url, timeout=30)
response.raise_for_status()

# The archive is unpacked in memory; nothing is written to disk.
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    # u.data: tab-separated rating records, read here as four columns.
    with z.open('ml-100k/u.data') as f1:
        ratings = pd.read_csv(
            f1,
            sep='\t',
            names=['user_id', 'movie_id', 'rating', 'timestamps'],
            encoding='ISO-8859-1',
        )
    # u.item: pipe-separated; only the first two columns (id, title) are kept.
    with z.open('ml-100k/u.item') as f2:
        movies = pd.read_csv(
            f2,
            sep='|',
            names=['movie_id', 'title'],
            usecols=[0, 1],
            encoding='ISO-8859-1',
        )

# Attach the title to every rating row. The join key is named explicitly so
# the merge does not silently change if the two frames ever share more columns.
ratings = pd.merge(ratings, movies, on='movie_id')

In [32]:
# Preview the first ten rows of the merged ratings/titles frame.
ratings.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamps,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)
5,298,474,4,884182806,Dr. Strangelove or: How I Learned to Stop Worr...
6,115,265,2,881171488,"Hunt for Red October, The (1990)"
7,253,465,5,891628467,"Jungle Book, The (1994)"
8,305,451,3,886324817,Grease (1978)
9,6,86,3,883603013,"Remains of the Day, The (1993)"


In [33]:
import numpy as np

# Per-movie rating count ('size') and average rating ('mean').
# The aggregations are named by string instead of passing np.size / np.mean:
# pandas emits a FutureWarning for the numpy callables, and the string aliases
# keep the current behaviour and the same ('rating', 'size') / ('rating', 'mean')
# column labels.
movieProperties = ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movieProperties.head()


  movieProperties = ratings.groupby('movie_id').agg({'rating': [np.size, np.mean]})


Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [34]:
# One-column frame ('size') holding the number of ratings per movie.
movieNumRatings = movieProperties['rating']['size'].to_frame()

# Min-max scale the counts: the least-rated movie maps to 0, the most-rated to 1.
movieNormalizedNumRatings = movieNumRatings.sub(movieNumRatings.min()).div(
    movieNumRatings.max() - movieNumRatings.min()
)
movieNormalizedNumRatings.head()

Unnamed: 0_level_0,size
movie_id,Unnamed: 1_level_1
1,0.774914
2,0.223368
3,0.152921
4,0.357388
5,0.146048


In [35]:
# Look the two per-movie Series up once instead of on every loop iteration.
normalizedSizes = movieNormalizedNumRatings['size']
meanRatings = movieProperties['rating']['mean']

# Re-open the archive already held in `response` (no second download).
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    with z.open('ml-100k/u.data') as f1:
        # Only the first three columns are wanted here. usecols is required:
        # giving three names for a file with more columns makes pandas treat
        # the leading column as the index and shifts every label one to the right.
        ratings = pd.read_csv(
            f1,
            sep='\t',
            names=['user_id', 'movie_id', 'rating'],
            usecols=[0, 1, 2],
            encoding='ISO-8859-1',
        )
    with z.open('ml-100k/u.item') as f2:
        movies = pd.read_csv(
            f2,
            sep='|',
            names=['movie_id', 'title'],
            usecols=[0, 1],
            encoding='ISO-8859-1',
        )
        # Rewind and walk the same file by hand to pick up the genre flags,
        # which the two-column read above discards.
        f2.seek(0)
        movie_dict = {}
        for line in f2:
            line = line.decode('ISO-8859-1')
            fields = line.rstrip('\n').split('|')
            movie_id = int(fields[0])
            name = fields[1]
            # Fields from index 5 onward are parsed as integer genre flags.
            genre = np.array(list(map(int, fields[5:25])))
            # Movies absent from the rating statistics get NaN for both values.
            size = normalizedSizes.get(movie_id, np.nan)
            mean = meanRatings.get(movie_id, np.nan)
            # Entry layout: (title, genre flags, normalised popularity, mean rating).
            movie_dict[movie_id] = (name, genre, size, mean)
movie_dict.get(1)

('Toy Story (1995)',
 array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 np.float64(0.7749140893470791),
 np.float64(3.8783185840707963))

In [41]:
from scipy import spatial

def ComputeDistance(a, b):
    """Return a dissimilarity score between two movie entries.

    Each entry is indexed positionally: index 1 holds the genre vector and
    index 2 the popularity value. The score is the cosine distance between
    the genre vectors plus the absolute difference of the popularities.

    If either genre vector is all zeros the cosine distance is undefined
    (0/0) and would come back as NaN, which cannot be ordered reliably when
    the scores are sorted. Such pairs are treated as sharing no genre
    information and get a genre distance of 1.0, the same value two
    orthogonal genre vectors produce.
    """
    genresA = a[1]
    genresB = b[1]
    if not np.any(genresA) or not np.any(genresB):
        genreDistance = 1.0
    else:
        genreDistance = spatial.distance.cosine(genresA, genresB)
    popularityA = a[2]
    popularityB = b[2]
    popularityDistance = abs(popularityA - popularityB)
    return genreDistance + popularityDistance
    
# Spot check: distance between the entries stored under movie IDs 2 and 4.
ComputeDistance(movie_dict[2], movie_dict[4])

np.float64(0.8006872852233677)

In [42]:
import operator

def getNeighbors(movieID, K):
    """Return the IDs of the K entries in movie_dict closest to movieID.

    Every other entry in movie_dict is scored against the target with
    ComputeDistance and the IDs are returned in ascending order of distance.
    Ties keep movie_dict's iteration order because the sort is stable.

    If K is larger than the number of other movies, all of them are returned
    instead of raising IndexError. A non-positive K yields an empty list.

    Raises KeyError if movieID is not a key of movie_dict.
    """
    # Fetch the target once; it does not change across candidates.
    target = movie_dict[movieID]
    distances = [
        (movie, ComputeDistance(target, candidate))
        for movie, candidate in movie_dict.items()
        if movie != movieID
    ]
    distances.sort(key=operator.itemgetter(1))
    # Slicing (rather than indexing 0..K-1) tolerates K > len(distances).
    return [movie for movie, _ in distances[:max(K, 0)]]

# List the K nearest neighbours of movie 1 with their mean ratings, and
# accumulate those means so their average is left in avgRating.
K = 10
neighbors = getNeighbors(1, K)

avgRating = 0
for neighbor in neighbors:
    entry = movie_dict[neighbor]
    # entry[0] is the title, entry[3] the movie's mean rating.
    avgRating += entry[3]
    print(entry[0] + " " + str(entry[3]))

avgRating /= K

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Lion King, The (1994) 3.7818181818181817
