# Ch 4b. Movie recommendation engine

In [1]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import pandas as pd
import networkx as nx

## 1 Get movie rating data

In [2]:
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'unix_timestamp']
)
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [3]:
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    usecols=range(2),
    names=['movie_id', 'title'],
    encoding='latin-1'
)
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [4]:
ratings = ratings[ratings.rating >= 4]
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
5,298,474,4,884182806
7,253,465,5,891628467
11,286,1014,5,879781125
12,200,222,5,876042340
16,122,387,5,879270459
...,...,...,...,...
99988,421,498,4,892241344
99989,495,1091,4,888637503
99990,806,421,4,882388897
99991,676,538,4,892685437


## 2 Count coincident movie pairs

That is, # of times a given pair was liked by the same person.

In [5]:
from collections import defaultdict

In [6]:
pairs = defaultdict(int)

In [7]:
for group in ratings.groupby('user_id'):
    # group = movies that the current user likes
    user_movies = list(group[1]['movie_id'])
    # Increment each (movie_i, movie_j) pair
    for i in range(len(user_movies)):
        for j in range(i+1, len(user_movies)):
            pairs[(user_movies[i], user_movies[j])] += 1

In [13]:
# Convert to normal dict to avoid querying side effect where queries add new dict rows
pairs = dict(pairs)

In [14]:
list(pairs)[0:5]

[(61, 33), (61, 160), (61, 20), (61, 202), (61, 171)]

In [15]:
pairs[(61, 63)]

1

## 3 Build weighted graph

In [16]:
G = nx.Graph()

In [18]:
for pair in pairs:
    movie1, movie2 = pair
    score = pairs[pair]
    if score >= 20:
        G.add_edge(movie1, movie2, weight=score)

In [22]:
print(G)

Graph with 410 nodes and 14936 edges


## 4 node2vec

In [24]:
from node2vec import Node2Vec

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
node2vec = Node2Vec(G, dimensions=64, walk_length=20, num_walks=200, p=2, q=1, workers=1)

Computing transition probabilities: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 410/410 [00:06<00:00, 62.48it/s]
Generating walks (CPU: 1): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:21<00:00,  9.35it/s]


In [26]:
model = node2vec.fit(window=10, min_count=1, batch_words=4)

In [90]:
def recommend(movie):
    movie_id = movies[movies.title == movie].movie_id.values[0]
    print(f'movie_id={movie_id}')
    for id in model.wv.most_similar(movie_id):  #[:5]:
        title = movies[movies.movie_id == int(id[0])].title.values[0]
        print(f'{title}: {id[1]:.2f}')

In [91]:
# movies[movies.title == 'Star Wars (1977)'].movie_id.values[0]

In [92]:
recommend('Star Wars (1977)')

movie_id=50
Contact (1997): 0.64
Good Will Hunting (1997): 0.53
Rainmaker, The (1997): 0.51
Air Force One (1997): 0.51
Volcano (1997): 0.48
Rosewood (1997): 0.46
Apt Pupil (1998): 0.45
English Patient, The (1996): 0.45
Alien: Resurrection (1997): 0.45
In & Out (1997): 0.43


In [93]:
movies[movies.movie_id == 50]

Unnamed: 0,movie_id,title
49,50,Star Wars (1977)


In [94]:
model.wv.most_similar(50)[:5]

[('258', 0.6357621550559998),
 ('272', 0.5284056067466736),
 ('310', 0.5089632868766785),
 ('300', 0.5088992714881897),
 ('678', 0.4841189384460449)]

Hm, this doesn't seem like that great of a recommendation engine! :D

In [95]:
recommend('2001: A Space Odyssey (1968)')

movie_id=135
Close Shave, A (1995): 0.57
Searching for Bobby Fischer (1993): 0.52
Ed Wood (1994): 0.48
Grand Day Out, A (1992): 0.47
Big Night (1996): 0.46
Clerks (1994): 0.46
Day the Earth Stood Still, The (1951): 0.45
This Is Spinal Tap (1984): 0.45
Heathers (1989): 0.44
Hoop Dreams (1994): 0.44
