# Collaborative Filtering Recommendation Exploration
## Knn Exploration of MovieLens with Surprise

In [17]:
#!pip install scikit-surprise
import io  # needed because of weird encoding of u.item file
from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir
import pandas as pd

## Helper Function to Convert IDS to Names

In [4]:
def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.
    """

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid

## Train KNN based model

In [5]:
# First, train the algorithm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)



Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/angieli/.surprise_data/ml-100k
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x11063a0b8>

## Print 10 Similarity Based Recommendations

In [6]:
# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)

The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)


### In Class Exercise:  Write a Function to Return The Top Ten Recommendations From Five Movies in ml-100k

Example Pseudocode:

```
def recommendations(movies, rec_count)"
    """Your
    return recommendations
    
movies = ["Beauty and the Beast (1991)", "Cool Hand Luke (1967)",.. ]

print(recommendations(movies=movies, rec_count=10)

```

* Additional considerations:

- What tradeoffs are you making in picking the top from a group of selections vs just movie?
- How well will this perform on a very large dataset (every movie ever made)?

    





### Category of recommendations:
* User-based (user characteristics choose this loan or not 0/1)
* Item-based (user preference, like A or B)
* Content-based

In [10]:
rid_to_name

{'1': 'Toy Story (1995)',
 '2': 'GoldenEye (1995)',
 '3': 'Four Rooms (1995)',
 '4': 'Get Shorty (1995)',
 '5': 'Copycat (1995)',
 '6': 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 '7': 'Twelve Monkeys (1995)',
 '8': 'Babe (1995)',
 '9': 'Dead Man Walking (1995)',
 '10': 'Richard III (1995)',
 '11': 'Seven (Se7en) (1995)',
 '12': 'Usual Suspects, The (1995)',
 '13': 'Mighty Aphrodite (1995)',
 '14': 'Postino, Il (1994)',
 '15': "Mr. Holland's Opus (1995)",
 '16': 'French Twist (Gazon maudit) (1995)',
 '17': 'From Dusk Till Dawn (1996)',
 '18': 'White Balloon, The (1995)',
 '19': "Antonia's Line (1995)",
 '20': 'Angels and Insects (1995)',
 '21': 'Muppet Treasure Island (1996)',
 '22': 'Braveheart (1995)',
 '23': 'Taxi Driver (1976)',
 '24': 'Rumble in the Bronx (1995)',
 '25': 'Birdcage, The (1996)',
 '26': 'Brothers McMullen, The (1995)',
 '27': 'Bad Boys (1995)',
 '28': 'Apollo 13 (1995)',
 '29': 'Batman Forever (1995)',
 '30': 'Belle de jour (1967)',
 '31': 'Crimson Tide

In [40]:
from surprise import KNNBasic

sim_options = {
    'name': 'cosine',
    'user_based': False
}

knn = KNNBasic(sim_options=sim_options)
trainingSet = data.build_full_trainset()

knn.fit(trainingSet)
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [41]:
from collections import defaultdict

def get_top3_recommendations(predictions, topN = 3):
    
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
    
    return top_recs

In [53]:
top3_recommendations = get_top3_recommendations(predictions)
rid_to_name = read_item_names()
for uid, user_ratings in top3_recommendations.items():
    print(uid, [rid_to_name[iid] for (iid, _) in user_ratings])

TypeError: tuple indices must be integers or slices, not str

In [46]:
top3_recommendations

defaultdict(list,
            {'1': [('1156', 4.378378378378378),
              ('1653', 4.375),
              ('1601', 4.275)],
             '10': [('644', 4.599628223552752),
              ('423', 4.574522736854731),
              ('659', 4.549810018825058)],
             '100': [('1619', 4.333333333333333),
              ('1556', 4.0),
              ('1674', 4.0)],
             '101': [('1430', 4.0), ('1236', 4.0), ('1582', 4.0)],
             '102': [('1673', 3.1481481481481484),
              ('1457', 3.075),
              ('1458', 3.075)],
             '103': [('1430', 4.000983786355035),
              ('1618', 4.0),
              ('1606', 4.0)],
             '104': [('1582', 3.7142857142857144),
              ('1561', 3.7142857142857144),
              ('1565', 3.7142857142857144)],
             '105': [('1616', 4.5),
              ('1556', 4.308149812804074),
              ('1306', 4.27228126655734)],
             '106': [('1673', 4.333333333333333),
              ('1614', 4.23