# Using our Approximate Nearest Neighbor Model to pre-compute candidates

In [1]:
from annoy import AnnoyIndex
import pandas as pd

In [2]:
# Open the pre-built Annoy index from disk. The constructor arguments (19019
# dimensions, 'angular' metric) presumably mirror the settings used when
# movies.ann was built -- TODO confirm against the notebook that wrote it.
t = AnnoyIndex(19019, 'angular')
t.load('movies.ann')

True

In [3]:
# Distinct MOVIE_ID values found in the training parquet; the loop further
# down pre-computes nearest neighbors for exactly these ids.
in_movies = pd.read_parquet('training_data/movielens_25m_5_5.parquet').MOVIE_ID.unique()

In [4]:
movies = pd.read_parquet('dataset/ml-25m/cleaned/movies.pq')

# Lookup table movieId -> title, used when printing neighbor lists.
# Built directly from the two columns with zip(): this replaces the
# index-based loop and, more importantly, no longer rebinds the builtin
# `id` at notebook scope (the old loop variable shadowed it for every
# later cell). If a movieId occurs more than once, the last title wins,
# exactly as before.
ids = list(movies.movieId)
titles = list(movies.title)
id2title = dict(zip(ids, titles))

In [8]:
def get_nn(movie_id, k=100):
    """Return the `k` nearest neighbors of `movie_id` from the loaded index.

    Thin wrapper around the module-level Annoy index `t`.

    Args:
        movie_id: Item id to look up in the index.
        k: Number of neighbors to request (default 100).

    Returns:
        Whatever `t.get_nns_by_item` returns for the item. Judging by the
        sample output in this notebook, the queried item itself is the first
        entry.
    """
    neighbors = t.get_nns_by_item(movie_id, k)
    return neighbors
        
def print_movie_titles(movies):
    """Print one "<title>, (<id>)" line for each movie id in `movies`.

    Titles are resolved through the module-level `id2title` mapping, so an
    id that is missing from it raises KeyError.

    Args:
        movies: Iterable of movie ids.
    """
    for movie_id in movies:
        title = id2title[movie_id]
        print(f"{title}, ({movie_id})")
        
# Smoke test: list the 10 nearest neighbors of movie id 1 by title.
nn = get_nn(1, k=10)
print_movie_titles(nn)

Toy Story (1995), (1)
Toy Story 2 (1999), (3114)
Toy Story 3 (2010), (78499)
Monsters, Inc. (2001), (4886)
Finding Nemo (2003), (6377)
Lion King, The (1994), (364)
Incredibles, The (2004), (8961)
Up (2009), (68954)
Shrek (2001), (4306)
Aladdin (1992), (588)


## Compute nearest neighbors for every movie

In [None]:
from tqdm import tqdm
# Map every training movie id to its 100 nearest neighbors.
# The dict is filled incrementally so that partial results survive if the
# (potentially long) run is interrupted.
preds = {}
progress = tqdm(list(in_movies))
for movie_id in progress:
    # get_nn is given a plain Python int rather than the raw array element.
    neighbors = get_nn(int(movie_id), k=100)
    preds[movie_id] = neighbors

In [None]:
import pickle
# Persist the neighbor map locally. The context manager guarantees the file
# is flushed and closed even if pickling fails; the previous bare open(...)
# never closed its handle explicitly.
with open('movie_nearest_neighbors.p', 'wb') as f:
    pickle.dump(preds, f)

In [None]:
# Split the neighbor map into two parallel lists (same order as the dict):
# the movie ids and, at the matching position, each movie's neighbor list.
movie_ids = list(preds)
nns = list(preds.values())
    

## Load nearest neighbors into Snowflake

In [None]:
from datetime import datetime
# One row per movie: its id, its neighbor list, and a creation timestamp.
# The timestamp is taken once so every row carries the same value.
created_at = datetime.now()
data = pd.DataFrame(
    {
        'MOVIE_ID': movie_ids,
        'NEAREST_NEIGHBORS': nns,
        'CREATED_AT': [created_at] * len(movie_ids),
    }
)

In [None]:
import snowflake.connector
from dotenv import load_dotenv
import os
from snowflake.connector.pandas_tools import write_pandas

# Pull SNOWFLAKE_* credentials from a local .env file into os.environ.
load_dotenv()

# Credentials come from the environment (a missing variable raises KeyError);
# warehouse, database and schema are fixed for this demo.
connection_parameters = dict(
    user=os.environ['SNOWFLAKE_USER'],
    password=os.environ['SNOWFLAKE_PASSWORD'],
    account=os.environ['SNOWFLAKE_ACCOUNT'],
    warehouse="DEMO_WH",
    # Database and schema are required by Tecton to create various
    # temporary objects.
    database="TECTON",
    schema="PUBLIC",
)
conn = snowflake.connector.connect(**connection_parameters)

In [None]:
# Upload the neighbor table. The explicit database/schema given here take the
# place of the connection defaults (TECTON / PUBLIC) for this write.
# NOTE(review): no auto-create option is passed, so the target table
# presumably has to exist already -- confirm.
write_pandas(
    conn,
    data,
    "MOVIE_NEAREST_NEIGHBORS",
    database='DEV_DAVID',
    schema="MOVIELENS_25M",
)