# KNN

In [1]:
import pandas as pd
from scipy.sparse import hstack
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Load the movie table (ids, title, poster URL, genres, cast) from the Kaggle
# working directory.
movies = pd.read_csv('/kaggle/working/movies.csv')
# Preview the first rows to sanity-check the parsed columns.
movies.head()

Unnamed: 0,movieid,imdbid,tmdbid,title,poster,genres,cast
0,182167,tt0023812,66822.0,Birds in the Spring,https://image.tmdb.org/t/p/original/3PH9xmPp8a...,"['Animation', 'Comedy']","['Marion Darlington', 'Clarence Nash', 'Purv P..."
1,182169,tt0024977,67585.0,The China Shop,https://image.tmdb.org/t/p/original/jpKtLIcANy...,['Animation'],
2,77,tt0113973,124626.0,Nico Icon,https://image.tmdb.org/t/p/original/oE5N0E2YGp...,['Documentary'],"['Nico', 'Tina Aumont', 'Christian Aaron Boulo..."
3,116,tt0112373,51352.0,Anne Frank Remembered,https://image.tmdb.org/t/p/original/6nyhzrSGim...,"['Documentary', 'Drama']","['Kenneth Branagh', 'Glenn Close', 'Anne Frank..."
4,162,tt0109508,26564.0,Crumb,https://image.tmdb.org/t/p/original/oJWGWzaYdY...,['Documentary'],"['Robert Crumb', 'Aline Kominsky', 'Charles Cr..."


In [23]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59695 entries, 0 to 59694
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieid  59695 non-null  int64  
 1   imdbid   59695 non-null  object 
 2   tmdbid   59619 non-null  float64
 3   title    59695 non-null  object 
 4   poster   58201 non-null  object 
 5   genres   59695 non-null  object 
 6   cast     56829 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 3.2+ MB


In [24]:
# Fill missing cast / genres entries with placeholder strings so the text
# clean-up below always receives a str.
# NOTE(review): every row without cast data ends up carrying the literal token
# 'unknown' -- confirm that is intended for the similarity features.
movies = movies.fillna({'cast': 'unknown', 'genres': 'other'})

In [25]:
def _format(features):
    features = features.replace('{', '').replace('}', '').replace('[', '')\
    .replace(']', '').replace('"', '').replace("'", '').split(',')
    features = [f.replace(' ', '') for f in features]
    return ' '.join(features)

In [26]:
# Normalise both list-like text columns into space-separated tokens.
# Run this once per load: _format removes every space, so applying it to text
# that is already formatted would fuse the tokens together.
for column in ('genres', 'cast'):
    movies[column] = movies[column].map(_format)

In [30]:
# Spot-check one formatted cast string (row label 50).
movies.loc[50, 'cast']

'EmilioCigoli IsaPola LucianoDeAmbrosis ErnestoCalindri GiovannaCigoli TeclaScarano AdrianoRimoldi DinaPerbellini JoneFrigerio MariaGardena NicolettaParodi MarcelloMastroianni OlintoCristina MarioGallina ArmandoMigliari GuidoMorisi'

In [31]:
# Spot-check one formatted genres string (row label 5).
movies.loc[5, 'genres']

'Animation Comedy Family TVMovie'

In [32]:
# Save the row-index -> movieid lookup next to the features.
# Presumably used to map feature-matrix rows back to movie ids -- verify against the consumer.
movieid_map = movies['movieid'].to_frame().reset_index()
movieid_map.to_csv('knn_movieids.csv', index=False)

In [33]:
import numpy as np

# Use one vectorizer per text field. Refitting a single CountVectorizer three
# times overwrites its vocabulary, so the title and genres vocabularies would
# be lost and those fields could not be transformed again later.
title_vectorizer = CountVectorizer()
genres_vectorizer = CountVectorizer()
cast_vectorizer = CountVectorizer()

title = title_vectorizer.fit_transform(movies['title'])
genres = genres_vectorizer.fit_transform(movies['genres'])
cast = cast_vectorizer.fit_transform(movies['cast'])

# `vectorizer` used to end up fitted on the cast column; keep that binding so
# later cells that read it see the same object state.
vectorizer = cast_vectorizer

# Stack the per-field count matrices column-wise. Ask for CSR explicitly:
# hstack returns COO by default, which does not support row indexing such as
# features[row_ids].
features = hstack([title, genres, cast], format='csr')

In [35]:
import joblib

# Serialise the stacked feature matrix to 'knn_features.dump'; the cell output
# is the list of written file names returned by joblib.dump.
joblib.dump(features, 'knn_features.dump')

['knn_features.dump']

In [12]:
# Build one bag-of-words document per movie from the cleaned genres, cast and
# title text. The loaded frame has no 'genres_str' / 'actors_str' columns; the
# cleaned text lives in 'genres' and 'cast', so read those instead.
text_data = movies['genres'] + ' ' + movies['cast'] + ' ' + movies['title']
vectorizer = CountVectorizer()
combined_matrix = vectorizer.fit_transform(text_data)

In [None]:
# Report the size of each per-field count matrix and of the combined matrix.
# The per-field matrices are the `genres`, `title` and `cast` variables built
# by the vectorizer cell earlier in this notebook.
print("Genres Matrix Shape:", genres.shape)
print("Title Matrix Shape:", title.shape)
print("Actors Matrix Shape:", cast.shape)
print("Combined Matrix Shape:", combined_matrix.shape)

In [16]:
# Fit a 5-nearest-neighbour index (Euclidean distance) on the combined
# bag-of-words matrix. fit() returns the estimator itself, so the construction
# and the fit can be chained.
knn_model = NearestNeighbors(metric='euclidean', n_neighbors=5).fit(combined_matrix)
knn_model

In [17]:
def get_recommendations(movie_titles):
    """Recommend titles that are nearest neighbours of the given movies.

    Each title is matched exactly against ``movies['title']`` (first match
    wins). The matching rows of ``combined_matrix`` are passed to
    ``knn_model.kneighbors`` and the neighbours of all seed movies are merged.

    Assumes ``combined_matrix`` rows are in the same order as the rows of
    ``movies`` and that ``knn_model`` was fitted on ``combined_matrix`` --
    confirm if either is rebuilt separately.

    Args:
        movie_titles: Iterable of exact title strings to use as seeds.

    Returns:
        pandas.Series of recommended titles, in neighbour order, with the seed
        movies removed and each recommendation listed once.

    Raises:
        IndexError: If a title is not present in ``movies['title']``.
    """
    # Work with positional row numbers: they are what combined_matrix[...] and
    # .iloc expect, and they stay correct even if `movies` has a non-default index.
    movie_indices = []
    for title in movie_titles:
        positions = (movies['title'] == title).to_numpy().nonzero()[0]
        if len(positions) == 0:
            raise IndexError(f"Title not found in movies: {title!r}")
        movie_indices.append(int(positions[0]))

    # Nothing to query: return an empty selection instead of handing an empty
    # matrix to kneighbors.
    if not movie_indices:
        return movies.iloc[[]]['title']

    _, indices = knn_model.kneighbors(combined_matrix[movie_indices])

    # Drop the seed movies and keep only the first occurrence of a neighbour
    # shared by several seeds, preserving neighbour order.
    seen = set(movie_indices)
    similar_movies_indices = []
    for idx in indices.flatten():
        idx = int(idx)
        if idx not in seen:
            seen.add(idx)
            similar_movies_indices.append(idx)

    return movies.iloc[similar_movies_indices]['title']

In [18]:
# Example usage
movie_list = ['Spider-Man (2002)','Avengers: Age of Ultron (2015)', "Ferris Bueller's Day Off (1986)"]
recommendations = get_recommendations(movie_list)
print("Recommendations:")
print(recommendations)

Recommendations:
5247                         Spider-Man 2 (2004)
2405                        Clockstoppers (2002)
78418                               River (2022)
49764                            Mountain (2017)
14746                       Avengers, The (2012)
23051    Avengers: Infinity War - Part II (2019)
23050     Avengers: Infinity War - Part I (2018)
23055          Captain America: Civil War (2016)
898                              H.O.T.S. (1979)
58574                                Poms (2019)
80352                           A Therapy (2012)
74875                       Skate Witches (1986)
Name: title, dtype: object


In [27]:
# Show the title column on its own; the rendered output is abbreviated with
# an ellipsis row because the frame is long.
movies[['title']]

Unnamed: 0,title
0,Hang 'Em High (1968)
1,Handle with Care (a.k.a. Citizen's Band) (1977)
2,Drunken Master (Jui kuen) (1978)
3,"Conformist, The (Conformista, Il) (1970)"
4,Hairspray (1988)
...,...
86532,Body Heat (1981)
86533,Ferris Bueller's Day Off (1986)
86534,"Year of Living Dangerously, The (1982)"
86535,Children of Paradise (Les enfants du paradis) ...


---