In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
movies = pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieid,imdbid,tmdbid,title,poster,genres,cast
0,182167,tt0023812,66822.0,Birds in the Spring,https://image.tmdb.org/t/p/original/3PH9xmPp8a...,"{Animation,Comedy}","{""Marion Darlington"",""Clarence Nash"",""Purv Pul..."
1,182169,tt0024977,67585.0,The China Shop,https://image.tmdb.org/t/p/original/jpKtLIcANy...,{Animation},
2,77,tt0113973,124626.0,Nico Icon,https://image.tmdb.org/t/p/original/oE5N0E2YGp...,{Documentary},"{Nico,""Tina Aumont"",""Christian Aaron Boulogne""..."
3,116,tt0112373,51352.0,Anne Frank Remembered,https://image.tmdb.org/t/p/original/6nyhzrSGim...,"{Documentary,Drama}","{""Kenneth Branagh"",""Glenn Close"",""Anne Frank"",..."
4,162,tt0109508,26564.0,Crumb,https://image.tmdb.org/t/p/original/oJWGWzaYdY...,{Documentary},"{""Robert Crumb"",""Aline Kominsky"",""Charles Crum..."


In [5]:
def _format(features):
    features = features.replace('{', '').replace('}', '').replace('"', '').split(',')
    features = [f.replace(' ', '') for f in features]
    return ' '.join(features)

In [6]:
movies['cast'].fillna('unknown', inplace=True)
movies['genres'].fillna('other', inplace=True)
movies['genres'] = movies['genres'].apply(_format)
movies['cast'] = movies['cast'].apply(_format)
movies['movieid'] = movies['movieid'].astype(str)
movies.head()

Unnamed: 0,movieid,imdbid,tmdbid,title,poster,genres,cast
0,182167,tt0023812,66822.0,Birds in the Spring,https://image.tmdb.org/t/p/original/3PH9xmPp8a...,Animation Comedy,MarionDarlington ClarenceNash PurvPullen MaeQu...
1,182169,tt0024977,67585.0,The China Shop,https://image.tmdb.org/t/p/original/jpKtLIcANy...,Animation,unknown
2,77,tt0113973,124626.0,Nico Icon,https://image.tmdb.org/t/p/original/oE5N0E2YGp...,Documentary,Nico TinaAumont ChristianAaronBoulogne Jackson...
3,116,tt0112373,51352.0,Anne Frank Remembered,https://image.tmdb.org/t/p/original/6nyhzrSGim...,Documentary Drama,KennethBranagh GlennClose AnneFrank OttoFrank
4,162,tt0109508,26564.0,Crumb,https://image.tmdb.org/t/p/original/oJWGWzaYdY...,Documentary,RobertCrumb AlineKominsky CharlesCrumb MaxonCr...


In [7]:
movies['movieid'] = movies['movieid'].astype(str)
movieid_map = [(i, movieid) for i, movieid in enumerate(movies['movieid'])]
movieid_map = pd.DataFrame(movieid_map, columns=['index', 'movieid'])
movieid_map.to_csv('../data/movieids.csv', index=False)

In [10]:
# save movieid with index without set
movieid_map = movies[['movieid']].reset_index()
movieid_map.to_csv('../data/movieids2.csv', index=False)

In [14]:
vectorizer = CountVectorizer()
title = vectorizer.fit_transform(movies['title'])
genres = vectorizer.fit_transform(movies['genres'])
cast = vectorizer.fit_transform(movies['cast'])
features = hstack([title, genres, cast])

In [15]:
import joblib
# joblib.dump(features, 'knn_features.dump')
knn = NearestNeighbors(n_neighbors=10, metric='euclidean')
knn.fit(features)

In [11]:
movieid_map = pd.read_csv('../data/movieids.csv', index_col='index')
movieid_map.head()

Unnamed: 0_level_0,movieid
index,Unnamed: 1_level_1
0,182167
1,182169
2,77
3,116
4,162


In [13]:
len(movieid_map)

86537

In [16]:
features.shape

(86537, 580054)

In [None]:
def predict(movieids, watched):
    watched = watched.split(',')
    watched = [int(movieid_map.loc[movieid_map['movieid'] == movieid].index[0]) for movieid in watched]
    watched = [features[watched].sum(axis=0)]
    watched = hstack(watched)
    _, indices = knn.kneighbors(watched, n_neighbors=10)
    indices = indices[0]
    return movieid_map.loc[indices]['movieid'].values

In [3]:
# ckeck for nan values

movies.isnull().sum()

movieid       0
imdbid        0
tmdbid      126
title         0
poster     2862
genres        0
cast       4677
dtype: int64

In [4]:
movies['cast'].fillna('Unknown', inplace=True)
movies['poster'].fillna('Unknown', inplace=True)
movies['tmdbid'].fillna('Unknown', inplace=True)

  movies['tmdbid'].fillna('Unknown', inplace=True)


In [5]:
movies.isnull().sum()

movieid    0
imdbid     0
tmdbid     0
title      0
poster     0
genres     0
cast       0
dtype: int64

In [6]:
def get_genres(genres):
    genres = genres.replace('{', '').replace('}', '').replace('"', '').split(',')
    return ' '.join(genres)

def get_cast(cast):
    cast = cast.replace('{', '').replace('}', '').replace('"', '').split(',')
    cast = [c.replace(' ', '') for c in cast]
    return ' '.join(cast)

In [7]:
# test the functions
print(get_genres(movies['genres'][0]))
print(get_cast(movies['cast'][0]))

Animation Comedy
MarionDarlington ClarenceNash PurvPullen MaeQuestel


In [8]:
# apply the functions to the dataframe
movies['genres'] = movies['genres'].apply(get_genres)
movies['cast'] = movies['cast'].apply(get_cast)

In [9]:
movies.head()

Unnamed: 0,movieid,imdbid,tmdbid,title,poster,genres,cast
0,182167,tt0023812,66822.0,Birds in the Spring,https://image.tmdb.org/t/p/original/3PH9xmPp8a...,Animation Comedy,MarionDarlington ClarenceNash PurvPullen MaeQu...
1,182169,tt0024977,67585.0,The China Shop,https://image.tmdb.org/t/p/original/jpKtLIcANy...,Animation,Unknown
2,77,tt0113973,124626.0,Nico Icon,https://image.tmdb.org/t/p/original/oE5N0E2YGp...,Documentary,Nico TinaAumont ChristianAaronBoulogne Jackson...
3,116,tt0112373,51352.0,Anne Frank Remembered,https://image.tmdb.org/t/p/original/6nyhzrSGim...,Documentary Drama,KennethBranagh GlennClose AnneFrank OttoFrank
4,162,tt0109508,26564.0,Crumb,https://image.tmdb.org/t/p/original/oJWGWzaYdY...,Documentary,RobertCrumb AlineKominsky CharlesCrumb MaxonCr...


In [13]:
vectorizer = CountVectorizer()
genres = vectorizer.fit_transform(movies['genres'])
cast = vectorizer.fit_transform(movies['cast'])
title = vectorizer.fit_transform(movies['title'])
matrix = hstack((genres, cast, title))

matrix.shape

(86537, 580055)

In [14]:
knn = NearestNeighbors(n_neighbors=10, metric='euclidean')
knn.fit(matrix)

In [2]:
import joblib

In [16]:
joblib.dump(knn, 'knn_model.dump')
joblib.dump(matrix, 'matrix.dump')

['matrix.dump']

In [7]:
matrix = joblib.load('D:\Desktop\DB_CRUD_OP\AI\models\knn_matrix.dump')
knn = NearestNeighbors(n_neighbors=10, metric='euclidean')
knn.fit(matrix)

In [3]:
movieid_map = pd.read_csv('../data/movieids.csv', index_col='index')
movieid_map.head()

Unnamed: 0_level_0,movieid
index,Unnamed: 1_level_1
0,182167
1,182169
2,77
3,116
4,162


In [22]:
def get_recommendation(movieid):
    movie = movies[movies['movieid'] == movieid]
    movie_index = movie.index[0]
    distances, indices = knn.kneighbors(combined[movie_index], n_neighbors=11)
    recommended_movies = []
    for i in indices:
        recommended_movies.append(movies.iloc[i])
    return recommended_movies

def get_movieid(title):
    return movies[movies['title'] == title]['movieid'].values[0]

def get_movie_title(movieid):
    return movies[movies['movieid'] == movieid]['title'].values[0]

In [21]:
def get_recommendation(user_ratings, top_n=50): # list of tuples (movieid, rating)
    # use weighted distance based on rating and get top n recommendations in total
    combined = {}
    for movieid, rating in user_ratings:
        movie = movies[movies['movieid'] == movieid]
        movie_index = movie.index[0]
        distances, indices = knn.kneighbors(combined[movie_index], n_neighbors=11)
        for i in indices:
            if i not in combined:
                combined[i] = 0
            combined[i] += distances[i] / rating
    combined = sorted(combined.items(), key=lambda x: x[1], reverse=True)
    recommended_movies = []
    for i in combined[:top_n]:
        recommended_movies.append(movies.iloc[i])
    return recommended_movies

[[0.         0.4452998  0.64887656 0.68011836 0.68302846 0.70212227
  0.72220805 0.72563526 0.73050222 0.73472095 0.7415617 ]]


[       movieid     imdbid    tmdbid                       title  \
 48           1  tt0114709     862.0                   Toy Story   
 5105      3114  tt0120363     863.0                 Toy Story 2   
 16732    78499  tt0435761   10193.0                 Toy Story 3   
 4348      2355  tt0120623    9487.0                A Bug's Life   
 61443   201588  tt1979376  301528.0                 Toy Story 4   
 25671   120474  tt3473654  256835.0  Toy Story That Time Forgot   
 24278   115875  tt1850374   77887.0           Hawaiian Vacation   
 6840      4886  tt0198781     585.0              Monsters, Inc.   
 2773       588  tt0103639     812.0                     Aladdin   
 22219   106022  tt2446040  213121.0        Toy Story of Terror!   
 80013   267944  tt0268096  193970.0     Christmas in Tattertown   
 
                                                   poster  \
 48     https://image.tmdb.org/t/p/original/uXDfjJbdP4...   
 5105   https://image.tmdb.org/t/p/original/2MFIhZAW0C...   

In [28]:
# save all model components
import joblib
joblib.dump(knn, 'knn_model.pkl')
# joblib.dump(movies, 'movies.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(combined, 'matrix.pkl')

['matrix.pkl']

In [30]:

# Assuming 'knn_model' is your KNN model and 'count_vectorizer' is your CountVectorizer
joblib.dump(knn, 'knn_model.dump')
joblib.dump(vectorizer, 'count_vectorizer.dump')

['count_vectorizer.dump']