In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-20m-dataset/rating.csv
/kaggle/input/movielens-20m-dataset/link.csv
/kaggle/input/movielens-20m-dataset/genome_tags.csv
/kaggle/input/movielens-20m-dataset/genome_scores.csv
/kaggle/input/movielens-20m-dataset/tag.csv
/kaggle/input/movielens-20m-dataset/movie.csv


In [9]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

## Load the datasets : we use two dataset --> (1).movie.csv, (2).rating.csv

In [16]:
movies = pd.read_csv("/kaggle/input/movielens-20m-dataset/movie.csv")
ratings = pd.read_csv("/kaggle/input/movielens-20m-dataset/rating.csv")

In [17]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [5]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

## Merge movies and ratings on 'movieId'

In [19]:
data = pd.merge(ratings, movies, on='movieId')

In [20]:
data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00,Up (2009),Adventure|Animation|Children|Drama
20000259,138493,69526,4.5,2009-12-03 18:31:48,Transformers: Revenge of the Fallen (2009),Action|Adventure|Sci-Fi|IMAX
20000260,138493,69644,3.0,2009-12-07 18:10:57,Ice Age: Dawn of the Dinosaurs (2009),Action|Adventure|Animation|Children|Comedy|Rom...
20000261,138493,70286,5.0,2009-11-13 15:42:24,District 9 (2009),Mystery|Sci-Fi|Thriller


## Feature Engineering for genres : 

* Split genres into a list. And use MultiLabelBinarizer for encoding


In [21]:
data['genres'] = data['genres'].apply(lambda x: x.split('|'))  # Convert genres to lists

In [24]:
data.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),"[Adventure, Children, Fantasy]"
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...","[Adventure, Drama, Fantasy, Mystery, Sci-Fi]"
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),"[Mystery, Sci-Fi, Thriller]"
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"


In [22]:
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(data['genres'])

In [26]:
genres_encoded.shape

(20000263, 20)

## Create a new DataFrame for encoding

In [27]:
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)
data = pd.concat([data, genres_df], axis=1)

## Aggregate data by movieId: Calculate average ratings for each movie

In [28]:
movie_features = data.groupby('movieId').agg({
    'rating': 'mean',  # Average rating
    **{genre: 'max' for genre in mlb.classes_}  # Max value for each genre
}).reset_index()

In [29]:
movie_features

Unnamed: 0,movieId,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,3.921240,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,3.211977,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,3.151040,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,2.861393,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,3.064592,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26739,131254,4.000000,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
26740,131256,4.000000,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
26741,131258,2.500000,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26742,131260,3.000000,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Extract features and target

In [30]:

X = movie_features.drop(['movieId'], axis=1)  # Features
movie_ids = movie_features['movieId']  # Keep track of movie IDs

In [31]:
X

Unnamed: 0,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,3.921240,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3.211977,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.151040,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,2.861393,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,3.064592,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26739,4.000000,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26740,4.000000,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26741,2.500000,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26742,3.000000,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Fit KNN model

In [32]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(X)

## Recommend movies for a given movieId

In [38]:
movie_index = 4  # Choose a movie index to find recommendations
distances, indices = knn.kneighbors([X.iloc[movie_index]])

## Display recommendations

In [36]:
print("Recommendations for:", movies[movies['movieId'] == movie_ids.iloc[movie_index]]['title'].values[0])
for i in indices[0]:
    print(movies[movies['movieId'] == movie_ids.iloc[i]]['title'].values[0])

Recommendations for: Father of the Bride Part II (1995)
Father of the Bride Part II (1995)
Semi-Tough (1978)
Arthur (2011)
Bachelor Party (1984)
Rocker, The (2008)
