# Content-Based Movie Recommendation

## Preparing Data

Dataset from [grouplens](https://grouplens.org/datasets/movielens/)

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.neighbors import NearestNeighbors

In [None]:
data_movies = pd.read_csv('/content/drive/MyDrive/Dataset/Dataset Data Mining/Modul 6/Dataset Materi/movies.csv')
data_ratings = pd.read_csv('/content/drive/MyDrive/Dataset/Dataset Data Mining/Modul 6/Dataset Materi/ratings.csv')

In [None]:
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [None]:
data_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [None]:
data_movies.duplicated().sum()

0

In [None]:
data_ratings.duplicated().sum()

0

In [None]:
data_ratings.isna().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
timestamp,0


In [None]:
data_movies.isna().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0


In [None]:
df = pd.merge(data_movies, data_ratings, on='movieId')

In [None]:
df.sample(5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
98903,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,227,4.5,1447189572
90468,72998,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,332,4.0,1352672811
21759,1015,Homeward Bound: The Incredible Journey (1993),Adventure|Children|Drama,140,4.0,951416146
96331,106642,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,525,4.0,1476482422
60298,4128,"Lost Boys, The (1987)",Comedy|Horror|Thriller,274,4.5,1171758494


In [None]:
df = df.drop(columns=['timestamp', 'userId', 'movieId', 'rating'])

## Preprocessing Data

Supaya lebih efisien, kita akan menghapus keterangan tahun pada data title. Selanjutnya, kita mengekstraksi kolom genres dimana data dipisahkan oleh | , maka kita akan mengekstraknya dengan kolom baru dan menghapus kolom lama

In [None]:
def remove_year_from_title(title):
  return re.sub(r'\(\d{4}\)', '', title).strip()

df['title'] = df['title'].apply(remove_year_from_title)

In [None]:
df.sample(5)

Unnamed: 0,title,genres
78043,Branded to Kill (Koroshi no rakuin),Action|Crime|Drama
49575,"Dirty Dozen, The",Action|Drama|War
44162,"Color of Money, The",Drama
22476,That Thing You Do!,Comedy|Drama
34898,Good Will Hunting,Drama|Romance


In [None]:
df['genre'] = df['genres'].apply(lambda x: x.split('|'))
df = df.drop(columns=['genres'])

In [None]:
df.tail()

Unnamed: 0,title,genre
100831,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]"
100832,No Game No Life: Zero,"[Animation, Comedy, Fantasy]"
100833,Flint,[Drama]
100834,Bungo Stray Dogs: Dead Apple,"[Action, Animation]"
100835,Andrew Dice Clay: Dice Rules,[Comedy]


In [None]:
tfidf_title = TfidfVectorizer(stop_words='english')
tfidf_matrix_title = tfidf_title.fit_transform(df['title'])

df.fillna('')

Unnamed: 0,title,genre
0,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
3,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
4,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
...,...,...
100831,Black Butler: Book of the Atlantic,"[Action, Animation, Comedy, Fantasy]"
100832,No Game No Life: Zero,"[Animation, Comedy, Fantasy]"
100833,Flint,[Drama]
100834,Bungo Stray Dogs: Dead Apple,"[Action, Animation]"


In [None]:
tfidf_genre = TfidfVectorizer(stop_words='english')
all_genres = [genre for sublist in df['genre'] for genre in sublist]
tfidf_genre.fit(all_genres)
tfidf_matrix_genre = tfidf_genre.transform(df['genre'].apply(lambda x: ' '.join(x)))

In [None]:
tfidf_matrix = hstack([tfidf_matrix_title, tfidf_matrix_genre])

In [None]:
nn = NearestNeighbors(metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

## Fungsi Rekomendasi

In [None]:
def recommend_movies(movie_title, num_recommendations=5):
    movie_index = df[df['title'] == movie_title].index[0]
    distances, indices = nn.kneighbors(tfidf_matrix[movie_index], n_neighbors=num_recommendations + 100)
    recommended_movies = []

    for i in range(1, len(distances[0])):
        recommended_movie_index = indices[0][i]
        recommended_movie_title = df['title'].iloc[recommended_movie_index]
        distance = distances[0][i]
        recommended_movies.append((recommended_movie_title, distance))

    unique_recommendations = []
    for title, distance in recommended_movies:
        if title not in [rec[0] for rec in unique_recommendations]:
            unique_recommendations.append((title, distance))
        if len(unique_recommendations) == num_recommendations:
            break

    print(f"Rekomendasi untuk '{movie_title}':")
    for i, (title, distance) in enumerate(unique_recommendations):
        print(f"{i+1}. {title} (Jarak: {distance:.1f})")


## Testing

In [None]:
recommend_movies("Bungo Stray Dogs: Dead Apple", 5)

Rekomendasi untuk 'Bungo Stray Dogs: Dead Apple':
1. Batman: The Dark Knight Returns, Part 2 (Jarak: 0.5)
2. Mortal Kombat: The Journey Begins (Jarak: 0.5)
3. Superman/Doomsday (Jarak: 0.5)
4. Batman: Under the Red Hood (Jarak: 0.5)
5. Street Fighter II: The Animated Movie (Sutorîto Faitâ II gekijô-ban) (Jarak: 0.5)


In [None]:
recommend_movies("Scooby-Doo", 7)

Rekomendasi untuk 'Scooby-Doo':
1. Scooby-Doo (Jarak: 0.0)
2. Scooby-Doo! Curse of the Lake Monster (Jarak: 0.2)
3. Scooby-Doo 2: Monsters Unleashed (Jarak: 0.2)
4. Scooby-Doo! Abracadabra-Doo (Jarak: 0.3)
5. Big Top Scooby-Doo! (Jarak: 0.3)
6. Scooby-Doo! and the Samurai Sword (Jarak: 0.4)
7. Scooby-Doo Goes Hollywood (Jarak: 0.4)
