In [2]:
import pandas as pd

# Load dataset
file_path = "movies.csv"
movies = pd.read_csv(file_path)

# Preview dataset
print(movies.head())


   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy based on novel sec

In [7]:
# Pilih kolom yang relevan
movies = movies[["title", "genres", "overview", "vote_average", "popularity"]]

# Isi data kosong pada kolom 'overview'
movies["overview"] = movies["overview"].fillna(" ")

# Hapus baris dengan nilai kosong di kolom 'title' atau 'genres'
movies = movies.dropna(subset=["title", "genres"])

print(movies.info())

<class 'pandas.core.frame.DataFrame'>
Index: 4775 entries, 0 to 4802
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         4775 non-null   object 
 1   genres        4775 non-null   object 
 2   overview      4775 non-null   object 
 3   vote_average  4775 non-null   float64
 4   popularity    4775 non-null   float64
dtypes: float64(2), object(3)
memory usage: 223.8+ KB
None


In [20]:
def extract_genres_simple(genre_str):
    if pd.notnull(genre_str):  # Pastikan data tidak null
        return genre_str.split()  # Pisahkan genre berdasarkan spasi
    return []

movies["genre_list"] = movies["genres"].apply(extract_genres_simple)

In [21]:
print(movies["genres"].head())

0    Action Adventure Fantasy Science Fiction
1                    Adventure Fantasy Action
2                      Action Adventure Crime
3                 Action Crime Drama Thriller
4            Action Adventure Science Fiction
Name: genres, dtype: object


In [22]:
print(movies[["title", "genre_list"]].head())

                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                       genre_list  
0  [Action, Adventure, Fantasy, Science, Fiction]  
1                    [Adventure, Fantasy, Action]  
2                      [Action, Adventure, Crime]  
3                [Action, Crime, Drama, Thriller]  
4           [Action, Adventure, Science, Fiction]  


In [23]:
movies.to_csv("processed_movies.csv", index=False)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF vektor dari overview
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["overview"])

# Matriks kemiripan
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [26]:
def recommend_movies(title, top_n=10):
    indices = pd.Series(movies.index, index=movies["title"]).drop_duplicates()
    
    # Ambil index dari film yang diminta
    idx = indices[title]
    
    # Ambil skor kemiripan untuk film tersebut
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Urutkan berdasarkan skor
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Ambil top_n film
    sim_scores = sim_scores[1:top_n+1]
    
    # Ambil judul film
    movie_indices = [i[0] for i in sim_scores]
    return movies["title"].iloc[movie_indices]

# Contoh penggunaan
print(recommend_movies("The Dark Knight"))

3                         The Dark Knight Rises
428                              Batman Returns
3854    Batman: The Dark Knight Returns, Part 2
299                              Batman Forever
1359                                     Batman
119                               Batman Begins
1181                                        JFK
9            Batman v Superman: Dawn of Justice
2507                                  Slow Burn
210                              Batman & Robin
Name: title, dtype: object
