# Import Library

In [1]:
import pandas as pd 
import numpy as np
from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Load Dataset

In [2]:
# Folder that holds the CSV inputs. Forward slashes are used when building the
# paths because they resolve on Windows, Linux and macOS alike, whereas a
# backslash is treated as part of the file name outside Windows.
DATA_DIR = 'Dataset'

df_credits = pd.read_csv(DATA_DIR + '/tmdb_5000_credits.csv')
df_movies = pd.read_csv(DATA_DIR + '/tmdb_5000_movies.csv')

movie = pd.read_csv(DATA_DIR + '/movie.csv')
rating = pd.read_csv(DATA_DIR + '/rating.csv')

# Movie Description Based Recommender

### Preprocessing

In [3]:
# Replace missing overviews with an empty string so the vectorizer receives text only
df_movies['overview'] = df_movies['overview'].fillna('')

# TF-IDF matrix of the overviews, with English stop words removed
tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(df_movies['overview'])

# Pairwise cosine similarity between every pair of overviews
cosine_sim = cosine_similarity(tfidf_matrix)

# Reverse map: movie title -> row label in df_movies.
# Series.drop_duplicates() de-duplicates the *values* (row labels, already
# unique), so it never removed repeated titles. Filtering on the index keeps
# the first row of each title and guarantees that a lookup yields one label.
indices = pd.Series(df_movies.index, index=df_movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

### Recommender

In [4]:
def get_recommendations(title, top_n=10):
    """Recommend movies whose overview is most similar to that of `title`.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level `indices` map.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Title' and 'Similarity Score', ordered from the highest to
        the lowest score. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    idx = indices[title]

    # A title shared by several movies makes the lookup return a Series;
    # fall back to its first match so the rest works on a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)

    # Stable descending order: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')

    # Exclude the queried movie by position instead of assuming it sorts
    # first (its self-similarity is 0 when the overview is empty, and other
    # movies can tie with it).
    order = order[order != idx][:top_n]

    return pd.DataFrame({
        'Title': df_movies['title'].iloc[order],
        'Similarity Score': scores[order]
    })

In [5]:
# Prompt for a movie title and show its overview-based recommendations.
# The call is the cell's last expression, so the returned DataFrame is displayed.
get_recommendations(input("Masukkan Judul Film : "))

Unnamed: 0,Title,Similarity Score
1269,Raise the Titanic,0.141998
2143,Ghost Ship,0.106122
2287,I Can Do Bad All By Myself,0.098245
770,Event Horizon,0.096332
4287,Niagara,0.093108
3212,The Rose,0.08778
2902,Triangle,0.086359
4228,The Ballad of Jack and Rose,0.086188
171,Master and Commander: The Far Side of the World,0.083704
104,Poseidon,0.083474


# Movie Metadata Based Recommender

Cast, Director, Keywords, Genres

### Preprocessing

In [6]:
# Rename the credits columns positionally so the frame exposes an 'id' key,
# then join it to the movies frame on that key.
df_credits.columns = ['id', 'title', 'cast', 'crew']
df_metadata = pd.merge(df_movies, df_credits, on='id')

# Run every cell of the listed columns through literal_eval to turn the stored
# text into Python objects.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    df_metadata[feature] = df_metadata[feature].map(literal_eval)

def get_director(x):
    """Return the director's name from an iterable of crew entries.

    Entries that are not dictionaries are skipped. The 'name' of the first
    dictionary whose 'job' equals 'Director' is returned (None when that entry
    has no 'name' key). When no entry matches, np.nan is returned.
    """
    director_names = (
        member.get('name')
        for member in x
        if isinstance(member, dict) and member.get('job') == 'Director'
    )
    # First match wins; np.nan marks "no director found".
    return next(director_names, np.nan)

def get_FiguresList(x):
    """Return the 'name' of at most the first three entries of `x`.

    `x` is expected to be a list of dictionaries (for example the cast).
    Anything that is not a list (missing or malformed data) yields an empty
    list.
    """
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep the leading three (or fewer).
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

def clean_data(x):
    """Lower-case `x` and strip its spaces.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value (for example a missing director) becomes an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

def combine_features(x):
    """Build one space-separated string from a row's metadata.

    The order is keywords, cast, director, genres; the list-valued fields are
    themselves joined with single spaces.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

# Add a director column, then reduce cast, keywords and genres to name lists
df_metadata['director'] = df_metadata['crew'].apply(get_director)
features = ['cast', 'keywords', 'genres']
for feature in features:
    df_metadata[feature] = df_metadata[feature].apply(get_FiguresList)

# Normalise the feature values (lower-case, spaces removed)
features = ['cast', 'keywords', 'director', 'genres']
for feature in features:
    df_metadata[feature] = df_metadata[feature].apply(clean_data)

# Combine the cleaned features into one text field per movie
df_metadata['combine'] = df_metadata.apply(combine_features, axis=1)

# Count Vectorizer
count = CountVectorizer(stop_words='english')
count_matrix_2 = count.fit_transform(df_metadata['combine'])

# Cosine Similarity
cosine_sim_2 = cosine_similarity(count_matrix_2)

# Reverse map: movie title -> row position in df_metadata.
# Titles can repeat, and a repeated title would make a lookup return several
# positions; keep only the first row of each title so every lookup yields
# exactly one position.
df_metadata = df_metadata.reset_index()
indices_2 = pd.Series(df_metadata.index, index=df_metadata['title_x'])
indices_2 = indices_2[~indices_2.index.duplicated(keep='first')]

In [7]:
# Preview five rows of the engineered features. The fixed random_state makes
# the same rows appear on every Restart & Run All.
df_metadata[['title_x', 'cast', 'director', 'keywords', 'genres']].sample(5, random_state=42)

Unnamed: 0,title_x,cast,director,keywords,genres
3478,College,"[drakebell, andrewcaldwell, andreemoss]",debhagan,"[sexuality, college, girlfriend]",[comedy]
221,Stuart Little 2,"[michaelj.fox, geenadavis, hughlaurie]",robminkoff,"[mouse, falcon, bird]","[family, adventure, animation]"
4013,Hud,"[paulnewman, melvyndouglas, patricianeal]",martinritt,"[alcoholism, ranchers, rancher]","[action, drama, western]"
973,Basic,"[johntravolta, connienielsen, samuell.jackson]",johnmctiernan,"[drugaddiction, militarycourt, panama]","[action, drama, mystery]"
972,The Host,"[saoirseronan, dianekruger, jakeabel]",andrewniccol,"[basedonnovel, massmurder, dystopia]","[action, adventure, romance]"


### Recommender

In [8]:
def get_recommendations2(title, top_n=10):
    """Recommend movies whose metadata is most similar to that of `title`.

    Similarity comes from the module-level `cosine_sim_2` matrix, whose rows
    follow the row order of `df_metadata`.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level `indices_2` map.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Title' and 'Similarity Score', ordered from the highest to
        the lowest score. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices_2`.
    """
    idx = indices_2[title]

    # A title shared by several movies makes the lookup return a Series;
    # fall back to its first match so the rest works on a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim_2[idx], dtype=float)

    # Stable descending order: ties keep their original row order.
    order = np.argsort(-scores, kind='stable')

    # Exclude the queried movie by position instead of assuming it sorts first.
    order = order[order != idx][:top_n]

    # Titles are read from df_metadata, the frame the similarity matrix was
    # built from, so positions and titles cannot drift apart.
    return pd.DataFrame({
        'Title': df_metadata['title_x'].iloc[order],
        'Similarity Score': scores[order]
    })

In [9]:
# Prompt for a movie title and show its metadata-based recommendations.
# The call is the cell's last expression, so the returned DataFrame is displayed.
get_recommendations2(input("Masukkan Judul Film : "))

Unnamed: 0,Title,Similarity Score
102,The Hunger Games: Mockingjay - Part 2,0.6
200,The Hunger Games: Mockingjay - Part 1,0.572078
183,The Hunger Games: Catching Fire,0.5
661,Zathura: A Space Adventure,0.4
193,After Earth,0.33541
91,Independence Day: Resurgence,0.316228
101,X-Men: First Class,0.316228
256,Allegiant,0.316228
294,Epic,0.316228
1326,The 5th Wave,0.316228


# User Rating Based Recommender

### Preprocessing

In [10]:
# Number of merged rows kept before pivoting (presumably to keep the
# user x title table at a manageable size - adjust as memory allows).
MAX_RATING_ROWS = 1500000

# Keep only the columns used below
movie = movie.loc[:, ["movieId", "title"]]
rating = rating.loc[:, ["userId", "movieId", "rating"]]

# Merge both datasets on their shared column(s) and keep the first rows.
# .copy() detaches the slice from the merged frame so that the title
# assignment below does not raise SettingWithCopyWarning.
data = pd.merge(movie, rating).iloc[:MAX_RATING_ROWS, :].copy()

# Strip a trailing "(YYYY)" year from each title
data['title'] = data['title'].str.replace(r'\s*\(\d{4}\)$', '', regex=True)

# One row per user, one column per title, cells holding the rating
pivot_table = data.pivot_table(index=["userId"], columns=["title"], values="rating")

### Recommender

In [11]:
def get_recommendations3(title, top_n=10):
    """Recommend movies whose user ratings correlate most with those of `title`.

    Parameters
    ----------
    title : str
        Column of the module-level `pivot_table` to compare against.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'Title' and 'Similarity Score', ordered from the highest to
        the lowest correlation. The queried title and titles whose
        correlation is undefined (NaN) are never included.

    Raises
    ------
    KeyError
        If `title` is not a column of `pivot_table`.
    """
    # Ratings given to the queried movie
    movie_watched = pivot_table[title]

    # Correlation of every movie's ratings with the queried movie's ratings
    similarity_with_other_movies = pivot_table.corrwith(movie_watched)

    # Remove the queried title by label: other titles can tie with it at 1.0,
    # so it is not guaranteed to sort first. Undefined correlations are
    # dropped so they cannot pad the result.
    similarity_with_other_movies = (
        similarity_with_other_movies
        .drop(labels=title, errors='ignore')
        .dropna()
    )

    # Highest correlation first; mergesort is stable, so ties keep column order
    top_recommendations = (
        similarity_with_other_movies
        .sort_values(ascending=False, kind='mergesort')
        .head(top_n)
    )

    return pd.DataFrame({
        'Title': top_recommendations.index,
        'Similarity Score': top_recommendations.values
    })

In [20]:
# Prompt for a movie title and show its rating-correlation recommendations.
# The call is the cell's last expression, so the returned DataFrame is displayed.
get_recommendations3(input("Masukkan Judul Film : "))

  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0,Title,Similarity Score
0,Pie in the Sky,0.931813
1,Happiness Is in the Field (Bonheur est dans le...,0.891447
2,Two Bits,0.864008
3,Headless Body in Topless Bar,0.839254
4,Gospa,0.832061
5,Nobody Loves Me (Keiner liebt mich),0.762271
6,"Journey of August King, The",0.727521
7,Nueba Yol,0.717138
8,"Show, The",0.690066
9,"Neon Bible, The",0.690058
