<a href="https://colab.research.google.com/github/widura26/machine-learning-portfolio/blob/main/recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Konfigurasi untuk proyek ini
*   Download dataset [Kaggle MovieLens Dataset](https://www.kaggle.com/datasets/prajitdatta/movielens-100k-dataset?resource=download)
*   Letakkan folder dataset yang telah diunduh (berisi `u.data`, `u.item`, dan `u.user`) di Google Drive.
*   Sesuaikan nilai variabel `file_path` dengan lokasi folder tersebut di Google Drive.

In [1]:
#this file is used to prepare for the next project
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive

## Data Loading

In [2]:
# Mount Google Drive at /content/drive so the dataset folder stored there can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Folder on the mounted Drive holding the MovieLens files this notebook reads
# (u.data, u.item, u.user). Adjust to wherever the dataset was uploaded.
file_path = '/content/drive/MyDrive/dataset/movieDatasets'

## Data Collection

In [4]:
# Ratings table loaded from the tab-separated u.data file
# (columns: user_id, item_id, rating, timestamp).
# Bound to `ratings` rather than `users`: the users cell further down rebinds
# `users` to the demographics table, which would silently discard this frame.
ratings = pd.read_csv(
    f"{file_path}/u.data",
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [5]:
# Movie metadata from the pipe-separated, latin-1 encoded u.item file.
# The file has no header row, so column names are supplied explicitly:
# five descriptive fields followed by the genre indicator columns.
movies = pd.read_csv(
    f"{file_path}/u.item",
    sep="|",
    encoding="latin-1",
    names=[
        "movie_id", "title", "release_date", "video_release_date", "imdb_url",
        "unknown", "Action", "Adventure", "Animation", "Children's",
        "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
        "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
        "Sci-Fi", "Thriller", "War", "Western",
    ],
)

# Sanity check: show the row for movie_id 1.
movies.loc[movies["movie_id"] == 1]

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# User demographics from the pipe-separated, latin-1 encoded u.user file
# (no header row, so column names are supplied explicitly).
users = pd.read_csv(
    f"{file_path}/u.user",
    sep="|",
    encoding="latin-1",
    names=["user_id", "age", "gender", "occupation", "zip_code"],
)
users.head(n=5)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## Data Preprocessing / Cleaning

In [7]:
# Remove video_release_date; the genre-based recommender below never reads it.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
movies = movies.drop(columns=['video_release_date'], errors='ignore')
movies

Unnamed: 0,movie_id,title,release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Cosine similarity

In [8]:
# Genre indicator columns of `movies`; together they form each film's feature vector.
genre_cols = [
    "unknown",
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Pairwise cosine similarity between the genre vectors of all films;
# entry [i, j] compares row i of `movies` with row j.
cosine_sim = cosine_similarity(movies.loc[:, genre_cols], movies.loc[:, genre_cols])

# Lookup: title -> index label of that film in `movies`.
# NOTE: `cosine_sim` is positional, so this relies on `movies` keeping the
# default 0..n-1 index produced by read_csv (labels == row positions).
indices = pd.Series(data=movies.index, index=movies["title"])

In [9]:
def recommend(title, n=5):
    """Return the `n` films whose genre vectors are most similar to `title`.

    Looks `title` up in the module-level `indices` mapping, ranks every film by
    its score in the matching row of the module-level `cosine_sim` matrix, and
    returns the best matches from the module-level `movies` frame.

    Parameters
    ----------
    title : str
        Exact film title as stored in ``movies['title']``.
    n : int, default 5
        Maximum number of recommendations; values <= 0 yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        Up to `n` rows with columns ``title`` and ``release_date``, ordered
        from most to least similar (ties keep ascending row order).

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    if title not in indices.index:
        raise KeyError(f"Movie {title!r} not found")

    idx = indices[title]
    # A title that appears more than once makes the lookup return several
    # entries; use the first so `idx` is always a single row number.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Stable sort on negated scores: descending similarity, and equal scores
    # stay in ascending row order (same tie order as sorted(..., reverse=True)).
    ranked = np.argsort(-scores, kind="stable")
    # Exclude the query by its row number rather than assuming it is ranked
    # first: films with identical genre vectors tie at 1.0, so an earlier row
    # can outrank the query and the query itself would be "recommended".
    ranked = ranked[ranked != idx]
    top = ranked[:max(n, 0)]
    return movies[['title', 'release_date']].iloc[top]

# Example: the five films closest in genre to Toy Story.
print(recommend(title="Toy Story (1995)", n=5))

                                       title release_date
421   Aladdin and the King of Thieves (1996)  01-Jan-1996
94                            Aladdin (1992)  01-Jan-1992
1218                   Goofy Movie, A (1995)  01-Jan-1995
62                  Santa Clause, The (1994)  01-Jan-1994
93                         Home Alone (1990)  01-Jan-1990


In [10]:
# Show the query film next to one of its recommendations to compare their genre flags.
movies.loc[movies["title"].isin(["Home Alone (1990)", "Toy Story (1995)"])]

Unnamed: 0,movie_id,title,release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
93,94,Home Alone (1990),01-Jan-1990,http://us.imdb.com/M/title-exact?Home%20Alone%...,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# NOTE: disabled experiment, kept for reference only. It is an alternative
# recommender that vectorizes genre names plus release year with CountVectorizer.
# If re-enabled it would rebind `movies` and `cosine_sim` and redefine
# `recommend`, replacing the versions defined earlier in this notebook.

# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Load dataset
# movies = pd.read_csv(
#     "u.item",
#     sep='|',
#     encoding="latin-1",
#     names=["movie_id", "title", "release_date", "video_release_date",
#            "imdb_url", "unknown", "Action", "Adventure", "Animation",
#            "Children's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
#            "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
#            "Thriller", "War", "Western"]
# )

# # Extract the release year from release_date
# movies["release_year"] = movies["release_date"].str[-4:]

# # Build a 'genres' column as a single string
# genre_columns = ["unknown","Action","Adventure","Animation","Children's","Comedy",
#                  "Crime","Documentary","Drama","Fantasy","Film-Noir","Horror",
#                  "Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western"]

# movies["genres"] = movies[genre_columns].apply(lambda x: " ".join([col for col in genre_columns if x[col] == 1]), axis=1)

# # Combine features: genre + release_year
# movies["combined_features"] = movies["genres"] + " " + movies["release_year"].fillna("")

# # Vectorize
# vectorizer = CountVectorizer()
# feature_matrix = vectorizer.fit_transform(movies["combined_features"])

# # Cosine similarity
# cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

# # Recommendation function
# def recommend(movie_title, n=5):
#     if movie_title not in movies["title"].values:
#         return f"Movie '{movie_title}' tidak ditemukan."

#     idx = movies[movies["title"] == movie_title].index[0]
#     sim_scores = list(enumerate(cosine_sim[idx]))
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

#     print(f"Rekomendasi untuk '{movie_title}':")
#     for i, score in sim_scores[1:n+1]:
#         print(f"- {movies.iloc[i]['title']} (score: {score:.2f})")

# # Example usage
# recommend("Toy Story (1995)")
