In [17]:
# Import necessary Python libraries/modules
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import pickle  # To save the model

# Step 3: Data Understanding
# Load the dataset locally (ensure the file path is correct)
movies_meta_data = pd.read_csv('Dataset/movies.csv')
movies_meta_data.info()  # Display dataset information

# Separate the year from the title using regex
# Extract year enclosed in parentheses and store in the 'year' column
movies_meta_data['year'] = movies_meta_data['title'].str.extract(r'\((\d{4})\)', expand=False)

# Convert 'year' to string type (this also handles NaN values by converting them to 'Unknown')
movies_meta_data['year'] = movies_meta_data['year'].fillna('Unknown').astype(str)

# Remove the year (and parentheses) from the title using regex
movies_meta_data['title'] = movies_meta_data['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

# Display the resulting dataframe to check the title and year columns
print(movies_meta_data[['title', 'year', 'genres']].head())

# Step 4: Data Preparation
# Select relevant columns
judul_movie = movies_meta_data['title'].tolist()
genre_movie = movies_meta_data['genres'].tolist()

# Create a DataFrame
data = pd.DataFrame({
    'judul': judul_movie,
    'genre': genre_movie,
    'year': movies_meta_data['year']
})

# Remove movies with genre '-'
data = data[data.genre != '-']

# Remove duplicate movie titles
data = data.drop_duplicates('judul')

# Reset the index of data
data.reset_index(drop=True, inplace=True)

# Step 5: Modeling
# Initialize CountVectorizer
tf = CountVectorizer()

# Fit and transform genre data into a matrix
tfidf_matrix = tf.fit_transform(data['genre'])

# Calculate cosine similarity on the tf-idf matrix
cosine_sim = cosine_similarity(tfidf_matrix)

# Step 6: Save the Model
# Save CountVectorizer and cosine similarity matrix
with open('count_vectorizer.pkl', 'wb') as file:
    pickle.dump(tf, file)

with open('cosine_similarity.pkl', 'wb') as file:
    pickle.dump(cosine_sim, file)

# Create a Series to map movie titles to indices
indices = pd.Series(index=data['judul'], data=data.index).drop_duplicates()

# Define a function to recommend movies based on the input title
# Function to recommend movies based on input title
def movie_recommendations(judul, cosine_sim=cosine_sim, items=data[['judul', 'genre', 'year']]):
    # Remove the year if it's included in the input title
    clean_title = re.sub(r'\(\d{4}\)', '', judul).strip()
    
    # Check if the cleaned title exists in the indices
    if clean_title not in indices:
        raise KeyError(f"The title '{judul}' was not found in the dataset.")
    
    # Get the index of the input movie title
    idx = indices[clean_title]
    
    # Get similarity scores with all movie titles
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies by similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Select the top 20 most similar movies (excluding the input movie itself)
    sim_scores = sim_scores[1:20]
    
    # Get movie indices from similarity scores
    movie_indices = [i[0] for i in sim_scores]
    
    # Return recommended movie titles along with genre and year
    return pd.DataFrame(data[['judul', 'genre', 'year']].iloc[movie_indices])

# Example usage of the recommendation function
recommendation = movie_recommendations('Johnny English Reborn (2011)')
print(recommendation)

# Calculate Precision
TP = 19  # Number of correct predictions for similar genres
FP = 0   # Number of incorrect predictions
Precision = TP / (TP + FP)
print("Precision: {:.0%}".format(Precision))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
                         title  year  \
0                    Toy Story  1995   
1                      Jumanji  1995   
2             Grumpier Old Men  1995   
3            Waiting to Exhale  1995   
4  Father of the Bride Part II  1995   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
                                                  judul  \
3603                                            Topkapi   
7526

In [18]:
# Example usage of the recommendation function
recommendation = movie_recommendations('Jumanji')
print(recommendation)

# Calculate Precision
TP = 19  # Number of correct predictions for similar genres
FP = 0   # Number of incorrect predictions
Precision = TP / (TP + FP)
print("Precision: {:.0%}".format(Precision))


                                                  judul  \
53                          Indian in the Cupboard, The   
109                          NeverEnding Story III, The   
766                            Escape to Witch Mountain   
1510                 Darby O'Gill and the Little People   
1550                                       Return to Oz   
1610                             NeverEnding Story, The   
1611        NeverEnding Story II: The Next Chapter, The   
1788                             Santa Claus: The Movie   
3537  Harry Potter and the Sorcerer's Stone (a.k.a. ...   
5950  Chronicles of Narnia: The Lion, the Witch and ...   
6244                               Bridge to Terabithia   
6474                                Golden Compass, The   
6500               Water Horse: Legend of the Deep, The   
6595          Chronicles of Narnia: Prince Caspian, The   
7295  Chronicles of Narnia: The Voyage of the Dawn T...   
8012                     Percy Jackson: Sea of Monsters 

In [19]:
# Save the modified DataFrame to a new CSV file
data.to_csv('modified_movies.csv', index=False)

print("Modified data has been saved to 'modified_movies.csv'")

Modified data has been saved to 'modified_movies.csv'
