In [1]:
# Step 2.0 – Import libraries
import pandas as pd

# Step 2.1 – Load datasets with correct encoding
movies = pd.read_csv('movies.dat', sep='::', engine='python', 
                     names=['MovieID', 'Title', 'Genres'], encoding='ISO-8859-1')

ratings = pd.read_csv('ratings.dat', sep='::', engine='python', 
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'], encoding='ISO-8859-1')

users = pd.read_csv('users.dat', sep='::', engine='python', 
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], encoding='ISO-8859-1')

In [2]:
# Step 2.2 – Basic info about data
print("Movies shape:", movies.shape)
print("Ratings shape:", ratings.shape)
print("Users shape:", users.shape)

# Check for missing values
print("\nMissing values in movies:\n", movies.isnull().sum())
print("\nMissing values in ratings:\n", ratings.isnull().sum())
print("\nMissing values in users:\n", users.isnull().sum())

Movies shape: (3883, 3)
Ratings shape: (1000209, 4)
Users shape: (6040, 5)

Missing values in movies:
 MovieID    0
Title      0
Genres     0
dtype: int64

Missing values in ratings:
 UserID       0
MovieID      0
Rating       0
Timestamp    0
dtype: int64

Missing values in users:
 UserID        0
Gender        0
Age           0
Occupation    0
Zip-code      0
dtype: int64


In [3]:
# Step 2.3 – Quick look at genres and rating distribution
print("\nUnique genres:\n", movies['Genres'].unique())

# Rating distribution
print("\nRatings summary:\n", ratings['Rating'].describe())
ratings['Rating'].value_counts().sort_index()


Unique genres:
 ["Animation|Children's|Comedy" "Adventure|Children's|Fantasy"
 'Comedy|Romance' 'Comedy|Drama' 'Comedy' 'Action|Crime|Thriller'
 "Adventure|Children's" 'Action' 'Action|Adventure|Thriller'
 'Comedy|Drama|Romance' 'Comedy|Horror' "Animation|Children's" 'Drama'
 'Action|Adventure|Romance' 'Drama|Thriller' 'Drama|Romance' 'Thriller'
 'Action|Comedy|Drama' 'Crime|Drama|Thriller' 'Drama|Sci-Fi' 'Romance'
 'Adventure|Sci-Fi' 'Adventure|Romance' "Children's|Comedy|Drama"
 'Documentary' 'Drama|War' 'Action|Crime|Drama' 'Action|Adventure'
 'Crime|Thriller' "Animation|Children's|Musical|Romance"
 'Action|Drama|Thriller' "Children's|Comedy" 'Drama|Mystery'
 'Sci-Fi|Thriller' 'Action|Comedy|Crime|Horror|Thriller' 'Drama|Musical'
 'Crime|Drama|Romance' 'Adventure|Drama' 'Action|Thriller'
 "Adventure|Children's|Comedy|Musical" 'Action|Drama|War'
 'Action|Adventure|Crime' 'Crime' 'Drama|Mystery|Romance' 'Action|Drama'
 'Drama|Romance|War' 'Horror' 'Action|Adventure|Comedy|Crime' 'Com

Rating
1     56174
2    107557
3    261197
4    348971
5    226310
Name: count, dtype: int64

In [4]:
# Step 2.4 – Merge ratings with users
ratings_users = pd.merge(ratings, users, on='UserID')

# Step 2.5 – Merge the result with movies
full_data = pd.merge(ratings_users, movies, on='MovieID')

# Step 2.6 – Final dataset overview
print("Full merged dataset shape:", full_data.shape)
full_data.head()

Full merged dataset shape: (1000209, 10)


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [5]:
# Create a pivot table (UserID × Movie Title) with ratings
user_movie_matrix = full_data.pivot_table(index='UserID', columns='Title', values='Rating')

# Show sample
user_movie_matrix.head()

Title,"$1,000,000 Duck (1971)",'Night Mother (1986),'Til There Was You (1997),"'burbs, The (1989)",...And Justice for All (1979),1-900 (1994),10 Things I Hate About You (1999),101 Dalmatians (1961),101 Dalmatians (1996),12 Angry Men (1957),...,"Young Poisoner's Handbook, The (1995)",Young Sherlock Holmes (1985),Young and Innocent (1937),Your Friends and Neighbors (1998),Zachariah (1971),"Zed & Two Noughts, A (1985)",Zero Effect (1998),Zero Kelvin (Kjærlighetens kjøtere) (1995),Zeus and Roxanne (1997),eXistenZ (1999)
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Split genres into separate columns using One Hot Encoding
movies_expanded = movies.copy()
movies_expanded['Genres'] = movies_expanded['Genres'].str.split('|')

# One-hot encode genres
genres_encoded = movies_expanded['Genres'].explode().str.get_dummies().groupby(movies_expanded['MovieID']).max()

# Merge genre info back with movies
movie_profiles = pd.merge(movies[['MovieID', 'Title']], genres_encoded, left_on='MovieID', right_index=True)

# Show movie profiles
movie_profiles.head()

Unnamed: 0,MovieID,Title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Drop non-numeric columns for similarity computation
genre_features = movie_profiles.drop(['MovieID', 'Title'], axis=1)

# Compute cosine similarity between all movies
cosine_sim = cosine_similarity(genre_features)

# Store results in a DataFrame for easier lookup
content_sim_df = pd.DataFrame(cosine_sim, index=movie_profiles['Title'], columns=movie_profiles['Title'])

In [8]:
def get_content_recommendations(movie_title, top_n=10):
    if movie_title not in content_sim_df.index:
        return f"❌ Movie '{movie_title}' not found in database."

    # Get similarity scores and sort
    similar_scores = content_sim_df[movie_title].sort_values(ascending=False)
    
    # Exclude the movie itself and return top N
    recommended_titles = similar_scores.iloc[1:top_n+1].index.tolist()
    return recommended_titles

In [9]:
get_content_recommendations('Toy Story (1995)', top_n=5)

['Adventures of Rocky and Bullwinkle, The (2000)',
 'Aladdin and the King of Thieves (1996)',
 'Chicken Run (2000)',
 'Saludos Amigos (1943)',
 "Bug's Life, A (1998)"]

In [10]:
# Fill NaN values with 0 to make matrix numeric
user_movie_matrix_filled = user_movie_matrix.fillna(0)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Transpose to get Movie × User matrix
movie_user_matrix = user_movie_matrix_filled.T

# Compute cosine similarity between movies
collab_cosine_sim = cosine_similarity(movie_user_matrix)

# Create a DataFrame for easier lookup
collab_sim_df = pd.DataFrame(collab_cosine_sim, index=movie_user_matrix.index, columns=movie_user_matrix.index)

In [12]:
def get_collab_recommendations(movie_title, top_n=10):
    if movie_title not in collab_sim_df.index:
        return f"❌ Movie '{movie_title}' not found in user ratings."

    # Get similarity scores and sort
    similar_scores = collab_sim_df[movie_title].sort_values(ascending=False)

    # Exclude the movie itself
    recommended_titles = similar_scores.iloc[1:top_n+1].index.tolist()
    return recommended_titles

In [13]:
get_collab_recommendations('Toy Story (1995)', top_n=5)

['Toy Story 2 (1999)',
 'Groundhog Day (1993)',
 'Aladdin (1992)',
 "Bug's Life, A (1998)",
 'Back to the Future (1985)']

In [14]:
def hybrid_recommend(movie_title, top_n=10, content_weight=0.5, collab_weight=0.5):
    # Check existence in both models
    if movie_title not in content_sim_df.index or movie_title not in collab_sim_df.index:
        return f"❌ Movie '{movie_title}' not found in both models."

    # Get similarity scores from both models
    content_scores = content_sim_df[movie_title]
    collab_scores = collab_sim_df[movie_title]
    
    # Normalize scores between 0 and 1
    content_scores_norm = (content_scores - content_scores.min()) / (content_scores.max() - content_scores.min())
    collab_scores_norm = (collab_scores - collab_scores.min()) / (collab_scores.max() - collab_scores.min())
    
    # Combine scores using weights
    hybrid_scores = (content_weight * content_scores_norm) + (collab_weight * collab_scores_norm)
    
    # Sort and get top recommendations, excluding the input movie
    hybrid_scores_sorted = hybrid_scores.sort_values(ascending=False).drop(movie_title)
    
    return hybrid_scores_sorted.head(top_n).index.tolist()

In [15]:
# Get hybrid recommendations
hybrid_recommend('Toy Story (1995)', top_n=5)

['Toy Story 2 (1999)',
 "Bug's Life, A (1998)",
 'Aladdin (1992)',
 'Chicken Run (2000)',
 'American Tail, An (1986)']