In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Configure visual settings for plots
%matplotlib inline

# 1. Load the datasets
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# 2. Merge datasets
# We merge on 'movieId' to see movie titles alongside user ratings
df = pd.merge(ratings, movies, on='movieId')

df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [29]:
# Calculate mean rating and vote count for each movie
movie_stats = df.groupby('title')['rating'].agg(['mean', 'count'])
movie_stats.columns = ['mean_rating', 'vote_count']

# Inspect the most popular movies (by vote count)
print("Most Popular Movies:")
display(movie_stats.sort_values(by='vote_count', ascending=False).head(5))

# Inspect the highest-rated movies (potential noise)
print("\nHighest Rated Movies (Noise check):")
display(movie_stats.sort_values(by='mean_rating', ascending=False).head(5))

Most Popular Movies:


Unnamed: 0_level_0,mean_rating,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),4.164134,329
"Shawshank Redemption, The (1994)",4.429022,317
Pulp Fiction (1994),4.197068,307
"Silence of the Lambs, The (1991)",4.16129,279
"Matrix, The (1999)",4.192446,278



Highest Rated Movies (Noise check):


Unnamed: 0_level_0,mean_rating,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Gena the Crocodile (1969),5.0,1
True Stories (1986),5.0,1
Cosmic Scrat-tastrophe (2015),5.0,1
Love and Pigeons (1985),5.0,1
Red Sorghum (Hong gao liang) (1987),5.0,1


In [30]:
# Only choose the film that received more than 100 votes
reliable_movies = movie_stats[movie_stats['vote_count'] > 100]

# Şimdi en yüksek puanlıları tekrar sıralayalım
print("Top Rated Movies (Received at least 100 votes):")
display(reliable_movies.sort_values(by='mean_rating', ascending=False).head(10))

Top Rated Movies (Received at least 100 votes):


Unnamed: 0_level_0,mean_rating,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shawshank Redemption, The (1994)",4.429022,317
"Godfather, The (1972)",4.289062,192
Fight Club (1999),4.272936,218
"Godfather: Part II, The (1974)",4.25969,129
"Departed, The (2006)",4.252336,107
Goodfellas (1990),4.25,126
"Dark Knight, The (2008)",4.238255,149
"Usual Suspects, The (1995)",4.237745,204
"Princess Bride, The (1987)",4.232394,142
Star Wars: Episode IV - A New Hope (1977),4.231076,251


In [31]:
# Check the 'genres' column
print(movies['genres'].head())

0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
Name: genres, dtype: object


In [32]:
# --- ENGINE 1: CONTENT-BASED FILTERING ---

# Create the Genre Matrix using One-Hot Encoding
# This converts genres like "Action|Adventure" into mathematical 1s and 0s.
genre_matrix = movies['genres'].str.get_dummies(sep='|')

display(genre_matrix.head())

print(genre_matrix.shape)

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


(9742, 20)


In [33]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate Cosine Similarity
# This creates a square matrix (Movie x Movie) representing similarity scores.
cosine_sim = cosine_similarity(genre_matrix, genre_matrix)

print(f"Similarity Matrix Shape: {cosine_sim.shape}")

print(cosine_sim[0])

Similarity Matrix Shape: (9742, 9742)
[1.         0.77459667 0.31622777 ... 0.         0.31622777 0.4472136 ]


In [34]:
# Create a mapping series to get movie indices from titles
indices = pd.Series(movies.index, index=movies['title'])

def get_recommendations(title, cosine_sim=cosine_sim):
    # 1. Find the movie index
    try:
        idx = indices[title]
    except KeyError:
        return "Movie not found! Make sure you spelled the name correctly."

    # 2. Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # 3. Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # 4. Get top 5 similar movies (ignoring the movie itself at index 0)
    sim_scores = sim_scores[1:6]

    # 5. Get movie indices
    movie_indices = [i[0] for i in sim_scores]

    # 6. Return titles
    return movies['title'].iloc[movie_indices]

# Test the Content-Based Engine
print("If you like 'Toy Story (1995)', you might like:")
print(get_recommendations('Toy Story (1995)'))

If you like 'Toy Story (1995)', you might like:
1706                                       Antz (1998)
2355                                Toy Story 2 (1999)
2809    Adventures of Rocky and Bullwinkle, The (2000)
3000                  Emperor's New Groove, The (2000)
3568                             Monsters, Inc. (2001)
Name: title, dtype: object


In [35]:
# --- ENGINE 2: COLLABORATIVE FILTERING ---

# Create the User-Item Matrix (Pivot Table)
user_movie_ratings = df.pivot_table(index='userId', columns='title', values='rating')

print("User-Item Matrix (First 5 rows):")
display(user_movie_ratings.head())

User-Item Matrix (First 5 rows):


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [36]:
# 1. Select a target movie
target_movie_name = 'Toy Story (1995)'
target_movie_ratings = user_movie_ratings[target_movie_name]

# 2. Calculate Correlation with all other movies
# This finds patterns: "Users who liked X also liked Y"
similar_movies = user_movie_ratings.corrwith(target_movie_ratings)

# 3. Create a DataFrame for the results
corr_toy_story = pd.DataFrame(similar_movies, columns=['Correlation'])
corr_toy_story.dropna(inplace=True) # Discard the uncalculable ones.

# 4. Join with vote counts to filter noise
corr_toy_story = corr_toy_story.join(movie_stats['vote_count'])

# 5. Filter and Sort
# We only consider movies with > 100 ratings for reliability
recommendations = corr_toy_story[corr_toy_story['vote_count'] > 100].sort_values(by='Correlation', ascending=False)

print(f"Users who liked '{target_movie_name}' also liked:")
display(recommendations.head(10))

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


Users who liked 'Toy Story (1995)' also liked:


Unnamed: 0_level_0,Correlation,vote_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1.0,215
"Incredibles, The (2004)",0.643301,125
Finding Nemo (2003),0.618701,141
Aladdin (1992),0.611892,183
"Monsters, Inc. (2001)",0.490231,132
Mrs. Doubtfire (1993),0.446261,144
"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",0.438237,120
American Pie (1999),0.420117,103
Die Hard: With a Vengeance (1995),0.410939,144
E.T. the Extra-Terrestrial (1982),0.409216,122
