In [1]:
# Import libraries
import pandas as pd
import os

# Sanity check: show what the dataset directory contains
print("Files inside 'ml-latest-small':")
print(os.listdir("ml-latest-small"))

# Read the four CSV files, in this order, into DataFrames
movies, ratings, tags, links = (
    pd.read_csv(csv_path)
    for csv_path in (
        "ml-latest-small/movies.csv",
        "ml-latest-small/ratings.csv",
        "ml-latest-small/tags.csv",
        "ml-latest-small/links.csv",
    )
)

# Preview the first rows of every table, each under its own heading
for heading, frame in (
    ("\n Movies Dataset:", movies),
    ("\n Ratings Dataset:", ratings),
    ("\n Tags Dataset:", tags),
    ("\n Links Dataset:", links),
):
    print(heading)
    display(frame.head())

# Report the (rows, columns) shape of every table
print("\n===== Dataset Information =====")
for label, frame in (
    ("Movies:", movies),
    ("Ratings:", ratings),
    ("Tags:", tags),
    ("Links:", links),
):
    print(label, frame.shape)



Files inside 'ml-latest-small':
['.ipynb_checkpoints', 'links.csv', 'movies.csv', 'ratings.csv', 'README.txt', 'tags.csv']

 Movies Dataset:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy



 Ratings Dataset:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931



 Tags Dataset:


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200



 Links Dataset:


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0



===== Dataset Information =====
Movies: (9742, 3)
Ratings: (100836, 4)
Tags: (3683, 4)
Links: (9742, 3)


In [2]:
# — Data Cleaning and Exploration

# Count missing values per column in each table
print("🔍 Missing Values:\n")
print("Movies:\n", movies.isnull().sum())
print("\nRatings:\n", ratings.isnull().sum())
print("\nTags:\n", tags.isnull().sum())
print("\nLinks:\n", links.isnull().sum())

# Count fully duplicated rows in each table
print("\n Duplicate Records:")
print("Movies:", movies.duplicated().sum())
print("Ratings:", ratings.duplicated().sum())
print("Tags:", tags.duplicated().sum())
print("Links:", links.duplicated().sum())

# Column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\n Dataset Info:")
print("\nMovies:")
movies.info()
print("\nRatings:")
ratings.info()

# Summary statistics for the ratings table
print("\n Ratings Summary:")
print(ratings.describe())

# Attach title and genres to every rating (inner join on movieId)
movie_ratings = pd.merge(ratings, movies, on='movieId')

# Preview the merged table
print("\n Combined Movie-Rating Data:")
display(movie_ratings.head())

# Number of distinct users and movies that appear in the merged table
print("\n Unique Users:", movie_ratings['userId'].nunique())
print(" Unique Movies:", movie_ratings['movieId'].nunique())

# Mean rating per title, highest first.
# NOTE(review): this is a plain mean with no minimum number of ratings, so a
# title with very few ratings can rank at the top — consider filtering by count.
avg_ratings = movie_ratings.groupby('title')['rating'].mean().sort_values(ascending=False)
print("\n Top 5 Highest Rated Movies:")
print(avg_ratings.head())



🔍 Missing Values:

Movies:
 movieId    0
title      0
genres     0
dtype: int64

Ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

Tags:
 userId       0
movieId      0
tag          0
timestamp    0
dtype: int64

Links:
 movieId    0
imdbId     0
tmdbId     8
dtype: int64

 Duplicate Records:
Movies: 0
Ratings: 0
Tags: 0
Links: 0

 Dataset Info:

Movies:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None

Ratings:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller



 Unique Users: 610
 Unique Movies: 9724

 Top 5 Highest Rated Movies:
title
Karlson Returns (1970)                                                         5.0
Zeitgeist: Moving Forward (2011)                                               5.0
Dream of Light (a.k.a. Quince Tree Sun, The) (Sol del membrillo, El) (1992)    5.0
Dragons: Gift of the Night Fury (2011)                                         5.0
12 Angry Men (1997)                                                            5.0
Name: rating, dtype: float64


In [3]:
# — Movie Recommendation System (Cosine Similarity)

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Title x user table of ratings; combinations with no rating are filled with 0
movie_user_matrix = (
    movie_ratings
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)

# Pairwise cosine similarity between the rows (one row per title)
cosine_sim = cosine_similarity(movie_user_matrix, movie_user_matrix)

# Wrap the raw array so both axes are labelled with the movie titles
movie_titles = movie_user_matrix.index
cosine_sim_df = pd.DataFrame(cosine_sim, index=movie_titles, columns=movie_titles)

print(" Cosine Similarity Matrix created successfully!")

# Function to get similar movies
def recommend_movies(movie_title, n=10):
    """Return the ``n`` movies most similar to ``movie_title``.

    Similarity scores are read from the ``movie_title`` column of the
    module-level ``cosine_sim_df``.

    Args:
        movie_title: Title to look up; it must match a label in
            ``cosine_sim_df.index`` exactly.
        n: Maximum number of recommendations to return (default 10).
            Values of 0 or less produce an empty result.

    Returns:
        A Series of similarity scores indexed by title, sorted from most to
        least similar and never containing ``movie_title`` itself. If the
        title is not in the index, a "not found" message string is returned
        instead.
    """
    if movie_title not in cosine_sim_df.index:
        return f" Movie '{movie_title}' not found in dataset."

    # Remove the queried title by label instead of skipping the first sorted
    # entry: another movie can tie with it at the top similarity, in which
    # case the queried title is not guaranteed to be sorted first.
    scores = cosine_sim_df[movie_title].drop(labels=movie_title)

    # Clamp n so a negative value cannot turn into an "all but the last k" slice.
    return scores.sort_values(ascending=False).head(max(n, 0))

# Demo: fetch the default number of matches for "Toy Story (1995)" and
# print them beneath a heading
recommendations = recommend_movies("Toy Story (1995)")
print("\n Movies similar to 'Toy Story (1995)':", recommendations, sep="\n")


 Cosine Similarity Matrix created successfully!

 Movies similar to 'Toy Story (1995)':
title
Toy Story 2 (1999)                                   0.572601
Jurassic Park (1993)                                 0.565637
Independence Day (a.k.a. ID4) (1996)                 0.564262
Star Wars: Episode IV - A New Hope (1977)            0.557388
Forrest Gump (1994)                                  0.547096
Lion King, The (1994)                                0.541145
Star Wars: Episode VI - Return of the Jedi (1983)    0.541089
Mission: Impossible (1996)                           0.538913
Groundhog Day (1993)                                 0.534169
Back to the Future (1985)                            0.530381
Name: Toy Story (1995), dtype: float64


In [None]:
# — Interactive Movie Recommender

def recommend_movies_pretty(movie_title, n=10):
    """Print a numbered list of the ``n`` movies most similar to ``movie_title``.

    Similarity scores are read from the ``movie_title`` column of the
    module-level ``cosine_sim_df``.

    Args:
        movie_title: Title to look up; it must match a label in
            ``cosine_sim_df.index`` exactly.
        n: Maximum number of recommendations to print (default 10).
            Values of 0 or less print only the heading.

    Returns:
        None. All output is written with ``print``; an unknown title prints
        a "not found" message instead of a list.
    """
    if movie_title not in cosine_sim_df.index:
        print(f" Movie '{movie_title}' not found in dataset.")
        return

    print(f"\n Movies similar to '{movie_title}':\n")

    # Remove the queried title by label instead of skipping the first sorted
    # entry: another movie can tie with it at the top similarity, in which
    # case the queried title is not guaranteed to be sorted first.
    scores = cosine_sim_df[movie_title].drop(labels=movie_title)

    # Clamp n so a negative value cannot turn into an "all but the last k" slice.
    top_matches = scores.sort_values(ascending=False).head(max(n, 0))

    for rank, (movie, score) in enumerate(top_matches.items(), start=1):
        print(f"{rank}. {movie} — Similarity Score: {score:.3f}")

# Ask the user for a title and print its recommendations.
# Titles are matched exactly against the similarity matrix labels, so
# accidental leading/trailing whitespace would cause a "not found" result.
# The stripped text is used only when the raw text is not itself a known
# title, so any title that can be entered verbatim still matches.
user_input = input("Enter a movie title: ")
if user_input not in cosine_sim_df.index and user_input.strip() in cosine_sim_df.index:
    user_input = user_input.strip()
recommend_movies_pretty(user_input)



 Movies similar to 'Toy Story 2 (1999)':

1. Bug's Life, A (1998) — Similarity Score: 0.620
2. Toy Story (1995) — Similarity Score: 0.573
3. Shrek (2001) — Similarity Score: 0.542
4. Monsters, Inc. (2001) — Similarity Score: 0.532
5. Galaxy Quest (1999) — Similarity Score: 0.516
6. Men in Black (a.k.a. MIB) (1997) — Similarity Score: 0.516
7. Sixth Sense, The (1999) — Similarity Score: 0.504
8. Truman Show, The (1998) — Similarity Score: 0.487
9. X-Men (2000) — Similarity Score: 0.487
10. Shrek 2 (2004) — Similarity Score: 0.485


In [None]:
pip install streamlit
