In [2]:
import pandas as pd

df = pd.read_csv("movies.csv")
df


Unnamed: 0,title,genre,description
0,Avengers,Action Superhero,A team of superheroes save the world from alie...
1,Iron Man,Action Superhero,A billionaire builds a powered suit to fight evil
2,Thor,Action Fantasy,A god from Asgard fights to protect Earth
3,Batman,Action Crime,A vigilante fights crime in Gotham City
4,Superman,Action Superhero,An alien with superpowers protects humanity
5,Interstellar,Sci-Fi Space,A journey through space and time to save humanity
6,Inception,Sci-Fi Thriller,A thief steals information using dream invasion
7,The Dark Knight,Action Crime,Batman faces a powerful criminal mastermind
8,Doctor Strange,Fantasy Superhero,A surgeon learns mystic arts to protect reality
9,Guardians of the Galaxy,Sci-Fi Superhero,A group of misfits protect the galaxy


In [3]:
df.isnull().sum()


title          0
genre          0
description    0
dtype: int64

In [4]:
df['combined_features'] = df['genre'] + " " + df['description']
df[['title', 'combined_features']]


Unnamed: 0,title,combined_features
0,Avengers,Action Superhero A team of superheroes save th...
1,Iron Man,Action Superhero A billionaire builds a powere...
2,Thor,Action Fantasy A god from Asgard fights to pro...
3,Batman,Action Crime A vigilante fights crime in Gotha...
4,Superman,Action Superhero An alien with superpowers pro...
5,Interstellar,Sci-Fi Space A journey through space and time ...
6,Inception,Sci-Fi Thriller A thief steals information usi...
7,The Dark Knight,Action Crime Batman faces a powerful criminal ...
8,Doctor Strange,Fantasy Superhero A surgeon learns mystic arts...
9,Guardians of the Galaxy,Sci-Fi Superhero A group of misfits protect th...


In [5]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z ]', '', text)
    return text

df['combined_features'] = df['combined_features'].apply(clean_text)
df.head()


Unnamed: 0,title,genre,description,combined_features
0,Avengers,Action Superhero,A team of superheroes save the world from alie...,action superhero a team of superheroes save th...
1,Iron Man,Action Superhero,A billionaire builds a powered suit to fight evil,action superhero a billionaire builds a powere...
2,Thor,Action Fantasy,A god from Asgard fights to protect Earth,action fantasy a god from asgard fights to pro...
3,Batman,Action Crime,A vigilante fights crime in Gotham City,action crime a vigilante fights crime in gotha...
4,Superman,Action Superhero,An alien with superpowers protects humanity,action superhero an alien with superpowers pro...


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
feature_matrix = tfidf.fit_transform(df['combined_features'])

feature_matrix


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 70 stored elements and shape (10, 50)>

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(feature_matrix)
similarity_matrix


array([[1.        , 0.10317514, 0.05200428, 0.04551536, 0.27986312,
        0.10601961, 0.1111601 , 0.04877091, 0.0568282 , 0.06928756],
       [0.10317514, 1.        , 0.04863745, 0.04256863, 0.12301606,
        0.        , 0.        , 0.04561342, 0.05314907, 0.06480179],
       [0.05200428, 0.04863745, 1.        , 0.16734873, 0.06200487,
        0.        , 0.        , 0.05112269, 0.21551855, 0.11392714],
       [0.04551536, 0.04256863, 0.16734873, 1.        , 0.05426811,
        0.        , 0.        , 0.2691439 , 0.        , 0.        ],
       [0.27986312, 0.12301606, 0.06200487, 0.05426811, 1.        ,
        0.12640754, 0.        , 0.05814972, 0.06775645, 0.08261179],
       [0.10601961, 0.        , 0.        , 0.        , 0.12640754,
        1.        , 0.07250821, 0.        , 0.        , 0.09262193],
       [0.1111601 , 0.        , 0.        , 0.        , 0.        ,
        0.07250821, 1.        , 0.        , 0.        , 0.09711282],
       [0.04877091, 0.04561342, 0.0511226

In [8]:
similarity_matrix.shape


(10, 10)

In [9]:
movie_indices = pd.Series(df.index, index=df['title']).drop_duplicates()
movie_indices


title
Avengers                   0
Iron Man                   1
Thor                       2
Batman                     3
Superman                   4
Interstellar               5
Inception                  6
The Dark Knight            7
Doctor Strange             8
Guardians of the Galaxy    9
dtype: int64

In [10]:
def recommend_movies(title, num_recommendations=5):
    # Get index of the movie
    idx = movie_indices[title]
    
    # Get similarity scores for this movie
    sim_scores = list(enumerate(similarity_matrix[idx]))
    
    # Sort movies based on similarity score
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get indices of top similar movies (excluding itself)
    sim_scores = sim_scores[1:num_recommendations+1]
    
    # Get movie titles
    movie_indices_list = [i[0] for i in sim_scores]
    
    return df['title'].iloc[movie_indices_list]


In [11]:
recommend_movies("Avengers")


4                   Superman
6                  Inception
5               Interstellar
1                   Iron Man
9    Guardians of the Galaxy
Name: title, dtype: object

In [12]:
recommend_movies("Interstellar")


4                   Superman
0                   Avengers
9    Guardians of the Galaxy
6                  Inception
1                   Iron Man
Name: title, dtype: object

In [13]:
recommend_movies("Avengers")


4                   Superman
6                  Inception
5               Interstellar
1                   Iron Man
9    Guardians of the Galaxy
Name: title, dtype: object

In [14]:
recommend_movies("Avengers")
recommend_movies("Batman")
recommend_movies("Interstellar")
recommend_movies("Inception")


0                   Avengers
9    Guardians of the Galaxy
5               Interstellar
1                   Iron Man
2                       Thor
Name: title, dtype: object

In [16]:
""" ## Limitations
- The recommendation system is purely content-based and does not consider user preferences.
- Recommendations depend on textual similarity and keyword overlap.
- The dataset is limited in size, which affects recommendation diversity.
- Semantic understanding of movie plots is limited.
"""

' ## Limitations\n- The recommendation system is purely content-based and does not consider user preferences.\n- Recommendations depend on textual similarity and keyword overlap.\n- The dataset is limited in size, which affects recommendation diversity.\n- Semantic understanding of movie plots is limited.\n'

In [17]:
"""## Future Scope
- The system can be enhanced by adding user ratings and collaborative filtering.
- A larger and more diverse movie dataset can improve recommendations.
- Advanced NLP techniques or deep learning models can be used for better semantic understanding.
- The recommendation system can be deployed as a web-based application.
"""

'## Future Scope\n- The system can be enhanced by adding user ratings and collaborative filtering.\n- A larger and more diverse movie dataset can improve recommendations.\n- Advanced NLP techniques or deep learning models can be used for better semantic understanding.\n- The recommendation system can be deployed as a web-based application.\n'

In [18]:
"""## Conclusion
The content-based movie recommendation system successfully suggests movies based on similarity between movie features.
By using TF-IDF vectorization and cosine similarity, the system provides relevant recommendations and demonstrates the working of 
recommendation engines in real-world applications.
"""


'## Conclusion\nThe content-based movie recommendation system successfully suggests movies based on similarity between movie features.\nBy using TF-IDF vectorization and cosine similarity, the system provides relevant recommendations and demonstrates the working of \nrecommendation engines in real-world applications.\n'