In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Read the movie table from disk.
# Point file_path at your own CSV file when running outside this environment.
file_path = '/kaggle/input/movies/movies2.csv'
df = pd.read_csv(file_path)

# The rest of the script reads df['plot'] and df['title'], so stop right away
# if either column is absent instead of failing later with a KeyError.
if not all(column in df.columns for column in ('title', 'plot')):
    raise ValueError("CSV must contain 'title' and 'plot' ")

# 2. User query (free-text description of what the user wants to watch)
# Other example queries to try:
#   "i love The story of Forrest Gump"
#   "i love The story of Batman and Joker"
user_query = "i love The story of Andy Dufresne "

# 3. Vectorization: TF-IDF for dataset plot summaries
# No stop-word list is applied (stop_words=None is also the library default).
vectorizer = TfidfVectorizer(stop_words=None)

# Rows with a missing plot are loaded by pandas as NaN, and TfidfVectorizer
# rejects NaN documents with a ValueError. Replace them with empty strings
# (and coerce any non-string cell to text) so such rows simply get an all-zero
# vector and a similarity of 0. A separate Series is used so df is not mutated.
plot_texts = df['plot'].fillna('').astype(str)

# Fit the vectorizer only on the dataset, so the vocabulary and IDF weights
# come from the movie plots and not from the query.
tfidf_matrix = vectorizer.fit_transform(plot_texts)

# 4. Transform the user query using the same vectorizer
# Query words that never occur in any plot are ignored by transform().
user_query_tfidf = vectorizer.transform([user_query])

# 5. Compute Cosine Similarity between the user query and every dataset item
# Result has one row (the single query) and one column per movie.
cosine_similarities = cosine_similarity(user_query_tfidf, tfidf_matrix)

# 6. Rank every movie against the query and report the best matches
top_n = 5  # how many recommendations to print

# cosine_similarities holds a single row (one query); take that row of scores
similarities = cosine_similarities[0]

# Row positions ordered from most to least similar. sorted() is stable even
# with reverse=True, so movies with equal scores keep their dataset order.
indices = sorted(range(len(similarities)), key=similarities.__getitem__, reverse=True)

# Slicing keeps at most top_n positions (fewer if the dataset is smaller)
top_indices = indices[:top_n]

# Print one line per recommended movie: its title and its similarity score
print("Recommended Movies Based on Your Preferences:")

titles = df['title']
for idx in top_indices:
    title = titles.iloc[idx]  # positional lookup, matches the matrix row order
    similarity_score = similarities[idx]
    print(f"Title: {title}, Similarity: {similarity_score:.4f}")