<a href="https://colab.research.google.com/github/abiralchy0987/movie_recommendation_system/blob/main/optimized_CF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import spacy
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load SpaCy's English model
nlp = spacy.load('en_core_web_sm')

# Reading the data
movies = pd.read_csv('/content/tmdb_5000_movies.csv')
credits = pd.read_csv('/content/tmdb_5000_credits.csv')

# Merge datasets using movie_id in 'credits' and 'id' in 'movies'
movies_merged = movies.merge(credits, left_on='id', right_on='movie_id')

# Select relevant columns and rename for clarity
movies = movies_merged[
    ['movie_id', 'title_x', 'overview', 'genres', 'keywords', 'cast', 'crew']
].rename(columns={'title_x': 'title'})

# Drop rows with missing values, then rebuild a contiguous 0..n-1 index.
# The similarity matrix computed later is addressed by row *position*; without
# the reset, the labels left behind by dropna() have gaps and no longer match
# those positions.
movies = movies.dropna().reset_index(drop=True)

# Safely convert JSON-like strings to lists
def convert(text):
    """Extract the ``name`` of every entry in a stringified list of dicts.

    Args:
        text: String containing a Python-literal list of dicts, e.g.
            ``'[{"id": 28, "name": "Action"}]'``.

    Returns:
        list: The ``name`` values in their original order. An empty list is
        returned when *text* cannot be parsed or does not evaluate to a
        list/tuple; entries that are not dicts or have no ``name`` key are
        skipped instead of raising.
    """
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return []  # Return empty list if parsing fails

    # A valid literal may still be a scalar or a dict; only sequences of
    # entries are meaningful here.
    if not isinstance(parsed, (list, tuple)):
        return []

    return [
        entry['name'] for entry in parsed
        if isinstance(entry, dict) and 'name' in entry
    ]

# Parse the stringified genre and keyword columns into lists of names
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(convert)

# Convert cast to a list of top 3 actors
def convert_cast(text):
    """Return the names from the first three entries of a stringified cast list.

    Args:
        text: String containing a Python-literal list of dicts, each
            expected to carry a ``name`` key.

    Returns:
        list: Up to three names, in listed order. An empty list is returned
        when *text* cannot be parsed or does not evaluate to a list/tuple;
        entries among the first three that are not dicts or have no ``name``
        key are skipped instead of raising.
    """
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return []

    # Slicing a non-sequence literal (e.g. an int) would raise, so guard first.
    if not isinstance(parsed, (list, tuple)):
        return []

    return [
        member['name'] for member in parsed[:3]
        if isinstance(member, dict) and 'name' in member
    ]

# Replace each stringified cast list with the names returned by convert_cast
movies['cast'] = movies['cast'].map(convert_cast)

# Fetch director from crew
def fetch_director(text):
    """Return the first crew member whose ``job`` is ``'Director'``.

    Args:
        text: String containing a Python-literal list of crew dicts, each
            expected to carry ``job`` and ``name`` keys.

    Returns:
        list: A one-element list with the director's name, or an empty list
        when *text* cannot be parsed, is not a list/tuple, or contains no
        usable director entry. Entries that are not dicts or lack the
        required keys are skipped instead of raising.
    """
    try:
        crew = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return []

    if not isinstance(crew, (list, tuple)):
        return []

    for member in crew:
        if not isinstance(member, dict):
            continue
        # .get() tolerates entries with no 'job' key.
        if member.get('job') == 'Director' and 'name' in member:
            return [member['name']]
    return []

# Replace each stringified crew list with the result of fetch_director
movies['crew'] = movies['crew'].map(fetch_director)

# Process text: replace spaces with underscores
def process_text(text):
    """Return a new list with every space in each item turned into ``_``.

    Args:
        text: Iterable of strings (e.g. a list of person names).

    Returns:
        list: One string per input item, with ``" "`` replaced by ``"_"``.
    """
    space_to_underscore = str.maketrans(" ", "_")
    return [item.translate(space_to_underscore) for item in text]

# Replace spaces with underscores in the cast and crew names
for people_column in ('cast', 'crew'):
    movies[people_column] = movies[people_column].apply(process_text)

# Convert 'overview' to a list of words (split on whitespace)
movies['overview'] = movies['overview'].apply(str.split)

# Combine all features into 'tags' as lists.
# Repeating a list N times is how each feature is weighted: genres and
# keywords count twice, cast and crew three times.
repeat_counts = {'genres': 2, 'keywords': 2, 'cast': 3, 'crew': 3}
tag_lists = movies['overview']
for feature, repeats in repeat_counts.items():
    # Bind the count as a default so each lambda keeps its own value.
    tag_lists = tag_lists + movies[feature].apply(
        lambda items, times=repeats: items * times
    )

# Convert 'tags' to a single space-separated string
movies['tags'] = tag_lists.apply(" ".join)

# Use SpaCy's nlp.pipe for batch processing
def preprocess_text_batch(texts):
    """Lemmatise and lowercase *texts*, dropping stop words and punctuation.

    Args:
        texts: Iterable of strings, streamed through the module-level ``nlp``
            pipeline in batches of 50.

    Returns:
        list: One space-joined string of lowercased lemmas per input text,
        in input order.
    """
    def _normalise(doc):
        # Keep a token only when it is neither a stop word nor punctuation.
        kept = (
            tok.lemma_.lower() for tok in doc
            if not (tok.is_stop or tok.is_punct)
        )
        return " ".join(kept)

    return [_normalise(doc) for doc in nlp.pipe(texts, batch_size=50)]

# Apply batch preprocessing to 'tags'
movies['tags'] = preprocess_text_batch(movies['tags'])

# TF-IDF Vectorization
tfidf_params = {
    'max_features': 10000,   # cap on vocabulary size
    'ngram_range': (1, 3),   # unigrams, bigrams and trigrams
    'min_df': 0.001,         # float: minimum fraction of documents
    'max_df': 0.8,           # float: maximum fraction of documents
    'stop_words': 'english',
}
tfidf = TfidfVectorizer(**tfidf_params)
vectors = tfidf.fit_transform(movies['tags'])

# Compute cosine similarity between every pair of rows in the TF-IDF matrix
similarity = cosine_similarity(vectors)

# Recommendation function
def recommend(movie, num_recommendations=5):
    """Return the titles most similar to *movie*.

    Reads the module-level ``movies`` DataFrame and ``similarity`` matrix;
    row ``k`` of ``similarity`` is taken to describe the ``k``-th row of
    ``movies`` by position.

    Args:
        movie: Exact title to look up in ``movies['title']``.
        num_recommendations: Maximum number of rows to return (default 5).

    Returns:
        pandas.DataFrame with ``title`` and ``similarity`` (percentage string)
        columns, ordered from most to least similar; or a ``str`` message when
        the title is not present.
    """
    # Look the title up by row *position*, not index label: after rows are
    # dropped the labels can have gaps, while `similarity` and the title
    # array below are addressed positionally.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        return f"Movie '{movie}' not found in database"

    position = int(matches[0])  # first match if the title is duplicated

    # Exclude the query movie explicitly instead of assuming it sorts first;
    # ties (e.g. identical tags or an all-zero vector) could otherwise drop a
    # real neighbour and keep the query itself in the results.
    candidates = (
        pair for pair in enumerate(similarity[position]) if pair[0] != position
    )
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    titles = movies['title'].to_numpy()
    recommendations = [
        {'title': titles[row], 'similarity': f"{score * 100:.1f}%"}
        for row, score in ranked[:max(num_recommendations, 0)]
    ]

    return pd.DataFrame(recommendations)

# Example usage: show the default five recommendations for 'Avatar'
avatar_recommendations = recommend('Avatar')
print(avatar_recommendations)


                     title similarity
0                   Aliens      43.2%
1                   Alien³      35.1%
2  Star Trek Into Darkness      31.0%
3                    Alien      30.2%
4           Silent Running      29.4%
