# Recommendation System for Lumaa Summer internship 2025 - Urmi Dedhia

### Run this to import all the dependencies required

In [1]:
import pandas as pd
import re
import spacy
import numpy as np
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


### Run this cell to load the SentenceTransformer model, which is used to generate embeddings for movie descriptions.



In [2]:
# Sentence-embedding model; its encode() output is what the similarity search compares
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'  # small, efficient checkpoint
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

### Run the following cells to load the dataset and view it

In [3]:
# Read the movie table from CSV (the path is relative to the working directory)
dataset_path = "TMDb_updated.csv"
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [4]:
# Number of movies to keep: change 'x' to the desired number of rows (<=10000)
x = 400
# Draw a reproducible random subset (fixed seed) and renumber the rows from 0
df = (
    df.sample(n=x, random_state=42)
      .reset_index(drop=True)
)

In [5]:
# Preview the first five rows of the sampled dataset
df.head(5)

Unnamed: 0.1,Unnamed: 0,title,overview,original_language,vote_count,vote_average
0,6252,7 Days in Entebbe,"In 1976, four hijackers take over an Air Franc...",en,234,5.8
1,4684,The Scorpion King: Quest for Power,"When he is betrayed by a trusted friend, Matha...",en,109,4.7
2,1731,Disobedience,A woman learns about the death of her Orthodox...,en,530,6.9
3,4742,Wolf,Publisher Will Randall becomes a werewolf and ...,en,509,6.1
4,4521,Flypaper,A man caught in the middle of two simultaneous...,en,446,6.3


In [6]:
# Dataset dimensions as (number of rows, number of columns)
df.shape

(400, 6)

### Run the following cells to load spaCy's English model and define a function for text preprocessing

In [7]:
# Load spaCy's small English pipeline.
# preprocess_text only reads tokens, stop-word flags and lemmas, so the
# dependency parser and named-entity recognizer are switched off to make
# preprocessing faster.
# NOTE(review): re-enable them if any other cell needs doc.ents or parse info.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [8]:
# Function to clean and preprocess text
def preprocess_text(text):
    """Normalize free text into a space-separated string of lemmas.

    Steps: lowercase the text, split words joined by hyphens/dashes/slashes,
    strip the remaining punctuation, run the module-level spaCy pipeline
    ``nlp``, and keep the lemma of every token that is neither a stop word
    nor pure whitespace.

    Args:
        text: Raw text. Any non-string value (for example a missing
            overview) is treated as empty.

    Returns:
        str: The kept lemmas joined by single spaces; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Make text lowercase
    # Hyphens, dashes and slashes connect separate words ("feel-good",
    # "action/thriller"). Replace them with spaces first, otherwise stripping
    # punctuation would fuse the words into one token ("feelgood").
    text = re.sub(r"[-\u2013\u2014/]", " ", text)
    text = re.sub(r'[^\w\s]', '', text)  # Remove the remaining punctuation
    doc = nlp(text)  # Use spaCy for tokenization and lemmatization
    # Runs of spaces left behind by removed punctuation become whitespace
    # tokens; they are not stop words, so they must be filtered explicitly.
    tokens = [token.lemma_ for token in doc if not (token.is_stop or token.is_space)]
    return " ".join(tokens)


### Run the following cells to preprocess all movie descriptions in the dataset and compute their embeddings (it might take a minute or two)

In [9]:
# Clean and lemmatize every overview; the result is stored in a new column
df["processed_overview"] = df["overview"].map(preprocess_text)

In [10]:
# Encode each processed overview into a tensor embedding, one row at a time
df["embedding"] = df["processed_overview"].map(
    lambda overview: model.encode(overview, convert_to_tensor=True)
)

### Run this cell to define a function that recommends movies based on text similarity and movie popularity

In [12]:
# Recommendation function
def recommend_movies(user_input, df, top_n=5, weight_text=1.5, weight_popularity=0.2, min_similarity=0.3):
    """Rank movies against a free-text query using a hybrid text/rating score.

    The query is cleaned with ``preprocess_text`` and embedded with the
    module-level ``model``. Each movie's score is
    ``similarity * weight_text + vote_average * weight_popularity``.
    Movies whose cosine similarity is below ``min_similarity`` are dropped
    before ranking.

    The input frame is not modified: scores are computed on a copy.

    NOTE(review): ``vote_average`` is used on its raw scale (the dataset
    preview shows values such as 5.8), while cosine similarity is at most 1.
    With the default weights the rating term can outweigh the text term.
    Confirm this balance is intended.

    Args:
        user_input: Free-text description of the kind of movie wanted.
        df: Movie table with "title", "overview", "vote_average" and
            "embedding" columns (one embedding tensor per row).
        top_n: Maximum number of movies to return.
        weight_text: Multiplier applied to the cosine similarity.
        weight_popularity: Multiplier applied to ``vote_average``.
        min_similarity: Minimum cosine similarity a movie needs to be kept.

    Returns:
        pandas.DataFrame: Up to ``top_n`` rows, best score first, with the
        columns "title", "overview", "similarity_score" and "vote_average".
        Empty if no movie reaches ``min_similarity``.
    """
    query_text = preprocess_text(user_input)
    user_embedding = model.encode(query_text, convert_to_tensor=True)

    # Cosine similarity between the query and every movie embedding
    similarities = [util.pytorch_cos_sim(user_embedding, emb)[0].item() for emb in df["embedding"]]

    # assign() returns a new frame, so the caller's DataFrame does not pick up
    # (or have overwritten) score columns each time a query is run.
    scored = df.assign(similarity_score=similarities)

    # Hybrid Score (weighted sum of similarity + popularity)
    scored["final_score"] = (scored["similarity_score"] * weight_text) + (scored["vote_average"] * weight_popularity)

    # Discard movies that are not similar enough to the query
    relevant = scored[scored["similarity_score"] >= min_similarity]

    # Sort and return top matches
    top_movies = relevant.sort_values(by="final_score", ascending=False).head(top_n)
    return top_movies[["title", "overview", "similarity_score", "vote_average"]]

### Run the following cell and enter the inputs as required in the text boxes. The recommendations will be displayed in the output! :)

In [13]:
# User input
user_query = input('Describe the type of movie you like eg. "I like feel-good, romantic and cosy movies." (Press enter when done): ')

# Keep asking until a positive whole number is entered, so a typo does not
# crash the cell with a ValueError and a zero/negative count is not passed on.
while True:
    top_n_raw = input("\n\nHow many movies would you like? Enter a number: ")
    try:
        top_n = int(top_n_raw)
    except ValueError:
        print("Please enter a whole number, e.g. 5.")
        continue
    if top_n > 0:
        break
    print("Please enter a number greater than 0.")

recommendations = recommend_movies(user_query, df, top_n)

# Display results
print("\n\n\n"+"-" * 60+"Top Recommendations"+"-" *60)

if recommendations.empty:
    # recommend_movies drops everything below its similarity threshold, so an
    # empty result is possible; say so instead of printing only a header.
    print("\nNo movies matched your description closely enough. Try describing it differently.")

for _, row in recommendations.iterrows():
    print(f"\n🎬 {row['title']}")
    print(f"📖 {row['overview']}")
    print(f"🔹 Similarity Score: {row['similarity_score']:.4f}")
    print(f"⭐ Popularity Score: {row['vote_average']:.4f}")
    print("-" * 140)

Describe the type of movie you like eg. "I like feel-good, romantic and cosy movies." (Press enter when done):  Something with cars, racing, fast paced, speedy, action, thrill


How many movies would you like? Enter a number:  5





------------------------------------------------------------Top Recommendations------------------------------------------------------------

🎬 Cars
📖 Lightning McQueen, a hotshot rookie race car driven to succeed, discovers that life is about the journey, not the finish line, when he finds himself unexpectedly detoured in the sleepy Route 66 town of Radiator Springs. On route across the country to the big Piston Cup Championship in California to compete against two seasoned pros, McQueen gets to know the town's offbeat characters.
🔹 Similarity Score: 0.4863
⭐ Popularity Score: 6.8000
--------------------------------------------------------------------------------------------------------------------------------------------

🎬 Hard Boiled
📖 A cop who loses his partner in a shoot-out with gun smugglers goes on a mission to catch them. In order to get closer to the leaders of the ring he joins forces with an undercover cop who's working as a gangster hitman. They use all means of excess