In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../datasets/raw/tmdb_movies_raw_2.csv")

In [3]:
print(df.dtypes)

movie_id          int64
title            object
overview         object
genres           object
cast             object
keywords         object
runtime           int64
release_year      int64
language         object
vote_average    float64
vote_count        int64
dtype: object


In [4]:
df['overview'].tail(3)

7997    Mikado and Laetitia lead an alternative lifest...
7998    Emperor Philippa Georgiou joins a secret divis...
7999    Julien and Marie have enjoyed 15 years of seem...
Name: overview, dtype: object

In [5]:
print(df.isnull().sum())

movie_id         0
title            0
overview        94
genres           0
cast             0
keywords         0
runtime          0
release_year     0
language         0
vote_average     0
vote_count       0
dtype: int64


In [6]:
df = df.fillna({'overview': 'no overview'})

In [7]:
import ast


def _join_clean_items(items):
    """Join the non-blank string items of ``items`` with ", " (non-strings are skipped)."""
    return ", ".join(
        item.strip() for item in items if isinstance(item, str) and item.strip()
    )


def normalize_text(x):
    """Normalize a list-like field into one ", "-separated string.

    Handles three input shapes:

    * a real ``list`` of strings,
    * a stringified list such as ``"['Drama', 'Romance']"``,
    * an already comma-separated string such as ``"Drama,Romance"``.

    Args:
        x: Raw cell value (list, str, NaN/None or anything else).

    Returns:
        str: The non-empty items joined with ", ". Missing values, the
        placeholders "", "nan", "none", "[]" (case-insensitive) and values of
        any unsupported type give "".
    """
    # Lists must be handled before any missing-value test: pd.isna() on a
    # list returns an array, and using that array in ``if`` raises ValueError.
    if isinstance(x, list):
        return _join_clean_items(x)

    # NaN, None, pd.NA and every other non-string value carry no usable text.
    if not isinstance(x, str):
        return ""

    x = x.strip()

    # Empty or placeholder text.
    if x.lower() in ("", "nan", "none", "[]"):
        return ""

    # Stringified list like "['Drama', 'Romance']".
    if x.startswith("[") and x.endswith("]"):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid Python literal: treat it as plain delimited text.
            parsed = None
        if isinstance(parsed, list):
            return _join_clean_items(parsed)

    # Plain comma-separated text: split and re-join so that "a,b" and "a, b"
    # both become "a, b" (a bare replace(",", ", ") would double the space).
    return _join_clean_items(x.split(","))


In [8]:
import re

def clean_brackets(text):
    """Return ``text`` with every "[", "]" and "'" character deleted.

    Values that ``pd.isna`` reports as missing give ""; any other value is
    converted with ``str`` before the characters are removed.
    """
    if not pd.isna(text):
        as_string = str(text)
        # One character class covering the two brackets and the apostrophe.
        return re.sub(r"[\[\]']", "", as_string)
    return ""

In [9]:
import re
import pandas as pd

def clean_text(text):
    """Lower-case ``text`` and reduce it to a small whitelist of characters.

    After lower-casing, every character other than a-z, whitespace and the
    punctuation , : . ' " is deleted. This also removes digits, hyphens and
    non-ASCII letters. Runs of whitespace are then collapsed to one space and
    the result is stripped.

    Returns "" when ``pd.isna(text)`` is true.
    """
    if not pd.isna(text):
        lowered = text.lower()
        whitelisted = re.sub(r"[^a-z\s,:.\'\"]", "", lowered)
        return re.sub(r"\s+", " ", whitelisted).strip()
    return ""

In [10]:
df['overview'].iloc[67:75]

67    Romeu, a corrupt cop, hires the services of Ci...
68    When a dance troupe is lured to an empty schoo...
69    At a Bandung high school, charming and rebelli...
70    As Christmas approaches, a department store wo...
71    Deliveryman Jong-su is out on a job when he ru...
72    Speedboat racing champion and multimillionaire...
73    Captain Glass of the USS Arkansas discovers th...
74    A graduate student, who is trying to finish th...
Name: overview, dtype: object

In [11]:
df['overview']= df['overview'].apply(clean_text)
df['overview'].iloc[67:75]

67    romeu, a corrupt cop, hires the services of ci...
68    when a dance troupe is lured to an empty schoo...
69    at a bandung high school, charming and rebelli...
70    as christmas approaches, a department store wo...
71    deliveryman jongsu is out on a job when he run...
72    speedboat racing champion and multimillionaire...
73    captain glass of the uss arkansas discovers th...
74    a graduate student, who is trying to finish th...
Name: overview, dtype: object

In [12]:
# Normalise the list-like text columns (keywords, genres, cast) in place:
# normalize_text flattens each value to a comma-separated string and
# clean_brackets then deletes any remaining "[", "]" and "'" characters.
# NOTE(review): clean_brackets also deletes apostrophes that belong to the
# data itself (e.g. inside a name) - confirm that this is intended.
df['keywords'] = df['keywords'].apply(normalize_text).apply(clean_brackets)

# Bare expression in the middle of the cell: evaluated but not displayed.
df['keywords'].tail(5)

# Genres are additionally lower-cased.
df["genres"] = df["genres"].apply(normalize_text).apply(lambda x: x.lower())

# Bare expression in the middle of the cell: evaluated but not displayed.
df["genres"].head(2)

df['genres']= df['genres'].apply(clean_brackets)
# Bare expression in the middle of the cell: evaluated but not displayed.
df["keywords"].iloc[87:94]
df['cast']=df['cast'].apply(normalize_text)

df['cast']= df['cast'].apply(clean_brackets)

# Cast the numeric columns to int (df.dtypes already reported int64 for both).
df['runtime'] = df['runtime'].astype(int)
df['release_year']=df['release_year'].astype(int)

# Print the Python type of the overview value at position 87
# (this inspects 'overview', not 'keywords', and does not test for emptiness).
print(type(df['overview'].iloc[87]))


<class 'str'>


In [13]:
import pycountry
import pandas as pd

def convert_lang(code):
    """Translate a 2-letter language code into a lower-case language name.

    The code is stringified, stripped and upper-cased, then looked up with
    ``pycountry.languages.get(alpha_2=...)``.

    Returns:
        The lower-cased ``name`` of the matching language. ``code`` is
        returned unchanged when it is null, when the lookup finds nothing, or
        when the lookup raises ``AttributeError`` / ``LookupError``.
    """
    if pd.isnull(code):
        return code

    # NOTE(review): the code is upper-cased before the lookup; the recorded
    # output ('german') shows this form resolves - confirm against the
    # installed pycountry version whether upper-casing is actually required.
    normalized = str(code).strip().upper()

    try:
        match = pycountry.languages.get(alpha_2=normalized)
        if match:
            return match.name.lower()
        return code
    except (AttributeError, LookupError):
        return code

# Apply the helper function
df["language"] = df["language"].apply(convert_lang)

In [14]:
df['language']= df['language'].apply(lambda x: x.lower() if isinstance(x,str) else x)
df['language'].iloc[78]

'german'

In [15]:
def _field_as_text(value):
    """Return ``value`` as stripped text; lists/tuples are joined with ", ", anything else is ""."""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (list, tuple)):
        parts = (str(item).strip() for item in value)
        return ", ".join(part for part in parts if part)
    # NaN, None, pd.NA and other non-text values carry no usable text.
    return ""


def replace_with_keywords(row):
    """Add an 'extracted_text' fallback for rows that have no real overview.

    A row counts as having no overview when its 'overview' is blank, is not a
    string, or equals 'no overview' (case-insensitive, surrounding whitespace
    ignored). For such rows the fallback is the row's keywords, or its genres
    when the keywords are empty. Every other row gets ``pd.NA``.

    Args:
        row: A Series with 'overview', 'keywords' and 'genres' entries.
            Keywords and genres may be strings or lists of strings.

    Returns:
        A copy of ``row`` that always has an 'extracted_text' entry. The
        input row itself is not modified.
    """
    # Work on a copy: mutating the object handed out by DataFrame.apply is
    # not supported by pandas.
    result = row.copy()

    overview = _field_as_text(row['overview']).lower()
    fallback = pd.NA

    if overview in ('', 'no overview'):
        # Prefer keywords; fall back to genres; stay NA when both are empty.
        fallback = (
            _field_as_text(row['keywords'])
            or _field_as_text(row['genres'])
            or pd.NA
        )

    result['extracted_text'] = fallback
    return result

df = df.apply(replace_with_keywords, axis =1)

In [16]:
mask = (df['extracted_text'].notna())

df.loc[mask, 'extracted_text'].head()

364               drama
422               radio
482              comedy
534              comedy
545    horror, thriller
Name: extracted_text, dtype: object

In [17]:
# Select the rows whose extracted_text is not missing (boolean-mask indexing
# builds a new DataFrame, not a view) and print the non-null count per column.
filtered_df = df[df['extracted_text'].notna()]
print(filtered_df.count())

movie_id          81
title             81
overview          81
genres            81
cast              81
keywords          81
runtime           81
release_year      81
language          81
vote_average      81
vote_count        81
extracted_text    81
dtype: int64


In [18]:
from thefuzz import fuzz, process

def get_unique_fuzzy_keywords(input_str, threshold=70):
    """Drop near-duplicate keywords from a comma-separated string.

    The string is split on ",", blank pieces are discarded and the remaining
    keywords are processed longest first. A keyword is kept only when its
    best ``fuzz.token_set_ratio`` score against the keywords already kept is
    below ``threshold``.

    Args:
        input_str: Comma-separated keywords.
        threshold: Score at or above which a keyword counts as a duplicate.

    Returns:
        list: The kept keywords, ordered longest first.
    """
    candidates = sorted(
        (piece.strip() for piece in input_str.split(',') if piece.strip()),
        key=len,
        reverse=True,
    )

    accepted = []
    for candidate in candidates:
        # Nothing to compare against yet: the first keyword is always kept.
        if accepted:
            # extractOne returns (best_match, score) for a list of choices.
            _match, score = process.extractOne(
                candidate, accepted, scorer=fuzz.token_set_ratio
            )
            if score >= threshold:
                continue
        accepted.append(candidate)

    return accepted

# Example Usage
# data = "sports, basketball, national basketball association (nba)"
# result = get_unique_fuzzy_keywords(data)

# print(result) 
# Output: ['national basketball association (nba)', 'sports']  (longest keyword first)

In [19]:
# Rows that received a fallback text (extracted_text is not missing).
mask = (df['extracted_text'].notna())

# Replace each fallback string with its de-duplicated keyword list; after
# this step the masked cells hold Python lists instead of strings.
df.loc[mask,'extracted_text']= df.loc[mask,'extracted_text'].apply(get_unique_fuzzy_keywords)

In [20]:
df.loc[mask,'extracted_text'].head(5)

364               [drama]
422               [radio]
482              [comedy]
534              [comedy]
545    [thriller, horror]
Name: extracted_text, dtype: object

In [21]:
def keywords_to_plot(keywords: list):
    """Turn a keyword list into a one-line pseudo plot description.

    Returns 'a movie that is about: ' followed by the keywords joined with
    "," (no space). Returns None when ``keywords`` is not a list.
    """
    if not isinstance(keywords, list):
        return None
    return 'a movie that is about: ' + ','.join(keywords)
    

df.loc[mask,'extracted_text']= df.loc[mask,'extracted_text'].apply(keywords_to_plot)

df.loc[mask,'extracted_text'].head(5)

    



364              a movie that is about: drama
422              a movie that is about: radio
482             a movie that is about: comedy
534             a movie that is about: comedy
545    a movie that is about: thriller,horror
Name: extracted_text, dtype: object

In [22]:
import pandas as pd
import numpy as np

def _is_missing_text(value):
    """Return True for blank strings and for scalar missing values (None, NaN, pd.NA)."""
    if isinstance(value, str):
        return not value.strip()
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Non-scalar values (e.g. a list) make pd.isna return an array whose
        # truth value is ambiguous; treat them as present.
        return False


def new_overview(row):
    """Add a 'new_overview' entry that prefers the real overview text.

    * A usable overview is copied unchanged.
    * A missing overview (blank, NaN/None, or the 'no overview' placeholder,
      matched case-insensitively with surrounding whitespace ignored) is
      replaced by 'extracted_text' when that holds usable text.
    * Otherwise the value is the placeholder 'no overview'.

    Args:
        row: A Series with 'overview' and 'extracted_text' entries.

    Returns:
        A copy of ``row`` with a 'new_overview' entry. The input row itself
        is not modified.
    """
    # Work on a copy: mutating the object handed out by DataFrame.apply is
    # not supported by pandas.
    result = row.copy()
    overview = row['overview']
    extracted_text = row['extracted_text']

    overview_missing = _is_missing_text(overview) or (
        isinstance(overview, str) and overview.strip().lower() == 'no overview'
    )

    if not overview_missing:
        result['new_overview'] = overview
    elif _is_missing_text(extracted_text):
        # No overview and no fallback: keep the placeholder.
        result['new_overview'] = 'no overview'
    else:
        result['new_overview'] = extracted_text

    return result

# Specify axis=1 to process by row
df = df.apply(new_overview, axis=1)

In [23]:
df[df['new_overview']=='no overview'].count()

movie_id          13
title             13
overview          13
genres            13
cast              13
keywords          13
runtime           13
release_year      13
language          13
vote_average      13
vote_count        13
extracted_text     0
new_overview      13
dtype: int64

In [24]:
# Drop the rows whose new_overview is still the 'no overview' placeholder,
# i.e. keep every row for which the comparison below is False.
df = df[~(df['new_overview'] == 'no overview')]


In [25]:
df.count()

movie_id          7987
title             7987
overview          7987
genres            7987
cast              7987
keywords          7987
runtime           7987
release_year      7987
language          7987
vote_average      7987
vote_count        7987
extracted_text      81
new_overview      7987
dtype: int64

In [26]:
df['new_overview'].tail(5)

7995    an editor and his daughter work to restore a f...
7996    a smalltown police chief pursues a serial kill...
7997    mikado and laetitia lead an alternative lifest...
7998    emperor philippa georgiou joins a secret divis...
7999    julien and marie have enjoyed years of seeming...
Name: new_overview, dtype: object

In [27]:
df.columns

Index(['movie_id', 'title', 'overview', 'genres', 'cast', 'keywords',
       'runtime', 'release_year', 'language', 'vote_average', 'vote_count',
       'extracted_text', 'new_overview'],
      dtype='object')

In [28]:
# Columns carried into the cleaned dataset. Of the columns listed by
# df.columns, the original 'overview' and the intermediate 'extracted_text'
# are left out; 'new_overview' takes their place.
columns_to_keep = [
    'movie_id', 'title', 'genres', 'cast', 'keywords', 
    'runtime', 'release_year', 'language', 
    'vote_average', 'vote_count', 'new_overview'
]

new_df = df[columns_to_keep]

In [29]:
new_df.head(2)

Unnamed: 0,movie_id,title,genres,cast,keywords,runtime,release_year,language,vote_average,vote_count,new_overview
0,360920,The Grinch,"family, comedy, animation","Benedict Cumberbatch, Rashida Jones, Kenan Tho...","holiday, surrealism, remake, based on children...",85,2018,english,6.87,4327,the grinch hatches a scheme to ruin christmas ...
1,299536,Avengers: Infinity War,"adventure, action, science fiction","Robert Downey Jr., Chris Evans, Chris Hemsworth","sacrifice, magic, superhero, based on comic, s...",149,2018,english,8.235,31250,as the avengers and their allies have continue...


In [30]:
new_df= new_df.rename(columns={'new_overview':'overview'})

In [31]:
new_df['embedding_text']= new_df['overview']

In [32]:
new_df.iloc[67:74]

Unnamed: 0,movie_id,title,genres,cast,keywords,runtime,release_year,language,vote_average,vote_count,overview,embedding_text
67,450765,Bad Investigate,"crime, action, comedy, thriller, drama, mystery","Francisco Menezes, Lu√≠s Ismael, Enrique Arce",,120,2018,portuguese,6.6,14,"romeu, a corrupt cop, hires the services of ci...","romeu, a corrupt cop, hires the services of ci..."
68,507076,Climax,"horror, drama","Sofia Boutella, Romain Guillermic, Souheila Ya...","dancing, child abuse, sexual abuse, drug abuse...",97,2018,french,7.048,2272,when a dance troupe is lured to an empty schoo...,when a dance troupe is lured to an empty schoo...
69,492459,Dilan 1990,"drama, romance","Iqbaal Ramadhan, Vanesha Prescilla, Debo Andrios","high school, based on novel or book, love, sch...",109,2018,indonesian,7.7,34,"at a bandung high school, charming and rebelli...","at a bandung high school, charming and rebelli..."
70,535809,A Shoe Addict's Christmas,"romance, drama, tv movie, fantasy","Candace Cameron Bure, Luke Macfarlane, Jean Smart","holiday, guardian angel, christmas",90,2018,english,6.2,76,"as christmas approaches, a department store wo...","as christmas approaches, a department store wo..."
71,491584,Burning,"mystery, drama, thriller","Yoo Ah-in, Steven Yeun, Jun Jong-seo","fire, jealousy, dreams, countryside, love tria...",148,2018,korean,7.375,1739,deliveryman jongsu is out on a job when he run...,deliveryman jongsu is out on a job when he run...
72,466411,Speed Kills,"crime, drama, thriller","John Travolta, Katheryn Winnick, Jennifer Espo...","based on novel or book, sports, based on true ...",102,2018,english,5.431,138,speedboat racing champion and multimillionaire...,speedboat racing champion and multimillionaire...
73,399402,Hunter Killer,"action, adventure, thriller","Gerard Butler, Gary Oldman, Toby Stephens","sniper, washington dc, usa, submarine, pentago...",121,2018,english,6.9,2300,captain glass of the uss arkansas discovers th...,captain glass of the uss arkansas discovers th...


In [33]:
new_df.columns

Index(['movie_id', 'title', 'genres', 'cast', 'keywords', 'runtime',
       'release_year', 'language', 'vote_average', 'vote_count', 'overview',
       'embedding_text'],
      dtype='object')

In [34]:
# Sanity check on the text column: row count, missing values, and non-missing
# values that are not Python strings.
# Note that empty strings count as valid strings here and are not reported.
col = new_df["embedding_text"]

print("Total rows:", len(col))
print("NaNs:", col.isna().sum())
print("Non-strings:", sum(not isinstance(x, str) for x in col if pd.notna(x)))


Total rows: 7987
NaNs: 0
Non-strings: 0


In [35]:
import os

os.makedirs("../datasets/cleaned", exist_ok=True)
new_df.to_csv("../datasets/cleaned/movies_cleaned_2.csv", index=False)

In [36]:
import sys
from pathlib import Path

# Use the parent of the current working directory as the project root and
# append it to sys.path.
# NOTE(review): presumably this lets the later `from src...` imports resolve;
# it assumes the kernel was started one level below the project root - confirm.
PROJECT_ROOT = Path.cwd().parents[0]  # moviebot/
sys.path.append(str(PROJECT_ROOT))

In [37]:
from src.data.movie_repository import MovieRepository

repo = MovieRepository()
repo.load()

df = repo.get_all_movies()
df.head()
repo.filter_movies(language=["english"], max_runtime=120).head()


Unnamed: 0,movie_id,title,genres,cast,keywords,runtime,release_year,language,vote_average,vote_count,overview,embedding_text
2,750809,Counterintelligence,"action, comedy, thriller","Sharae Foxie, Brian Adrian Koch, Marcella Laasch",,89,2020,english,5.0,2,"two days after the u.s. presidential election,...","two days after the u.s. presidential election,..."
5,717634,Girl,thriller,"Bella Thorne, Mickey Rourke, Glen Gould","daughter, small town, sheriff, father murder, ...",90,2020,english,6.0,227,a young woman returns to her small hometown in...,a young woman returns to her small hometown in...
6,653744,Sergio,drama,"Wagner Moura, Ana de Armas, Garret Dillahunt",,118,2020,english,6.562,347,a sweeping drama set in the chaotic aftermath ...,a sweeping drama set in the chaotic aftermath ...
7,735220,Borderline Coffee,comedy,"Susanna Stahlmann, Elizabeth Stahlmann, Jeff A...",,10,2020,english,10.0,1,"""coffee"" asks the mysterious blue postit that ...","""coffee"" asks the mysterious blue postit that ..."
8,1293879,Fleeting Nirvana,"drama, fantasy","Michael A. Phoenix, Allison Pittel",,12,2020,english,0.0,0,"minute short film inspired by true events, you...","minute short film inspired by true events, you..."


In [40]:
# MovieRepository is imported, instantiated and loaded again here, repeating
# what the previous cell already did.
from src.data.movie_repository import MovieRepository
from src.models.embedding_model import EmbeddingModel
from src.index.index_builder import IndexBuilder

repo = MovieRepository()
repo.load()

embedder = EmbeddingModel()

# NOTE(review): presumably build() embeds the repository's movies and builds
# a search index - confirm against IndexBuilder.
builder = IndexBuilder(
    repository=repo,
    embedding_model=embedder
)

# The recorded run below was interrupted (KeyboardInterrupt) after 1 of 157
# batches, so the saved output does not show a completed build.
builder.build()


Batches:   1%|          | 1/157 [00:11<31:05, 11.96s/it]


KeyboardInterrupt: 