In [77]:
import pandas as pd
import os
import ast

In [78]:
def preprocess():
    """
    Loads TMDB movies and credits dataset.
    Expected file structure:
    data/movies.csv
    data/credits.csv
    """

    movies_path = "../data/tmdb_5000_movies.csv"
    credits_path = "../data/tmdb_5000_credits.csv"

    if not os.path.exists(movies_path) or not os.path.exists(credits_path):
        raise FileNotFoundError("Movies or Credits file missing in data/ folder")

    movies_df = pd.read_csv(movies_path)
    credits_df = pd.read_csv(credits_path)

    df = movies_df.merge(credits_df, on="title")
    df.head()

    df = df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    df.head()

    df['genres'] = df['genres'].apply(convert)
    df['keywords'] = df['keywords'].apply(convert)

    df['cast'] = df['cast'].apply(convert_cast)

    df['crew'] = df['crew'].apply(fetch_director)

    df['overview'] = df['overview'].astype(str).apply(lambda x: x.split())

    df['genres'] = df['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
    df['keywords'] = df['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
    df['cast'] = df['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
    df['crew'] = df['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

    print("Data loaded successfully:")
    print("Movies shape:", movies_df.shape)
    print("Credits shape:", credits_df.shape)
    print("Merged shape:", df.shape)

    return df

def convert(obj):
    L = []
    for item in ast.literal_eval(obj):
        L.append(item['name'])
    return L

def convert_cast(obj):
    L = []
    for i, item in enumerate(ast.literal_eval(obj)):
        if i < 3:
            L.append(item['name'])
    return L

def fetch_director(obj):
    for item in ast.literal_eval(obj):
        if item['job'] == 'Director':
            return [item['name']]
    return []

In [79]:
df = preprocess()
df.head()

Data loaded successfully:
Movies shape: (4803, 20)
Credits shape: (4803, 4)
Merged shape: (4809, 7)


Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bondâ€™s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LÃ©aSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [80]:
df[['title','crew']][df['movie_id']==285]

Unnamed: 0,title,crew
1,Pirates of the Caribbean: At World's End,[GoreVerbinski]


In [81]:
import numpy as np
l=[[1,2,3],
   [0,8,4]]
l[1][2]

n=np.array(l)
n[1,1]

np.int64(8)

In [82]:
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']

In [83]:
df['tags'] = df['tags'].apply(lambda x: " ".join(x))

In [84]:
df['tags'] = df['tags'].apply(lambda x: x.lower())

In [85]:
df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,"[A, cryptic, message, from, Bondâ€™s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LÃ©aSeydoux]",[SamMendes],a cryptic message from bondâ€™s past sends him o...
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],following the death of district attorney harve...
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"john carter is a war-weary, former military ca..."


In [86]:
df = df[['movie_id','title','tags']]

In [87]:
df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bondâ€™s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [88]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [89]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(df['tags']).toarray()

In [90]:
vector[1000].sum()

np.int64(49)

In [91]:
similarity = cosine_similarity(vector)

In [92]:
similarity

array([[1.        , 0.08980265, 0.05986843, ..., 0.0248452 , 0.02635231,
        0.        ],
       [0.08980265, 1.        , 0.06451613, ..., 0.02677398, 0.        ,
        0.        ],
       [0.05986843, 0.06451613, 1.        , ..., 0.02677398, 0.        ,
        0.        ],
       ...,
       [0.0248452 , 0.02677398, 0.02677398, ..., 1.        , 0.07071068,
        0.04836508],
       [0.02635231, 0.        , 0.        , ..., 0.07071068, 1.        ,
        0.05129892],
       [0.        , 0.        , 0.        , ..., 0.04836508, 0.05129892,
        1.        ]])

In [96]:
def recommend(movie):
    movie = movie.lower()
    if movie not in df['title'].str.lower().values:
        print("Movie not found in database.")
        return
    
    # get index of the movie
    idx = df[df['title'].str.lower() == movie].index[0]
    
    # get list of similarity scores
    distances = similarity[idx]
    
    # sort the movies based on similarity in descending order
    movies_list = sorted(
        list(enumerate(distances)), 
        reverse=True, 
        key=lambda x: x[1]
    )
    
    # print top 5 recommended movies (skip the first one: itself)
    recommended = [df.iloc[i[0]].title for i in movies_list[1:6]]
    return recommended


In [97]:
recommend("back to the future")

['Back to the Future Part II',
 'Back to the Future Part III',
 'Tomorrowland',
 'Jimmy Neutron: Boy Genius',
 'Date Movie']

In [99]:
import pickle


In [100]:
with open('../data/movies.pkl', 'wb') as f:
    pickle.dump(df, f)


In [101]:
with open('../data/similarity.pkl', 'wb') as f:
    pickle.dump(similarity, f)


In [102]:
with open('../data/cv.pkl', 'wb') as f:
    pickle.dump(cv, f)


In [103]:
import streamlit as st

ModuleNotFoundError: No module named 'streamlit'

In [None]:
df = pickle.load(open('../data/movies.pkl', 'rb'))
similarity = pickle.load(open('../data/similarity.pkl', 'rb'))


In [98]:
st.title("ðŸŽ¬ Movie Recommendation System")

# Dropdown with all movie titles
selected_movie = st.selectbox("Select a movie:", df['title'].values)

if st.button("Show Recommendations"):
    recommendations = recommend(selected_movie)
    st.subheader("Top 5 Recommended Movies:")
    for i, movie in enumerate(recommendations):
        st.write(f"{i+1}. {movie}")


NameError: name 'st' is not defined

In [None]:
!streamlit run ../src/app.py