In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import lzma

In [8]:
# Cell 2: Load and merge data
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Join on the movie id instead of the title: titles are not unique, so a
# title join pairs every same-titled movie with every same-titled credits row
# and attaches the wrong cast/crew to some of them.
# NOTE(review): assumes the standard TMDB 5000 schema, i.e. `id` in the movies
# file and `movie_id` in the credits file -- confirm against the CSV headers.
# `title` is dropped from credits so the merged frame keeps a single `title`.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

# Select necessary columns; take a copy so later column assignments act on an
# independent frame rather than on a slice of the merged one.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

In [9]:
# Cell 3: Define helper functions
def convert(text):
    """Parse a stringified list of dicts and return each entry's ``'name'``.

    ``text`` is evaluated with ``ast.literal_eval``; the names are returned
    in the order the entries appear.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]

def fetch_director(text):
    """Return the names of all entries whose ``'job'`` is ``'Director'``.

    ``text`` is a stringified list of dicts, parsed with ``ast.literal_eval``.
    """
    crew_entries = ast.literal_eval(text)
    return [member['name'] for member in crew_entries if member['job'] == 'Director']

def collapse(L):
    """Return a new list with every space removed from each string in ``L``."""
    squeezed = []
    for name in L:
        squeezed.append(name.replace(" ", ""))
    return squeezed

In [4]:
# Cell 4: Define conversion functions (redefines the helpers from the previous cell; these definitions shadow the ones above)
def convert(text):
    """Extract the ``'name'`` field from every dict in a stringified list.

    The string is parsed with ``ast.literal_eval`` and the names are
    returned in their original order.
    """
    parsed = ast.literal_eval(text)
    return [item['name'] for item in parsed]

def fetch_director(text):
    """Collect the ``'name'`` of each entry whose ``'job'`` equals ``'Director'``.

    ``text`` is a stringified list of dicts, parsed with ``ast.literal_eval``.
    """
    return [
        person['name']
        for person in ast.literal_eval(text)
        if person['job'] == 'Director'
    ]

def collapse(L):
    """Strip every space out of each string in ``L`` and return the new list."""
    return [item.replace(" ", "") for item in L]

In [10]:
# Cell 4: Process data
TOP_CAST = 3  # number of leading cast members kept per movie

# Clean data: drop rows with any missing value, then renumber the rows so the
# index labels equal row positions. The similarity matrix built from this data
# is addressed by position, so a gappy index would point label-based lookups
# at the wrong row. Done without `inplace` because `movies` is a column subset
# of the merged frame.
movies = movies.dropna().reset_index(drop=True)

# NOTE: this cell is not re-runnable on its own. After one run the columns
# hold lists, and `ast.literal_eval` inside `convert` rejects non-string
# input; re-run the load cell first.

# Convert string representations to lists, then remove the spaces inside each
# name so a multi-word name stays one word once the tags are joined with " ".
movies['genres'] = movies['genres'].apply(convert).apply(collapse)
movies['keywords'] = movies['keywords'].apply(convert).apply(collapse)
movies['cast'] = movies['cast'].apply(convert).apply(lambda names: collapse(names[:TOP_CAST]))
movies['crew'] = movies['crew'].apply(fetch_director).apply(collapse)

# Create tags: overview words followed by genres, keywords, cast and directors
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [11]:
# Cell 5: Create final DataFrame
# Same columns as `movies` with `movie_id` exposed as `id`, and the token
# list in `tags` flattened into one space-separated string per movie.
final_movies = (
    movies[['movie_id', 'title', 'genres', 'overview', 'keywords', 'cast', 'crew']]
    .rename(columns={'movie_id': 'id'})
    .assign(tags=movies['tags'].apply(" ".join))
)

In [13]:
# Cell 6: Create similarity matrix
# Bag-of-words counts over the tag strings (English stop words removed,
# vocabulary capped at 5000 terms), then pairwise cosine similarity.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(final_movies['tags'])
vectors = tag_counts.toarray()
similarity = cosine_similarity(vectors)

In [None]:
# Cell 7: Save files
# Processed movie table, stored as a plain dict of columns
movie_payload = final_movies.to_dict()
with open('movie_dict.pickle', 'wb') as file:
    pickle.dump(movie_payload, file)

# Similarity matrix, pickled into an LZMA-compressed stream
with lzma.open('similarity3.xz', 'wb') as file:
    pickle.dump(similarity, file)

print("Files saved successfully!")

In [None]:
# Cell 8: Test recommendations
def recommend_test(movie, top_n=5, movies_df=None, sim=None):
    """Return the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    top_n : int, default 5
        Maximum number of recommendations to return.
    movies_df : pandas.DataFrame, optional
        Frame with a ``title`` column; its row order must match the rows of
        ``sim``. Defaults to the notebook-level ``final_movies``.
    sim : 2-D array, optional
        Pairwise similarity matrix. Defaults to the notebook-level
        ``similarity``.

    Returns
    -------
    pandas.Series
        Titles of the most similar movies, best match first.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    if movies_df is None:
        movies_df = final_movies
    if sim is None:
        sim = similarity

    # Use the row *position*, not the index label: the similarity matrix is
    # positional, and labels stop matching positions once rows are dropped.
    matches = np.flatnonzero((movies_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie titled {movie!r} in the dataset")
    pos = int(matches[0])

    ranked = sorted(enumerate(sim[pos]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by position rather than assuming it sorts first;
    # another row with an equal score could otherwise take its place.
    movie_indices = [i for i, _ in ranked if i != pos][:top_n]
    return movies_df['title'].iloc[movie_indices]

# Smoke-test the recommender with a known title
demo_title = 'Avatar'
print("\nTest recommendations for 'Avatar':")
print(recommend_test(demo_title))

In [None]:
# Cell 10: Test recommendation (same check as Cell 8; redefines recommend_test)
def recommend_test(movie, top_n=5, movies_df=None, sim=None):
    """Return the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    top_n : int, default 5
        Maximum number of recommendations to return.
    movies_df : pandas.DataFrame, optional
        Frame with a ``title`` column; its row order must match the rows of
        ``sim``. Defaults to the notebook-level ``final_movies``.
    sim : 2-D array, optional
        Pairwise similarity matrix. Defaults to the notebook-level
        ``similarity``.

    Returns
    -------
    pandas.Series
        Titles of the most similar movies, best match first.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    if movies_df is None:
        movies_df = final_movies
    if sim is None:
        sim = similarity

    # Use the row *position*, not the index label: the similarity matrix is
    # positional, and labels stop matching positions once rows are dropped.
    matches = np.flatnonzero((movies_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie titled {movie!r} in the dataset")
    pos = int(matches[0])

    ranked = sorted(enumerate(sim[pos]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by position rather than assuming it sorts first;
    # another row with an equal score could otherwise take its place.
    movie_indices = [i for i, _ in ranked if i != pos][:top_n]
    return movies_df['title'].iloc[movie_indices]

# Smoke-test the recommender with a known title
demo_title = "Avatar"
print("\nRecommendations for 'Avatar':")
print(recommend_test(demo_title))

In [None]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import lzma

# Cell 2: Load and merge data
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Join on the movie id instead of the title: titles are not unique, so a
# title join pairs every same-titled movie with every same-titled credits row
# and attaches the wrong cast/crew to some of them.
# NOTE(review): assumes the standard TMDB 5000 schema, i.e. `id` in the movies
# file and `movie_id` in the credits file -- confirm against the CSV headers.
# `title` is dropped from credits so the merged frame keeps a single `title`.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

# Select necessary columns; take a copy so later column assignments act on an
# independent frame rather than on a slice of the merged one.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

# Define conversion functions
def convert(text):
    """Return the ``'name'`` of every dict in the stringified list ``text``.

    Parsing is done with ``ast.literal_eval``; order is preserved.
    """
    records = ast.literal_eval(text)
    names = [record['name'] for record in records]
    return names

def fetch_director(text):
    """Return the names of the entries in ``text`` whose ``'job'`` is ``'Director'``.

    ``text`` is a stringified list of dicts, parsed with ``ast.literal_eval``.
    """
    directors = [
        credit['name']
        for credit in ast.literal_eval(text)
        if credit['job'] == 'Director'
    ]
    return directors

# Clean data: drop rows with any missing value, then renumber the rows so the
# index labels equal row positions. The similarity matrix built below is
# addressed by position, so a gappy index would point label-based lookups at
# the wrong row. Done without `inplace` because `movies` is a column subset of
# the merged frame.
movies = movies.dropna().reset_index(drop=True)

TOP_CAST = 3  # number of leading cast members kept per movie


def _squash(names):
    """Remove the spaces inside each name so it stays one word in the tags."""
    return [name.replace(" ", "") for name in names]


# Convert string representations to lists, then remove spaces from each name.
# The space removal matches the step-by-step cells earlier in the notebook, so
# both code paths write the same tags to the same output files.
movies['genres'] = movies['genres'].apply(convert).apply(_squash)
movies['keywords'] = movies['keywords'].apply(convert).apply(_squash)
movies['cast'] = movies['cast'].apply(convert).apply(lambda names: _squash(names[:TOP_CAST]))
movies['crew'] = movies['crew'].apply(fetch_director).apply(_squash)

# Create tags: overview words followed by genres, keywords, cast and directors
movies['overview'] = movies['overview'].apply(lambda x: x.split())
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create final DataFrame
# Same columns as `movies` with `movie_id` exposed as `id`, and the token
# list in `tags` flattened into one space-separated string per movie.
final_movies = (
    movies[['movie_id', 'title', 'genres', 'overview', 'keywords', 'cast', 'crew']]
    .rename(columns={'movie_id': 'id'})
    .assign(tags=movies['tags'].apply(" ".join))
)

# Create similarity matrix
# Bag-of-words counts over the tag strings (English stop words removed,
# vocabulary capped at 5000 terms), then pairwise cosine similarity.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(final_movies['tags'])
vectors = tag_counts.toarray()
similarity = cosine_similarity(vectors)

# Processed movie table, stored as a plain dict of columns
movie_payload = final_movies.to_dict()
with open('movie_dict.pickle', 'wb') as file:
    pickle.dump(movie_payload, file)

# Similarity matrix, pickled into an LZMA-compressed stream
with lzma.open('similarity3.xz', 'wb') as file:
    pickle.dump(similarity, file)

print("Files saved successfully!")

# Test recommendations
def recommend_test(movie, top_n=5, movies_df=None, sim=None):
    """Return the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    top_n : int, default 5
        Maximum number of recommendations to return.
    movies_df : pandas.DataFrame, optional
        Frame with a ``title`` column; its row order must match the rows of
        ``sim``. Defaults to the notebook-level ``final_movies``.
    sim : 2-D array, optional
        Pairwise similarity matrix. Defaults to the notebook-level
        ``similarity``.

    Returns
    -------
    pandas.Series
        Titles of the most similar movies, best match first.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    if movies_df is None:
        movies_df = final_movies
    if sim is None:
        sim = similarity

    # Use the row *position*, not the index label: the similarity matrix is
    # positional, and labels stop matching positions once rows are dropped.
    matches = np.flatnonzero((movies_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie titled {movie!r} in the dataset")
    pos = int(matches[0])

    ranked = sorted(enumerate(sim[pos]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by position rather than assuming it sorts first;
    # another row with an equal score could otherwise take its place.
    movie_indices = [i for i, _ in ranked if i != pos][:top_n]
    return movies_df['title'].iloc[movie_indices]

# Smoke-test the recommender with a known title
demo_title = 'Avatar'
print("\nTest recommendations for 'Avatar':")
print(recommend_test(demo_title))