In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/movie_reco/


Mounted at /content/drive
/content/drive/MyDrive/movie_reco


In [None]:
import pandas as pd
import numpy as np

movies = pd.read_csv('movies_metadata.csv', low_memory=False)
keywords = pd.read_csv('keywords.csv')
credits = pd.read_csv('credits.csv')
links = pd.read_csv('links.csv')
links_small = pd.read_csv('links_small.csv')
ratings = pd.read_csv('ratings_small.csv')

print(movies.head())
print(keywords.head())
print(credits.head())
print(links.head())
print(links_small.head())
print(ratings.head())


   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497         

In [None]:
movies = movies[['id', 'title', 'genres', 'release_date', 'runtime', 'vote_average', 'vote_count']]

movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
movies = movies.dropna(subset=['id'])
movies['id'] = movies['id'].astype(int)

movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')
movies['release_date'] = movies['release_date'].fillna(pd.to_datetime('1900-01-01'))

import ast

def parse_genres(genre_str):
    try:
        genres = ast.literal_eval(genre_str)
        genre_names = [genre['name'] for genre in genres]
        return ', '.join(genre_names)
    except:
        return ''

movies['genres'] = movies['genres'].apply(parse_genres)

print(movies.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['id'] = pd.to_numeric(movies['id'], errors='coerce')


      id                        title                      genres  \
0    862                    Toy Story   Animation, Comedy, Family   
1   8844                      Jumanji  Adventure, Fantasy, Family   
2  15602             Grumpier Old Men             Romance, Comedy   
3  31357            Waiting to Exhale      Comedy, Drama, Romance   
4  11862  Father of the Bride Part II                      Comedy   

  release_date  runtime  vote_average  vote_count  
0   1995-10-30     81.0           7.7      5415.0  
1   1995-12-15    104.0           6.9      2413.0  
2   1995-12-22    101.0           6.5        92.0  
3   1995-12-22    127.0           6.1        34.0  
4   1995-02-10    106.0           5.7       173.0  


In [None]:
keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')
keywords = keywords.dropna(subset=['id'])
keywords['id'] = keywords['id'].astype(int)

def parse_keywords(keyword_str):
    try:
        keywords = ast.literal_eval(keyword_str)
        keyword_names = [keyword['name'] for keyword in keywords]
        return ', '.join(keyword_names)
    except:
        return ''

keywords['keywords'] = keywords['keywords'].apply(parse_keywords)

print(keywords.head())


      id                                           keywords
0    862  jealousy, toy, boy, friendship, friends, rival...
1   8844  board game, disappearance, based on children's...
2  15602  fishing, best friend, duringcreditsstinger, ol...
3  31357  based on novel, interracial relationship, sing...
4  11862  baby, midlife crisis, confidence, aging, daugh...


In [None]:
credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
credits = credits.dropna(subset=['id'])
credits['id'] = credits['id'].astype(int)

def get_director(crew_data):
    try:
        crew = ast.literal_eval(crew_data)
        for member in crew:
            if member['job'] == 'Director':
                return member['name']
        return ''
    except:
        return ''

def get_top_cast(cast_data, top_n=3):
    try:
        cast = ast.literal_eval(cast_data)
        cast_names = [member['name'] for member in cast[:top_n]]
        return ', '.join(cast_names)
    except:
        return ''

credits['director'] = credits['crew'].apply(get_director)
credits['top_cast'] = credits['cast'].apply(get_top_cast)

credits = credits.drop(columns=['cast', 'crew'])

print(credits.head())


      id         director                                         top_cast
0    862    John Lasseter                Tom Hanks, Tim Allen, Don Rickles
1   8844     Joe Johnston     Robin Williams, Jonathan Hyde, Kirsten Dunst
2  15602    Howard Deutch         Walter Matthau, Jack Lemmon, Ann-Margret
3  31357  Forest Whitaker  Whitney Houston, Angela Bassett, Loretta Devine
4  11862    Charles Shyer         Steve Martin, Diane Keaton, Martin Short


In [None]:
links['tmdbId'] = pd.to_numeric(links['tmdbId'], errors='coerce')
links_small['tmdbId'] = pd.to_numeric(links_small['tmdbId'], errors='coerce')

print(links.head())
print(links_small.head())


   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [None]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

print(ratings.head())


   userId  movieId  rating           timestamp
0       1       31     2.5 2009-12-14 02:52:24
1       1     1029     3.0 2009-12-14 02:52:59
2       1     1061     3.0 2009-12-14 02:53:02
3       1     1129     2.0 2009-12-14 02:53:05
4       1     1172     4.0 2009-12-14 02:53:25


In [None]:
movies_with_credits = pd.merge(movies, credits, left_on='id', right_on='id', how='left')

movies_with_metadata = pd.merge(movies_with_credits, keywords, left_on='id', right_on='id', how='left')

movies_with_metadata = pd.merge(movies_with_metadata, links_small, left_on='id', right_on='tmdbId', how='left')

merged_data = pd.merge(ratings, movies_with_metadata, left_on='movieId', right_on='movieId', how='left')

print(merged_data.head())


   userId  movieId  rating           timestamp       id                 title  \
0       1       31     2.5 2009-12-14 02:52:24   9909.0       Dangerous Minds   
1       1     1029     3.0 2009-12-14 02:52:59  11360.0                 Dumbo   
2       1     1061     3.0 2009-12-14 02:53:02    819.0              Sleepers   
3       1     1129     2.0 2009-12-14 02:53:05   1103.0  Escape from New York   
4       1     1172     4.0 2009-12-14 02:53:25  11216.0       Cinema Paradiso   

                    genres release_date  runtime  vote_average  vote_count  \
0             Drama, Crime   1995-08-11     99.0           6.4       249.0   
1        Animation, Family   1941-10-22     64.0           6.8      1206.0   
2   Crime, Drama, Thriller   1996-10-18    147.0           7.3       729.0   
3  Science Fiction, Action   1981-05-22     99.0           6.9       720.0   
4           Drama, Romance   1988-11-17    124.0           8.2       834.0   

             director                       

In [None]:
print(merged_data.isnull().sum())

merged_data['genres'] = merged_data['genres'].fillna('Unknown')
merged_data['director'] = merged_data['director'].fillna('Unknown')
merged_data['top_cast'] = merged_data['top_cast'].fillna('Unknown')
merged_data['keywords'] = merged_data['keywords'].fillna('')

merged_data = merged_data.dropna(subset=['rating', 'title'])

print(merged_data.isnull().sum())

userId            0
movieId           0
rating            0
timestamp         0
id              194
title           194
genres          194
release_date    194
runtime         194
vote_average    194
vote_count      194
director        194
top_cast        194
keywords        194
imdbId          194
tmdbId          194
dtype: int64
userId          0
movieId         0
rating          0
timestamp       0
id              0
title           0
genres          0
release_date    0
runtime         0
vote_average    0
vote_count      0
director        0
top_cast        0
keywords        0
imdbId          0
tmdbId          0
dtype: int64


In [None]:
C = merged_data['vote_average'].mean()
m = merged_data['vote_count'].quantile(0.90)

def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

merged_data['weighted_rating'] = merged_data.apply(weighted_rating, axis=1)

print(merged_data.head())


   userId  movieId  rating           timestamp       id                 title  \
0       1       31     2.5 2009-12-14 02:52:24   9909.0       Dangerous Minds   
1       1     1029     3.0 2009-12-14 02:52:59  11360.0                 Dumbo   
2       1     1061     3.0 2009-12-14 02:53:02    819.0              Sleepers   
3       1     1129     2.0 2009-12-14 02:53:05   1103.0  Escape from New York   
4       1     1172     4.0 2009-12-14 02:53:25  11216.0       Cinema Paradiso   

                    genres release_date  runtime  vote_average  vote_count  \
0             Drama, Crime   1995-08-11     99.0           6.4       249.0   
1        Animation, Family   1941-10-22     64.0           6.8      1206.0   
2   Crime, Drama, Thriller   1996-10-18    147.0           7.3       729.0   
3  Science Fiction, Action   1981-05-22     99.0           6.9       720.0   
4           Drama, Romance   1988-11-17    124.0           8.2       834.0   

             director                       

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

import nltk
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    words = word_tokenize(text)
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

merged_data['combined_features'] = (
    merged_data['genres'].fillna('') + ' ' +
    merged_data['director'].fillna('') + ' ' +
    merged_data['top_cast'].fillna('') + ' ' +
    merged_data['keywords'].fillna('')).apply(preprocess_text)

merged_data = merged_data.drop(columns=['genres', 'director', 'top_cast', 'keywords'])


tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(merged_data['combined_features'])

svd = TruncatedSVD(n_components=100)
tfidf_matrix_reduced = svd.fit_transform(tfidf_matrix)

cosine_sim = cosine_similarity(tfidf_matrix_reduced, tfidf_matrix_reduced)


In [None]:
def get_recommendations(titles, cosine_sim=cosine_sim, data=merged_data, top_n=10):
    recommended_titles = []

    for title in titles:
        if title in data['title'].values:
            idx = data[data['title'] == title].index[0]

            sim_scores = list(enumerate(cosine_sim[idx]))

            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

            sim_scores = sim_scores[1:top_n + 1]

            movie_indices = [i[0] for i in sim_scores]

            recommended_titles.extend(data['title'].iloc[movie_indices].drop_duplicates().tolist())
        else:
            print(f"Title '{title}' not found in the dataset.")

    recommended_titles = list(dict.fromkeys(recommended_titles))

    return recommended_titles[:top_n]


In [None]:
titles = ['The Godfather', 'The Dark Knight', 'Pulp Fiction']
recommendations = get_recommendations(titles)
print(recommendations)
print("\n")
print(len(recommendations))
print("\n")

['Vertigo', 'No End in Sight', 'loudQUIETloud: A Film About the Pixies', 'Powaqqatsi', 'Naqoyqatsi', 'It Might Get Loud', 'Beauty Is Embarrassing', 'Standing in the Shadows of Motown', 'Pulp Fiction']


9




In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
ratings['userId'] = ratings['userId'].astype(np.int32)
ratings['movieId'] = ratings['movieId'].astype(np.int32)
ratings['rating'] = ratings['rating'].astype(np.float32)

merged_data['id'] = merged_data['id'].astype(np.int32)


In [None]:
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

train_merged = pd.merge(train_data, merged_data,
                        left_on='movieId', right_on='id', how='left')
test_merged = pd.merge(test_data, merged_data,
                       left_on='movieId', right_on='id', how='left')


In [None]:
def predict_rating(user_id, movie_id, train_data=train_merged, cosine_sim=cosine_sim):
    if movie_id not in train_data['movieId_x'].values or movie_id not in train_data['movieId_x'].unique():
        return np.nan

    movie_idx_train = train_data[train_data['movieId_x'] == movie_id].index[0]

    movie_idx_sim = train_data['movieId_x'].unique().tolist().index(movie_id)

    if movie_idx_sim < len(cosine_sim):
        sim_scores = cosine_sim[movie_idx_sim]
        similar_indices = np.argsort(sim_scores)[::-1][1:11]
        similar_movies = train_data.iloc[similar_indices]
        user_ratings = similar_movies[similar_movies['userId_x'] == user_id]['rating_x']

        if not user_ratings.empty:
            return user_ratings.mean()
        else:
            return train_data[train_data['movieId_x'] == movie_id]['rating_x'].mean()
    else:
        return np.nan

chunk_size = 1000
num_chunks = int(np.ceil(len(test_merged) / chunk_size))

results = []

for i in range(num_chunks):
    chunk = test_merged.iloc[i*chunk_size:(i+1)*chunk_size].copy()
    chunk['predicted_rating'] = chunk.apply(lambda x: predict_rating(x['userId_x'], x['movieId_x']), axis=1)
    chunk = chunk.dropna(subset=['predicted_rating'])
    results.append(chunk)

final_results = pd.concat(results)

mse = mean_squared_error(final_results['rating_x'], final_results['predicted_rating'])
print(f'MSE: {mse}')
mae = mean_absolute_error(final_results['rating_x'], final_results['predicted_rating'])
print(f'MSE: {mae}')



MSE: 0.9822622192361159
MSE: 0.7646929921818708
