In [3]:
from google.colab import drive, files
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
from google.colab import files
files.upload()



Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"sxwmya","key":"d6915e0187992d17d63a2a7c67cf6fee"}'}

In [6]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [7]:
!kaggle datasets download -d rounakbanik/the-movies-dataset


Downloading the-movies-dataset.zip to /content
 97% 221M/228M [00:01<00:00, 206MB/s]
100% 228M/228M [00:01<00:00, 221MB/s]


In [8]:
!unzip the-movies-dataset.zip -d /content/data
!rm the-movies-dataset.zip


Archive:  the-movies-dataset.zip
  inflating: /content/data/credits.csv  
  inflating: /content/data/keywords.csv  
  inflating: /content/data/links.csv  
  inflating: /content/data/links_small.csv  
  inflating: /content/data/movies_metadata.csv  
  inflating: /content/data/ratings.csv  
  inflating: /content/data/ratings_small.csv  


In [11]:
!pip install --quiet fastparquet
!pip install --quiet pyarrow


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
%matplotlib inline
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
import pyarrow as pa
import pyarrow.parquet as pq
import warnings
warnings.simplefilter('ignore')


In [13]:
def get_director(x):

    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


In [14]:
# Reading datasets
movies_dataset = pd.read_csv('/content/data/movies_metadata.csv')
credits = pd.read_csv('/content/data/credits.csv')
keywords = pd.read_csv('/content/data/keywords.csv')
links = pd.read_csv('/content/data/links.csv')

# Dropping three rows with string dates in the release_date column
movies_dataset = movies_dataset.drop([19730, 29503, 35587])

# Extracting genres of movies from the genres dictionary
movies_dataset['genres'] = movies_dataset['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

# Convert to a common data type for the primary key in our dataset
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies_dataset['id'] = movies_dataset['id'].astype('int')

# Merging movies dataset with credits & keywords to form a master dataset
movies_dataset = movies_dataset.merge(credits, on='id')
master_dataset = movies_dataset.merge(keywords, on='id')

# Filtering master dataset based on links
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')
master_dataset = master_dataset[master_dataset['id'].isin(links)]

# Displaying the shape of the resulting master dataset
print(master_dataset.shape)


(46628, 27)


In [15]:
master_dataset['cast'] = master_dataset['cast'].apply(literal_eval)
master_dataset['crew'] = master_dataset['crew'].apply(literal_eval)
master_dataset['keywords'] = master_dataset['keywords'].apply(literal_eval)


In [16]:
master_dataset['cast'] = master_dataset['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
master_dataset['cast'] = master_dataset['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)


In [17]:
master_dataset['director'] = master_dataset['crew'].apply(get_director)


In [19]:
master_dataset['cast'] = master_dataset['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
master_dataset['main_director'] = master_dataset['director']
master_dataset['director'] = master_dataset['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
master_dataset['director'] = master_dataset['director'].apply(lambda x: [x,x,x])
s = master_dataset.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
print(s[:5])


{'id': 187056, 'name': 'woman director'}     3128
{'id': 10183, 'name': 'independent film'}    1942
{'id': 9826, 'name': 'murder'}               1314
{'id': 818, 'name': 'based on novel'}         841
{'id': 4344, 'name': 'musical'}               734
Name: keyword, dtype: int64


In [21]:
# Checking and handling the case where 'keywords' column contains dictionaries
master_dataset['keywords'] = master_dataset['keywords'].apply(lambda x: [i['name'] if isinstance(i, dict) else i for i in x])

# Applying stemming and lowercase operations
master_dataset['keywords'] = master_dataset['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
master_dataset['keywords'] = master_dataset['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

print(master_dataset['keywords'].head(3))



0    [jealousi, toy, boy, friendship, friend, rival...
1    [boardgam, disappear, basedonchildren'sbook, n...
2       [fish, bestfriend, duringcreditssting, oldmen]
Name: keywords, dtype: object


In [22]:
master_dataset['soup'] = master_dataset['keywords'] + master_dataset['cast'] + master_dataset['director'] + master_dataset['genres']
master_dataset['soup'] = master_dataset['soup'].apply(lambda x: ' '.join(x))
print(master_dataset['soup'].head(3))


0    jealousi toy boy friendship friend rivalri boy...
1    boardgam disappear basedonchildren'sbook newho...
2    fish bestfriend duringcreditssting oldmen walt...
Name: soup, dtype: object


In [23]:
master_dataset.drop(['adult', 'belongs_to_collection', 'budget', 'homepage', 'original_language', 'production_companies', 'production_countries', 'revenue', 'runtime', 'spoken_languages', 'status', 'video'], axis=1, inplace=True)
master_dataset.drop(['overview', 'tagline', 'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director'], axis=1, inplace=True)
master_dataset.drop(['id', 'imdb_id', 'original_title', 'poster_path', 'genres'], axis=1, inplace=True)


In [24]:
master_dataset['popularity'] = master_dataset.apply(lambda r: r['popularity'] if type(r['popularity']) == float else np.nan, axis=1)
master_dataset.dropna(inplace=True)


In [25]:
master_dataset['main_director'] = master_dataset.apply(lambda r: r['main_director'] if len(r['main_director']) > 1 else np.nan, axis=1)
master_dataset.dropna(inplace=True)


In [27]:
master_dataset.sort_values(by=['popularity'], ascending=False, inplace=True)

# Drop 'popularity' column after sorting
master_dataset.drop(['popularity'], axis=1, inplace=True)

# Reset index after sorting
master_dataset.reset_index(inplace=True, drop=True)

# Check 'release_date' column for non-string values and remove them
master_dataset['release_date'] = master_dataset.apply(lambda r: r['release_date'] if len(r['release_date']) > 1 else np.nan, axis=1)
master_dataset.dropna(inplace=True)


In [28]:
master_dataset = master_dataset[:2500]
print(master_dataset.head())

  release_date                           title  \
0   2015-06-17                         Minions   
1   2014-10-24                      Big Hero 6   
2   2016-02-09                        Deadpool   
3   2017-04-19  Guardians of the Galaxy Vol. 2   
4   2009-12-10                          Avatar   

                                   main_director  \
0              [kylebalda, kylebalda, kylebalda]   
1  [chriswilliams, chriswilliams, chriswilliams]   
2              [timmiller, timmiller, timmiller]   
3              [jamesgunn, jamesgunn, jamesgunn]   
4     [jamescameron, jamescameron, jamescameron]   

                                                soup  
0  assist aftercreditssting duringcreditssting ev...  
1  brotherbrotherrelationship hero talent reveng ...  
2  antihero mercenari marvelcom superhero basedon...  
3  sequel superhero basedoncom misfit space outer...  
4  cultureclash futur spacewar spacecoloni societ...  


In [29]:
print(master_dataset.shape)

(2500, 4)


In [30]:
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2, stop_words='english')
count_matrix = count.fit_transform(master_dataset['soup'])
print(count_matrix.shape)


(2500, 7277)


In [31]:
table = pa.Table.from_pandas(pd.DataFrame(cosine_similarity(count_matrix, count_matrix)))
master_dataset.to_parquet('/content/movie_database.parquet', engine='fastparquet', index=False)
pq.write_table(table, '/content/model.parquet')


In [36]:
import pandas as pd
import pyarrow as pa

# Loading the trained model and master dataset
master_dataset = pd.read_parquet('/content/movie_database.parquet')
table = pa.parquet.read_table('/content/model.parquet').to_pandas()

# Resetting the index and creating indices
master_dataset = master_dataset.reset_index()
titles = master_dataset['title']
indices = pd.Series(master_dataset.index, index=master_dataset['title'])

# Function for getting movie recommendations
def get_recommendations(movie_id_from_db, movie_db):
    try:
        sim_scores = list(enumerate(movie_db[movie_id_from_db]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:15]  # get top 15 Recommendations
        movie_indices = [i[0] for i in sim_scores]
        output = master_dataset.iloc[movie_indices]
        output.reset_index(inplace=True, drop=True)
        response = []
        for i in range(len(output)):
            response.append({
                'movie_title': output['title'].iloc[i],
                'movie_release_date': output['release_date'].iloc[i],
                'movie_director': output['main_director'].iloc[i],
                'google_link': "https://www.google.com/search?q=" + '+'.join(output['title'].iloc[i].strip().split())
            })
        return response
    except Exception as e:
        print("error: ", e)
        return []

# User input and displaying recommendations
movie_name = input('Enter a movie Name: ')
movie_index = titles.to_list().index(movie_name)
recommendations = get_recommendations(movie_index, table)
# Displaying recommendations
print(f"{'Movie Title':<40} | {'Director':<20} | {'Release Date':<15}")
print(f"-"*80)
for recommendation in recommendations:
    print("{:<40} | {:<20} | {:<15}".format(
        str(recommendation['movie_title']),
        str(recommendation['movie_director']),
        str(recommendation['movie_release_date'])
    ))





Enter a movie Name: Toy Story
Movie Title                              | Director             | Release Date   
--------------------------------------------------------------------------------
Toy Story 2                              | b'["johnlasseter","johnlasseter","johnlasseter"]' | 1999-10-30     
Cars 2                                   | b'["johnlasseter","johnlasseter","johnlasseter"]' | 2011-06-11     
A Bug's Life                             | b'["johnlasseter","johnlasseter","johnlasseter"]' | 1998-11-25     
Cars                                     | b'["johnlasseter","johnlasseter","johnlasseter"]' | 2006-06-08     
Oliver & Company                         | b'["georgescribner","georgescribner","georgescribner"]' | 1988-11-18     
Toy Story 3                              | b'["leeunkrich","leeunkrich","leeunkrich"]' | 2010-06-16     
Dug's Special Mission                    | b'["ronniedelcarmen","ronniedelcarmen","ronniedelcarmen"]' | 2009-11-09     
Leroy & Stitch       

In [39]:
from google.colab import files

# Download the saved model file
files.download('your_model.joblib')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>