In [25]:
import pandas as pd
import polars as pl
import re
import numpy as np
import ast, os, zipfile
import torch
from pathlib import Path
import pickle
from typing import List
from sklearn.metrics.pairwise import cosine_similarity
from nltk import download
from nltk.stem import PorterStemmer
from collections import Counter
# import concurrent.futures as cf
# import time
import string
# import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Dataset [link here](https://www.kaggle.com/datasets/moon114/imdb-all-movies-dataset)

In [26]:
# nltk.download('punkt')
# nltk.download('stopwords')

In [27]:
if 'colab' in str(get_ipython()):
    print('Running in COlab')

    # download spacy model
    # !python -m spacy download en_core_web_lg -q
    # spacy.load("en_core_web_lg")
    download('stopwords')
    download('punkt')

    # get data
    try:
        ! git clone https://github.com/tikendraw/movie-recommender-system.git
    except:
        pass

    os.chdir('movie-recommender-system')

    with zipfile.ZipFile('imdb all movies dataset.zip') as f:
        f.extractall('dataset/')

In [28]:
# os.chdir('movie-recommender-system')

In [29]:
# Location of the movies CSV, relative to the current working directory.
dataset_filepath = Path('./dataset/movies.csv')

# df = pd.read_csv(dataset_filepath)
# Read the CSV with polars; `df` is a polars DataFrame from here on.
df = pl.read_csv(dataset_filepath)

In [30]:
# Lower-case every column name for ease of typing; later cells refer to
# columns by their lower-case names ('title', 'actor 1', ...).
df.columns = [i.lower() for i in df.columns]

# try:
#     # dropping extra columns if exists
#     df.drop('unnamed: 0', inplace =True, axis = 1)
# except:
#     pass


In [31]:
df.shape, df.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285341 entries, 0 to 285340
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0                285341 non-null  int64  
 1   title        285340 non-null  object 
 2   year         245225 non-null  object 
 3   certificate  72952 non-null   object 
 4   genre        285341 non-null  object 
 5   time         204239 non-null  object 
 6   rating       184313 non-null  float64
 7   rate         239176 non-null  object 
 8   synopsis     285341 non-null  object 
 9   content      285336 non-null  object 
 10  director     269260 non-null  object 
 11  actor 1      253716 non-null  object 
 12  actor 2      246317 non-null  object 
 13  actor 3      239798 non-null  object 
 14  actor 4      230068 non-null  object 
 15  votes        184368 non-null  object 
dtypes: float64(1), int64(1), object(14)
memory usage: 34.8+ MB


((285341, 16), None)

## Objective

We will convert all important features of movies into vectors and then find similarity between movies.

### Clean the data
0. Remove duplicates
1. Remove non-digit characters from the year using regex
2. Remove `\n` characters from genre, synopsis and content using regex
3. Drop the rate column (no information) and the content column (it only repeats the director and actor names, which we already have)
4. Remove spaces from names

In [32]:
def remove_newline_chars(text):
    r"""Drop every `\n` from `text` and trim surrounding whitespace."""
    return text.replace("\n", "").strip()
    

def remove_punctuations(text):
    """Return `text` with all ASCII punctuation deleted and outer whitespace trimmed."""
    # Deletion-only translation table: every character of string.punctuation maps to None.
    punctuation_table = str.maketrans("", "", string.punctuation)
    return text.translate(punctuation_table).strip()
    

def join_names(text):
    """Collapse a name into one token: delete ASCII punctuation and spaces, then trim."""
    # One table deletes punctuation and the space character in a single pass.
    drop_chars = str.maketrans("", "", string.punctuation + ' ')
    return text.translate(drop_chars).strip()
    

def flatten_nested_list(nested_list):
    """Return a flat list of the non-list items found in `nested_list`, at any depth.

    Only `list` instances are descended into; every other value (including
    strings and tuples) is kept as a single item. Order is preserved.
    """
    flat = []
    for element in nested_list:
        if isinstance(element, list):
            flat.extend(flatten_nested_list(element))
        else:
            flat.append(element)
    return flat


def clean_genre(x):
    """Return `str(x)` with every run of whitespace (newlines included) collapsed to one space.

    Leading and trailing whitespace is removed as well.
    """
    # str.split() with no argument splits on any whitespace and never yields
    # empty tokens, so the tokens contain no newlines and need no further
    # per-token cleaning before being re-joined.
    tokens = str(x).split()
    return ' '.join(tokens)


def clean_year(x):
    """Extract the ASCII digits from `x` and return them as an int.

    Despite its name this is also applied to the votes column, where it
    turns comma-grouped counts such as "1,68,355" into 168355.

    Args:
        x: Value to parse. Strings are the expected input; other values are
            converted with `str()` first. `None` is treated as missing.

    Returns:
        The integer formed by the digits of `x` in order, or 0 when `x` is
        `None` or contains no digits. Signs are discarded ("-2023" -> 2023).
    """
    if x is None:
        return 0
    # Keeping only 0-9 already removes punctuation, letters and whitespace,
    # so no separate punctuation pass is needed.
    digits = re.sub(r'[^0-9]', '', str(x))
    return int(digits) if digits else 0

In [33]:
df.head()

Unnamed: 0_level_0,title,year,certificate,genre,time,rating,rate,synopsis,content,director,actor 1,actor 2,actor 3,actor 4,votes
i64,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str
0,"""Indiana Jones …","""-2023""","""PG-13""",""" Action, Adven…","""154 min""",6.9,"""Rate this""",""" Archaeologist…","""  Director:…","""James Mangold""","""Harrison Ford""","""Phoebe Waller-…","""Antonio Bander…","""Karen Allen""","""41,029"""
1,"""The Flash""","""-2023""","""PG-13""",""" Action, Adven…","""144 min""",7.2,"""Rate this""",""" Barry Allen u…","""  Director:…","""Andy Muschiett…","""Ezra Miller""","""Michael Keaton…","""Sasha Calle""","""Michael Shanno…","""80,814"""
2,"""Spider-Man: Ac…","""-2023""","""PG""",""" Animation, Ac…","""140 min""",8.9,"""Rate this""",""" Miles Morales…","""  Directors…","""Joaquim Dos Sa…","""Kemp Powers""","""Justin K. Thom…","""Shameik Moore""","""Hailee Steinfe…","""1,68,355"""
3,"""Extraction II""","""-2023""","""R""",""" Action, Thril…","""122 min""",7.1,"""Rate this""",""" After barely …","""  Director:…","""Sam Hargrave""","""Chris Hemswort…","""Golshifteh Far…","""Adam Bessa""","""Tornike Gogric…","""81,123"""
4,"""Avatar: The Wa…","""-2022""","""PG-13""",""" Action, Adven…","""192 min""",7.6,"""Rate this""",""" Jake Sully li…","""  Director:…","""James Cameron""","""Sam Worthingto…","""Zoe Saldana""","""Sigourney Weav…","""Stephen Lang""","""4,24,931"""


In [34]:
# This df is for filtering, and should have clean values
ddf = df[['title', 'certificate', 'genre', 'rating', 'year','votes']]

In [35]:
# Add a sequential integer id (0 .. len(ddf) - 1), assigned in the current row order.
ddf= ddf.with_columns([
    pl.Series(name="movies_id", values=range(len(ddf)))
])

In [36]:
ddf.shape

(285341, 7)

In [37]:
ddf.null_count()

title,certificate,genre,rating,year,votes,movies_id
u32,u32,u32,u32,u32,u32,u32
1,212389,0,101028,40116,100973,0


In [38]:
ddf.head(1)

title,certificate,genre,rating,year,votes,movies_id
str,str,str,f64,str,str,i64
"""Indiana Jones …","""PG-13""",""" Action, Adven…",6.9,"""-2023""","""41,029""",0


In [39]:
# Drop the rows whose title is null.
ddf = ddf.drop_nulls(subset=['title'])
# Re-number movies_id after the drop so the ids are contiguous (0 .. len(ddf) - 1)
# and line up with the ids built in the preprocessing section, where df's
# movies_id is assigned only after its null-title rows have been dropped.
ddf = ddf.with_columns([
    pl.Series(name="movies_id", values=range(len(ddf)))
])
ddf.shape

(285340, 7)

In [40]:
# Replace nulls with per-column defaults.
# year and votes are still string columns here (see the dtypes printed by
# ddf.head above), so they are filled with the string '0' rather than the
# integer 0; clean_year parses that to the integer 0 afterwards.
ddf = ddf.with_columns([
    pl.col('year').fill_null('0'),
    pl.col('certificate').fill_null('Unavailable'),
    pl.col('rating').fill_null(0.1),
    pl.col('votes').fill_null('0'),
])

In [41]:
ddf.head(4)

title,certificate,genre,rating,year,votes,movies_id
str,str,str,f64,str,str,i64
"""Indiana Jones …","""PG-13""",""" Action, Adven…",6.9,"""-2023""","""41,029""",0
"""The Flash""","""PG-13""",""" Action, Adven…",7.2,"""-2023""","""80,814""",1
"""Spider-Man: Ac…","""PG""",""" Animation, Ac…",8.9,"""-2023""","""1,68,355""",2
"""Extraction II""","""R""",""" Action, Thril…",7.1,"""-2023""","""81,123""",3


In [42]:
# Parse the string columns into their final form:
#   year, votes -> int via clean_year (it keeps only the digits, so it also
#                  handles comma-grouped vote counts such as "1,68,355")
#   genre       -> whitespace-normalised string via clean_genre
ddf = ddf.with_columns([
        pl.col('year').apply(clean_year),
        pl.col('votes').apply(clean_year),
        pl.col('genre').apply(clean_genre)
])

In [43]:
# Distinct certificate values.
u_categories = ddf['certificate'].unique().to_list()
# Each genre cell is one comma-separated string ("Action, Adventure, ..."),
# so split it to collect the individual genre names. Flattening the list of
# strings was a no-op and produced whole combinations instead. Sorting makes
# the result deterministic between runs.
u_genre = sorted({
    name.strip()
    for genres in ddf['genre'].to_list()
    for name in genres.split(',')
    if name.strip()
})
# First 50 titles (in dataset order) rated at least 8.0 with at least 10,000 votes.
u_movies_latest = ddf.filter((pl.col('rating')>=8.0) & (pl.col('votes')>=10_000))['title'].to_list()[:50]
# Distinct year values.
u_year = ddf['year'].unique().to_list()


In [44]:

# Bundle the lookup lists built above into a single dict.
movie_info = {'movie_list': u_movies_latest,
              'categories': u_categories, 
              'genre'     : u_genre,
              'year'      : u_year,
             
             }

In [45]:
# Persist the lookup dict. The context manager closes (and therefore flushes)
# the file handle, which a bare open(...) passed to pickle.dump never did.
with open('movie_info.pkl', 'wb') as f:
    pickle.dump(movie_info, f)

In [46]:
ddf.head()
# ddf.null_count()

title,certificate,genre,rating,year,votes,movies_id
str,str,str,f64,i64,i64,i64
"""Indiana Jones …","""PG-13""","""Action, Advent…",6.9,2023,41029,0
"""The Flash""","""PG-13""","""Action, Advent…",7.2,2023,80814,1
"""Spider-Man: Ac…","""PG""","""Animation, Act…",8.9,2023,168355,2
"""Extraction II""","""R""","""Action, Thrill…",7.1,2023,81123,3
"""Avatar: The Wa…","""PG-13""","""Action, Advent…",7.6,2022,424931,4


In [47]:
ddf.write_csv(Path('./dataset/movies_clean_final.csv'))

# Preprocessing

In [16]:
# check for duplicates
df.is_duplicated().sum()

0

## Removing Nulls

In [None]:
# check for nulls
df.null_count()

The dataset has a lot of missing values, but that won't stop us from achieving our purpose.

The most important fields for a movie here are its title and synopsis. We will fill the remaining nulls with empty strings and drop the rows that don't have a title or synopsis.

In [None]:
df.shape

In [None]:
# Drop the rows whose title is null.
df = df.drop_nulls(subset=["title"])

In [None]:
df.shape

## Dropping Columns

The content column only contains the director's name and the four actors' names, which we already have as separate columns, so we drop it.

In [None]:
# Drop the columns we do not use: 'rate', 'content', and the column whose
# name is the empty string (listed as column 0 in the info() output above).
df = df.drop(['rate', 'content', ''])

In [None]:
df.shape

## Column preprocessing

In [None]:
a = 'heloo  909)(*)()(&3gg '
remove_punctuations(a)

In [None]:
# The dataset has no movie id, so create a sequential one (0 .. len(df) - 1)
# in the current row order.
# find_similar_movies later uses movies_id as the row index into `vector`,
# so rows must not be reordered or dropped between here and the vectorizer.
df= df.with_columns([
    pl.Series(name="movies_id", values=range(len(df)))
])

In [None]:
# Strip every non-digit character from the numeric-looking string columns.
# The columns stay strings at this point; a value with no digits at all
# becomes the empty string ''.
df = df.with_columns([
    pl.col('year').str.replace_all(r'[^0-9]', '').alias('year'),
    pl.col('time').str.replace_all(r'[^0-9]', '').alias('time'),
    pl.col('votes').str.replace_all(r'[^0-9]', '').alias('votes'),
])

In [None]:
# Fill null votes with the string '0' (the column is still a string here),
# then cast string -> Float64 -> Int64. Votes are needed later to rank
# popular movies.
df = df.with_columns(pl.col('votes').fill_null('0'))
df = df.with_columns(pl.col('votes').cast(pl.Float64))
df = df.with_columns(pl.col('votes').cast(pl.Int64))

In [None]:
# Remove ASCII punctuation (and surrounding whitespace) from the certificate
# and synopsis text via remove_punctuations.
df = df.with_columns([
    pl.col('certificate').apply(remove_punctuations),
    pl.col('synopsis').apply(remove_punctuations),

])

In [None]:
df.head()

In [None]:
# Convert year from a digits-only string column to Int64.
# Null entries and empty strings (values that contained no digits) become 0.
# The column is a string here, so the default must be the string '0';
# filling with the integer 0 or calling str.replace with an empty pattern
# and an integer replacement does not express "replace empty values".
df = df.with_columns(
    pl.when(pl.col('year').is_null() | (pl.col('year') == ''))
    .then(pl.lit('0'))
    .otherwise(pl.col('year'))
    .cast(pl.Int64)
    .alias('year')
)

In [None]:
# rating is read as Float64 (see the dtypes printed after loading), so the
# string namespace cannot be used on it. Only the nulls need replacing;
# the cast is kept as a harmless guard on the final dtype.
df = df.with_columns(pl.col('rating').fill_null(0.0).cast(pl.Float64))

In [None]:
# Convert time (runtime, a digits-only string at this point) to Int64.
#   null -> 100 (the default the original fill used for missing runtimes)
#   ''   -> 0   (value that contained no digits)
# The defaults are string literals because the column is still a string
# until the final cast.
df = df.with_columns(
    pl.when(pl.col('time').is_null())
    .then(pl.lit('100'))
    .when(pl.col('time') == '')
    .then(pl.lit('0'))
    .otherwise(pl.col('time'))
    .cast(pl.Int64)
    .alias('time')
)

In [None]:
df.null_count()

In [None]:
df.head()

In [None]:
# df["genre"] = df["genre"].apply(remove_newline_chars)
# df["synopsis"] = df["synopsis"].apply(remove_newline_chars)


# Remove "\n" characters and surrounding whitespace from genre and synopsis
# via remove_newline_chars.
df = df.with_columns([
    pl.col('genre').apply(remove_newline_chars),
    pl.col('synopsis').apply(remove_newline_chars)
])

In [None]:
df.head()

In [None]:
df['genre'][0]

In [None]:
#removing space bw name and surname

In [None]:
# Collapse each person's name into a single token with join_names
# (punctuation and spaces removed, e.g. "Harrison Ford" -> "HarrisonFord"),
# presumably so a full name counts as one word when the tags are vectorized.
# genre is whitespace-normalised with clean_genre in the same pass.
df = df.with_columns([
    pl.col('director').apply(join_names),
    pl.col('actor 1' ).apply(join_names),
    pl.col('actor 2' ).apply(join_names),
    pl.col('actor 3' ).apply(join_names),
    pl.col('actor 4' ).apply(join_names),
    pl.col('genre'   ).apply(clean_genre),
])


In [None]:
df.head()

## Joining Names and Surnames

In [None]:
''' I wanted to join all names in the data, but it crashes and requires more RAM,
'''

In [None]:
# def concatenate_names(sentence):
#     nlp = spacy.load("en_core_web_lg")
#     ner_dict = dict()

#     # Extract names from the sentence using NER
#     doc = nlp(sentence)

#     for ent in doc.ents:
#         if ent.label_ == "PERSON":
#             name = ent.text
#             new_name = name.replace(' ','')
#             ner_dict[name] = new_name

#     # Replace names with names without spaces in the sentence
#     for i, j in ner_dict.items():
#         sentence = sentence.replace(i,j)

#     return sentence

# # Test the function
# # sentence = "Ryan Holiday is an author, and Ryan is famous for his books."
# sentence = 'my name is dick greyson and this is my girifriend harley quinn'
# result = concatenate_names(sentence)
# print(result)


In [None]:
# all_synopsis = df['synopsis'].to_list()

In [None]:
# all_syn = ' <sos> '.join(all_synopsis)

In [None]:
# df  = df.with_columns(
#     pl.col('synopsis').apply(concatenate_names).alias('new')
# )

In [None]:
# start_time = time.perf_counter()
# print('Strated :',  time.strftime("%H:%M:%S", time.localtime()))

# with cf.ProcessPoolExecutor() as executer:
#     dodo = [executer.submit(concatenate_names, i) for i in all_synopsis]
#     # clean_tags = dodo.result()

# print('Took: ',time.perf_counter()-start_time)
# print('Finished :',  time.strftime("%H:%M:%S", time.localtime()))


# Create a Tag Column
Combining all data

In [None]:
df.to_pandas().info()

In [None]:

# df = df.fill_nan('')
df = df.fill_null('')


In [None]:
# Add string copies of the numeric columns; the *_as_str columns are the
# ones joined into the 'tags' text, while the numeric originals are kept.
df = df.with_columns(
    pl.col('year').cast(pl.Utf8).alias('year_as_str'),
    pl.col('rating').cast(pl.Utf8).alias('rating_as_str'),
    pl.col('time').cast(pl.Utf8).alias('time_as_str')
)

In [None]:
print(df.columns)

In [None]:
cols_to_combine = ['title', 'year_as_str', 'certificate', 'genre', 'time_as_str', 'rating_as_str', 'synopsis', 
                   'director', 'actor 1', 'actor 2', 'actor 3', 'actor 4', ]

In [None]:
# Build the 'tags' column: for every row, join the values of cols_to_combine
# with single spaces into one string.
# NOTE(review): ' '.join needs every selected value to be a str, which is
# presumably why nulls are filled with '' and the numeric columns are cast
# to strings in the cells above - confirm no nulls remain in these columns.
df = df.with_columns(pl.Series('tags', values = df[cols_to_combine].apply(' '.join).to_series()))


In [None]:
df.head()

In [None]:
df = df.with_columns(pl.col('tags').apply(remove_punctuations))

In [None]:
# Create a PorterStemmer object
stemmer = PorterStemmer()


def _stem_words(text):
    """Stem every whitespace-separated word of `text` and re-join with single spaces."""
    # PorterStemmer.stem operates on ONE word. Passing the whole tags string
    # made it treat the entire text as a single word, so the individual
    # words were never stemmed.
    return ' '.join(stemmer.stem(word) for word in text.split())


# Apply the stemming function to each word of every tags string
df = df.with_columns(pl.col('tags').apply(_stem_words))


In [None]:
dff = df[['title', 'movies_id', 'tags', 'votes', 'rating']]

In [None]:
dff.head()

In [None]:
# dff['tags'] = dff['tags'].str.split()
# dff = dff.with_columns(pl.col('tags').str.split(' '))


In [None]:
all_words = dff['tags'].to_list()

In [None]:
# NOTE(review): all_words is a list of tag strings (one per movie), not a
# list of lists, so flatten_nested_list returns it unchanged and the counts
# in the next cell are per tag string, not per word. If per-word counts are
# wanted, the strings would need to be split first - confirm the intent.
all_wordss = flatten_nested_list(all_words)

In [None]:
len(all_wordss), len(set(all_wordss))

In [None]:
from sklearn.feature_extraction.text import  CountVectorizer

In [None]:
cv = CountVectorizer(lowercase=True, stop_words='english', max_features=5000, dtype=np.int16 )

In [None]:
vector = cv.fit_transform(dff['tags'].to_list())

In [None]:
vector

In [None]:
vector[2]

In [None]:
# pickle.dump(movie_vector, open('movie_vector.pkl','wb'))  # creates file of 2.6 gb
# Persist the sparse count matrix. The context manager closes (and therefore
# flushes) the file handle, which a bare open(...) passed to pickle.dump never did.
with open('vector.pkl', 'wb') as f:
    pickle.dump(vector, f)

In [None]:
dff.to_pandas().info()

In [None]:
def find_similar_movies(x:str, k = 5) -> List[str]:
    """Return the titles of the `k` movies most similar to the movie titled `x`.

    Similarity is the cosine similarity between rows of the module-level
    `vector` matrix. The function relies on `movies_id` in `dff` being the
    row index of that movie in `vector`.

    Args:
        x: Exact title to look up in `dff['title']`. When several movies
            share the title, the first match is used.
        k: Number of recommendations to return.

    Returns:
        Up to `k` titles, most similar first. The query movie itself is
        never included.

    Raises:
        IndexError: If no movie in `dff` has the title `x`.
    """
    matches = dff.filter(pl.col('title') == x)['movies_id'].to_list()
    if not matches:
        # Same exception type the bare `[0]` lookup used to raise, but with
        # a message that says what went wrong.
        raise IndexError(f'No movie titled {x!r} found')
    movie_id = matches[0]

    # cosine_similarity returns an (n_movies, 1) array; flatten to 1-D scores.
    scores = cosine_similarity(vector, vector[movie_id]).ravel()

    # Rank in numpy instead of sorting Python tuples of 1-element arrays.
    # The stable sort keeps dataset order among equal scores.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query movie explicitly; it is not guaranteed to rank first
    # when another movie has identical tags.
    recommended_movies_ids = ranked[ranked != movie_id][:k]

    return [
        dff.filter(pl.col('movies_id') == int(i))['title'].to_list()[0]
        for i in recommended_movies_ids
    ]

In [None]:
find_similar_movies(
 'Killing a Traitor',
k = 10)

In [None]:
dff['title'].sample(50).to_list()

In [None]:
df.sort(by=['votes','rating'], descending=True)

In [None]:
dff.write_csv(Path('./dataset/movies_cleaned.csv'))