In [None]:
import pandas as pd
import polars as pl
import re
import numpy as np
import ast, os, zipfile
import torch
from pathlib import Path
import pickle
from typing import List
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer
import concurrent.futures as cf
import time
import string
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Dataset [link here](https://www.kaggle.com/datasets/moon114/imdb-all-movies-dataset)

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
if 'colab' in str(get_ipython()):
    print('Running in COlab')

    # download spacy model
    !python -m spacy download en_core_web_lg -q
    spacy.load("en_core_web_lg")
    nltk.download('stopwords')
    nltk.download('punkt')

    # get data
    try:
        ! git clone https://github.com/tikendraw/movie-recommender-system.git
    except:
        pass

    os.chdir('movie-recommender-system')

    with zipfile.ZipFile('imdb all movies dataset.zip') as f:
        f.extractall('dataset/')

Running in COlab
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
fatal: destination path 'movie-recommender-system' already exists and is not an empty directory.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
os.chdir('movie-recommender-system')

In [None]:
dataset_filepath = Path('./dataset/movies.csv')

# df = pd.read_csv(dataset_filepath)
df = pl.read_csv(dataset_filepath)

In [None]:
#lower the column names for ease of typing
df.columns = [i.lower() for i in df.columns]

# try:
#     # dropping extra columns if exists
#     df.drop('unnamed: 0', inplace =True, axis = 1)
# except:
#     pass


In [None]:
df.shape, df.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285341 entries, 0 to 285340
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0                285341 non-null  int64  
 1   title        285340 non-null  object 
 2   year         245225 non-null  object 
 3   certificate  72952 non-null   object 
 4   genre        285341 non-null  object 
 5   time         204239 non-null  object 
 6   rating       184313 non-null  float64
 7   rate         239176 non-null  object 
 8   synopsis     285341 non-null  object 
 9   content      285336 non-null  object 
 10  director     269260 non-null  object 
 11  actor 1      253716 non-null  object 
 12  actor 2      246317 non-null  object 
 13  actor 3      239798 non-null  object 
 14  actor 4      230068 non-null  object 
 15  votes        184368 non-null  object 
dtypes: float64(1), int64(1), object(14)
memory usage: 34.8+ MB


((285341, 16), None)

## Objective

We will convert all important features of movies into vectors and then find similarity between movies.

### Clean the data
0. remove Duplicates
1. Remove the alpha chars from years using regex
2. Remove \n chars from genres and synopsis, content using regex
3. Remove rate column(no info), drop content(contains director and actors name which we already have)
4. Remove spaces from names

In [None]:
df.head()

Unnamed: 0_level_0,title,year,certificate,genre,time,rating,rate,synopsis,content,director,actor 1,actor 2,actor 3,actor 4,votes
i64,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str
0,"""Indiana Jones …","""-2023""","""PG-13""",""" Action, Adven…","""154 min""",6.9,"""Rate this""",""" Archaeologist…","""  Director:…","""James Mangold""","""Harrison Ford""","""Phoebe Waller-…","""Antonio Bander…","""Karen Allen""","""41,029"""
1,"""The Flash""","""-2023""","""PG-13""",""" Action, Adven…","""144 min""",7.2,"""Rate this""",""" Barry Allen u…","""  Director:…","""Andy Muschiett…","""Ezra Miller""","""Michael Keaton…","""Sasha Calle""","""Michael Shanno…","""80,814"""
2,"""Spider-Man: Ac…","""-2023""","""PG""",""" Animation, Ac…","""140 min""",8.9,"""Rate this""",""" Miles Morales…","""  Directors…","""Joaquim Dos Sa…","""Kemp Powers""","""Justin K. Thom…","""Shameik Moore""","""Hailee Steinfe…","""1,68,355"""
3,"""Extraction II""","""-2023""","""R""",""" Action, Thril…","""122 min""",7.1,"""Rate this""",""" After barely …","""  Director:…","""Sam Hargrave""","""Chris Hemswort…","""Golshifteh Far…","""Adam Bessa""","""Tornike Gogric…","""81,123"""
4,"""Avatar: The Wa…","""-2022""","""PG-13""",""" Action, Adven…","""192 min""",7.6,"""Rate this""",""" Jake Sully li…","""  Director:…","""James Cameron""","""Sam Worthingto…","""Zoe Saldana""","""Sigourney Weav…","""Stephen Lang""","""4,24,931"""


# Preprocessing

In [None]:
# check for duplicates
df.is_duplicated().sum()

0

## Removing Nulls

In [None]:
# check for nulls
df.null_count()

Unnamed: 0_level_0,title,year,certificate,genre,time,rating,rate,synopsis,content,director,actor 1,actor 2,actor 3,actor 4,votes
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,1,40116,212389,0,81102,101028,46165,0,5,16081,31625,39024,45543,55273,100973


Here dataset sets has alot of missing values, That won't stop us from over purpose.

The Most Important factor for Movie here is to have a title and content/synopsis. We will fill nans with space. and drop those who doesn't have title or content/synopsis.

In [None]:
idx_to_drop  = []

In [None]:
df.shape

(285341, 16)

In [None]:
# movie index with no title
df = df.drop_nulls(subset=["title"])

In [None]:
df.shape

(285340, 16)

In [None]:
df.columns

['',
 'title',
 'year',
 'certificate',
 'genre',
 'time',
 'rating',
 'rate',
 'synopsis',
 'content',
 'director',
 'actor 1',
 'actor 2',
 'actor 3',
 'actor 4',
 'votes']

## Dropping Columns

Content column only contains 4 actors name and director's name  which we already have as columns, so dropping it

In [None]:
#dropping columns
df = df.drop(['rate', 'content', ''])

In [None]:
df.shape

(285340, 13)

## Column preprocessing

In [None]:
# Movies doesn't have any id, we will create id
df= df.with_columns([
pl.Series(name="movies_id", values=range(len(df)))
    ])

In [None]:
# cleaning number columns
df = df.with_columns([
pl.col('year').str.replace_all(r'[^0-9]', '').alias('year'),
pl.col('time').str.replace_all(r'[^0-9]', '').alias('time'),
pl.col('votes').str.replace_all(r'[^0-9]', '').alias('votes'),
# df['rating'] = df['rating'].str.replace(r'[^\D]', '', regex=True)
])

In [None]:
# filling 0 inplace of Nans, in votes number column (we need this for popular movies)
df = df.with_columns([
    pl.col('votes').fill_null('0')
])

df = df.with_columns(
    pl.col('votes').cast(pl.Float64)
)

df = df.with_columns(
    pl.col('votes').cast(pl.Int64)
)



In [None]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
str,str,str,str,str,f64,str,str,str,str,str,str,i64,i64
"""Indiana Jones …","""2023""","""PG-13""",""" Action, Adven…","""154""",6.9,""" Archaeologist…","""James Mangold""","""Harrison Ford""","""Phoebe Waller-…","""Antonio Bander…","""Karen Allen""",41029,0
"""The Flash""","""2023""","""PG-13""",""" Action, Adven…","""144""",7.2,""" Barry Allen u…","""Andy Muschiett…","""Ezra Miller""","""Michael Keaton…","""Sasha Calle""","""Michael Shanno…",80814,1
"""Spider-Man: Ac…","""2023""","""PG""",""" Animation, Ac…","""140""",8.9,""" Miles Morales…","""Joaquim Dos Sa…","""Kemp Powers""","""Justin K. Thom…","""Shameik Moore""","""Hailee Steinfe…",168355,2
"""Extraction II""","""2023""","""R""",""" Action, Thril…","""122""",7.1,""" After barely …","""Sam Hargrave""","""Chris Hemswort…","""Golshifteh Far…","""Adam Bessa""","""Tornike Gogric…",81123,3
"""Avatar: The Wa…","""2022""","""PG-13""",""" Action, Adven…","""192""",7.6,""" Jake Sully li…","""James Cameron""","""Sam Worthingto…","""Zoe Saldana""","""Sigourney Weav…","""Stephen Lang""",424931,4


In [None]:
# filling 0 inplace of Nans, in votes number column (we need this for popular movies)
# df['year'] = pd.to_numeric(df['year'] )
# df['year']=df.year.fillna(0)
# df['year'] = df['year'].astype('int')

df = df.with_columns([
    pl.col('year').fill_null('0')
])

df = df.with_columns([
    pl.col('year').str.replace('', 0)
])

df = df.with_columns(
    pl.col('year').cast(pl.Float64)
)

df = df.with_columns(
    pl.col('year').cast(pl.Int64)
)



In [None]:
def remove_newline_chars(text):
    """Removes `\n` characters from a text string."""
    text_str = re.sub(r"\n", "", text)
    return text_str

In [None]:
# df["genre"] = df["genre"].apply(remove_newline_chars)
# df["synopsis"] = df["synopsis"].apply(remove_newline_chars)


df = df.with_columns([
    pl.col('genre').apply(remove_newline_chars),
    pl.col('synopsis').apply(remove_newline_chars)
])

In [None]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
str,i64,str,str,str,f64,str,str,str,str,str,str,i64,i64
"""Indiana Jones …",2023,"""PG-13""","""Action, Advent…","""154""",6.9,"""Archaeologist …","""James Mangold""","""Harrison Ford""","""Phoebe Waller-…","""Antonio Bander…","""Karen Allen""",41029,0
"""The Flash""",2023,"""PG-13""","""Action, Advent…","""144""",7.2,"""Barry Allen us…","""Andy Muschiett…","""Ezra Miller""","""Michael Keaton…","""Sasha Calle""","""Michael Shanno…",80814,1
"""Spider-Man: Ac…",2023,"""PG""","""Animation, Act…","""140""",8.9,"""Miles Morales …","""Joaquim Dos Sa…","""Kemp Powers""","""Justin K. Thom…","""Shameik Moore""","""Hailee Steinfe…",168355,2
"""Extraction II""",2023,"""R""","""Action, Thrill…","""122""",7.1,"""After barely s…","""Sam Hargrave""","""Chris Hemswort…","""Golshifteh Far…","""Adam Bessa""","""Tornike Gogric…",81123,3
"""Avatar: The Wa…",2022,"""PG-13""","""Action, Advent…","""192""",7.6,"""Jake Sully liv…","""James Cameron""","""Sam Worthingto…","""Zoe Saldana""","""Sigourney Weav…","""Stephen Lang""",424931,4


In [None]:
#removing space bw name and surname

In [None]:
df = df.with_columns([pl.col('director').str.replace(' ', ''),
pl.col('actor 1' ).str.replace(' ', ''),
pl.col('actor 2' ).str.replace(' ', ''),
pl.col('actor 3' ).str.replace(' ', ''),
pl.col('actor 4' ).str.replace(' ', ''),
pl.col('genre'   ).str.replace(' ', ''),
])

df= df.with_columns(pl.col('genre'   ).str.replace(',', ', '))

In [None]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
str,i64,str,str,str,f64,str,str,str,str,str,str,i64,i64
"""Indiana Jones …",2023,"""PG-13""","""Action, Advent…","""154""",6.9,"""Archaeologist …","""JamesMangold""","""HarrisonFord""","""PhoebeWaller-B…","""AntonioBandera…","""KarenAllen""",41029,0
"""The Flash""",2023,"""PG-13""","""Action, Advent…","""144""",7.2,"""Barry Allen us…","""AndyMuschietti…","""EzraMiller""","""MichaelKeaton""","""SashaCalle""","""MichaelShannon…",80814,1
"""Spider-Man: Ac…",2023,"""PG""","""Animation, Act…","""140""",8.9,"""Miles Morales …","""JoaquimDosSant…","""KempPowers""","""JustinK.Thomps…","""ShameikMoore""","""HaileeSteinfel…",168355,2
"""Extraction II""",2023,"""R""","""Action, Thrill…","""122""",7.1,"""After barely s…","""SamHargrave""","""ChrisHemsworth…","""GolshiftehFara…","""AdamBessa""","""TornikeGogrich…",81123,3
"""Avatar: The Wa…",2022,"""PG-13""","""Action, Advent…","""192""",7.6,"""Jake Sully liv…","""JamesCameron""","""SamWorthington…","""ZoeSaldana""","""SigourneyWeave…","""StephenLang""",424931,4


#remove STOPWORDS

In [None]:
# 72 sec
stop_words = set(stopwords.words('english'))


In [None]:
df  = df.with_columns(
    pl.col('synopsis').apply(lambda x: ' '.join([word for word in word_tokenize(x) if word.lower() not in stop_words]))
)

## Joining Names and SUrnames

In [None]:
''' I wanted to join all names in the data, but it crashes and requires more RAM,
'''

' I wanted to join all names in the data, but it crashes and requires more RAM,\n'

In [None]:
import spacy

def concatenate_names(sentence):
    nlp = spacy.load("en_core_web_lg")
    ner_dict = dict()

    # Extract names from the sentence using NER
    doc = nlp(sentence)

    for ent in doc.ents:
        if ent.label_ == "PERSON":
            name = ent.text
            new_name = name.replace(' ','')
            ner_dict[name] = new_name

    # Replace names with names without spaces in the sentence
    for i, j in ner_dict.items():
        sentence = sentence.replace(i,j)

    return sentence

# Test the function
# sentence = "Ryan Holiday is an author, and Ryan is famous for his books."
sentence = 'my name is dick greyson and this is my girifriend harley quinn'
result = concatenate_names(sentence)
print(result)


my name is dickgreyson and this is my girifriend harleyquinn


In [None]:
# all_synopsis = df['synopsis'].to_list()

In [None]:
# all_syn = ' <sos> '.join(all_synopsis)

In [None]:
# df  = df.with_columns(
#     pl.col('synopsis').apply(concatenate_names).alias('new')
# )

In [None]:
# start_time = time.perf_counter()
# print('Strated :',  time.strftime("%H:%M:%S", time.localtime()))

# with cf.ProcessPoolExecutor() as executer:
#     dodo = [executer.submit(concatenate_names, i) for i in all_synopsis]
#     # clean_tags = dodo.result()

# print('Took: ',time.perf_counter()-start_time)
# print('Finished :',  time.strftime("%H:%M:%S", time.localtime()))


# Create a Tag Column
Combining all data

In [None]:
df.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285340 entries, 0 to 285339
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   title        285340 non-null  object 
 1   year         285340 non-null  int64  
 2   certificate  72952 non-null   object 
 3   genre        285340 non-null  object 
 4   time         204239 non-null  object 
 5   rating       184313 non-null  float64
 6   synopsis     285340 non-null  object 
 7   director     269259 non-null  object 
 8   actor 1      253715 non-null  object 
 9   actor 2      246316 non-null  object 
 10  actor 3      239797 non-null  object 
 11  actor 4      230067 non-null  object 
 12  votes        285340 non-null  int64  
 13  movies_id    285340 non-null  int64  
dtypes: float64(1), int64(3), object(10)
memory usage: 30.5+ MB


In [None]:
df = df.with_columns([
    pl.col('votes').fill_null(0),
    pl.col('rating').fill_null(0.0)

    ])

# df = df.with_columns([
#     pl.col('votes').fill_nan(0),
#     pl.col('rating').fill_nan(0),

#     ])
df = df.fill_nan('')
df = df.fill_null('')


In [None]:
df = df.with_columns(
    pl.col('rating').cast(pl.Float64))


In [None]:
df.sort([ 'votes', 'rating'], descending=True)

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id,tags
str,i64,str,str,str,f64,str,str,str,str,str,str,i64,i64,str
"""The Shawshank …",1994,"""R""","""Drama …","""142""",9.3,"""course several…","""FrankDarabont""","""TimRobbins""","""MorganFreeman""","""BobGunton""","""WilliamSadler""",2761119,151549,"""the shawshank …"
"""The Dark Knigh…",2008,"""PG-13""","""Action, Crime,…","""152""",9.0,"""menace known J…","""ChristopherNol…","""ChristianBale""","""HeathLedger""","""AaronEckhart""","""MichaelCaine""",2733681,38,"""the dark knigh…"
"""Inception""",2010,"""PG-13""","""Action, Advent…","""148""",8.8,"""thief steals c…","""ChristopherNol…","""LeonardoDiCapr…","""JosephGordon-L…","""ElliotPage""","""KenWatanabe""",2425939,40,"""inception 2010…"
"""Fight Club""",1999,"""R""","""Drama …","""139""",8.8,"""insomniac offi…","""DavidFincher""","""BradPitt""","""EdwardNorton""","""MeatLoaf""","""ZachGrenier""",2198501,151563,"""fight club 199…"
"""Forrest Gump""",1994,"""PG-13""","""Drama, Romance…","""142""",8.8,"""history United…","""RobertZemeckis…","""TomHanks""","""RobinWright""","""GarySinise""","""SallyField""",2147674,151566,"""forrest gump 1…"
"""Pulp Fiction""",1994,"""R""","""Crime, Drama …","""154""",8.9,"""lives two mob …","""QuentinTaranti…","""JohnTravolta""","""UmaThurman""","""SamuelL.Jackso…","""BruceWillis""",2119742,141031,"""pulp fiction 1…"
"""The Matrix""",1999,"""R""","""Action, Sci-Fi…","""136""",8.7,"""beautiful stra…","""LanaWachowski""","""LillyWachowski…","""KeanuReeves""","""LaurenceFishbu…","""Carrie-AnneMos…",1967085,75,"""the matrix 199…"
"""Interstellar""",2014,"""PG-13""","""Adventure, Dra…","""169""",8.7,"""Earth becomes …","""ChristopherNol…","""MatthewMcConau…","""AnneHathaway""","""JessicaChastai…","""MackenzieFoy""",1928943,49931,"""interstellar 2…"
"""The Lord of th…",2001,"""PG-13""","""Action, Advent…","""178""",8.8,"""meek Hobbit Sh…","""PeterJackson""","""ElijahWood""","""IanMcKellen""","""OrlandoBloom""","""SeanBean""",1923733,57,"""the lord of th…"
"""The Godfather""",1972,"""R""","""Crime, Drama …","""175""",9.2,"""Vito Corleone …","""FrancisFordCop…","""MarlonBrando""","""AlPacino""","""JamesCaan""","""DianeKeaton""",1921318,141032,"""the godfather …"


In [None]:
df.shape

(285340, 14)

In [None]:
df2 = df.with_columns(
    pl.col('year').cast('str').alias('year_as_str'),
    pl.col('rating').cast('str').alias('rating_as_str')
)

In [None]:
cols_to_combine = ['title', 'year_as_str', 'certificate', 'genre', 'time', 'rating_as_str', 'synopsis',
       'director', 'actor 1', 'actor 2', 'actor 3', 'actor 4',
       ]

In [None]:
df = df.with_columns(pl.Series('tags', values = df2[cols_to_combine].apply(' '.join).to_series()))


In [None]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id,tags
str,i64,str,str,str,str,str,str,str,str,str,str,i64,i64,str
"""Indiana Jones …",2023,"""PG-13""","""Action, Advent…","""154""","""6.9""","""Archaeologist …","""JamesMangold""","""HarrisonFord""","""PhoebeWaller-B…","""AntonioBandera…","""KarenAllen""",41029,0,"""Indiana Jones …"
"""The Flash""",2023,"""PG-13""","""Action, Advent…","""144""","""7.2""","""Barry Allen us…","""AndyMuschietti…","""EzraMiller""","""MichaelKeaton""","""SashaCalle""","""MichaelShannon…",80814,1,"""The Flash 2023…"
"""Spider-Man: Ac…",2023,"""PG""","""Animation, Act…","""140""","""8.9""","""Miles Morales …","""JoaquimDosSant…","""KempPowers""","""JustinK.Thomps…","""ShameikMoore""","""HaileeSteinfel…",168355,2,"""Spider-Man: Ac…"
"""Extraction II""",2023,"""R""","""Action, Thrill…","""122""","""7.1""","""barely survivi…","""SamHargrave""","""ChrisHemsworth…","""GolshiftehFara…","""AdamBessa""","""TornikeGogrich…",81123,3,"""Extraction II …"
"""Avatar: The Wa…",2022,"""PG-13""","""Action, Advent…","""192""","""7.6""","""Jake Sully liv…","""JamesCameron""","""SamWorthington…","""ZoeSaldana""","""SigourneyWeave…","""StephenLang""",424931,4,"""Avatar: The Wa…"


In [None]:
def remove_punctuations(text):
    return text.translate(str.maketrans("", "", string.punctuation))

In [None]:
df = df.with_columns(pl.col('tags').apply(remove_punctuations))
df = df.with_columns(pl.col('tags').str.to_lowercase())


In [None]:
# Create a PorterStemmer object
stemmer = PorterStemmer()

# Apply the stemming function to each word in the series
df = df.with_columns(pl.col('tags').apply(stemmer.stem))


In [None]:
dff = df[['title', 'movies_id', 'tags', 'votes', 'rating']]

In [None]:
dff.head()

title,movies_id,tags,votes,rating
str,i64,str,i64,str
"""Indiana Jones …",0,"""indiana jones …",41029,"""6.9"""
"""The Flash""",1,"""the flash 2023…",80814,"""7.2"""
"""Spider-Man: Ac…",2,"""spiderman acro…",168355,"""8.9"""
"""Extraction II""",3,"""extraction ii …",81123,"""7.1"""
"""Avatar: The Wa…",4,"""avatar the way…",424931,"""7.6"""


In [None]:
# dff['tags'] = dff['tags'].str.split()
# dff = dff.with_columns(pl.col('tags').str.split(' '))


In [None]:
dff.head()

title,movies_id,tags,votes,rating
str,i64,str,i64,str
"""Indiana Jones …",0,"""indiana jones …",41029,"""6.9"""
"""The Flash""",1,"""the flash 2023…",80814,"""7.2"""
"""Spider-Man: Ac…",2,"""spiderman acro…",168355,"""8.9"""
"""Extraction II""",3,"""extraction ii …",81123,"""7.1"""
"""Avatar: The Wa…",4,"""avatar the way…",424931,"""7.6"""


In [None]:
all_words = dff['tags'].to_list()

In [None]:
def flatten_nested_list(nested_list):
    return [item for sublist in nested_list for item in (flatten_nested_list(sublist) if isinstance(sublist, list) else [sublist])]


In [None]:
all_wordss = flatten_nested_list(all_words)

In [None]:
len(all_wordss), len(set(all_wordss))

(285340, 285335)

In [None]:
from sklearn.feature_extraction.text import  CountVectorizer

In [None]:
cv = CountVectorizer(lowercase=True, stop_words='english', max_features=5000, dtype=np.int16 )

In [None]:
vector = cv.fit_transform(dff['tags'].to_list())

In [None]:
vector

<285340x5000 sparse matrix of type '<class 'numpy.int16'>'
	with 4107158 stored elements in Compressed Sparse Row format>

In [None]:
vector[2]

<1x5000 sparse matrix of type '<class 'numpy.int16'>'
	with 19 stored elements in Compressed Sparse Row format>

In [None]:
# pickle.dump(movie_vector, open('movie_vector.pkl','wb'))  # creates file of 2.6 gb
pickle.dump(vector, open('vector.pkl','wb'))

In [None]:
dff.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285340 entries, 0 to 285339
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   title      285340 non-null  object
 1   movies_id  285340 non-null  int64 
 2   tags       285340 non-null  object
 3   votes      285340 non-null  int64 
 4   rating     285340 non-null  object
dtypes: int64(2), object(3)
memory usage: 10.9+ MB


In [None]:
def find_similar_movies(x:str, k = 5) -> List[str]:
    movie_id  = dff.filter(pl.col('title') == x)['movies_id'].to_list()[0]
    # print(movie_id)
    sim_vec  = cosine_similarity(vector, vector[movie_id])
    y = sorted(enumerate(sim_vec), key=lambda x: x[1], reverse=True)
    recommended_movies_ids = [i[0] for i in y[1:k+1]]
    # print(recommended_movies_ids)
    for i in recommended_movies_ids:
        print(dff.filter(pl.col('movies_id') == i)['title'].to_list()[0])

In [None]:
find_similar_movies(
 'Intimate Witness',
k = 10)

Crestfallen
Dead Name
Burglar and Umbrella
Out of Death
Konsento
Ferrugem
Brother Sister Lover
Jailbreak in Hamburg
Temptations Again
Code of Hunting


In [None]:
dff['title'].sample(50).to_list()

['Ball Lightning',
 'Evil Behind You',
 'The Smiley Face Killers',
 'Kaali Pahadi',
 'The Right Juice',
 'Seoul Rainbow',
 'Andromedia',
 'Young Shim',
 'Lashkar',
 '21 Days',
 'Man Down',
 'Hatyare',
 'Leonardo Syndrome',
 '12 AM Madhyarathri',
 'Office Invasion',
 'Anandache Jhaad',
 'Harts Ridge',
 'On the Waterfront',
 'The Life of a Jackeroo',
 'The Flaming Frontier',
 'Dayavan',
 "Far til fire's vilde ferie",
 'The Devils Are Here',
 'Un amor extraño',
 'Caballo patas de acero',
 'Hostage: Missing Celebrity',
 'Crimen en Chihuahua',
 'The Four Poster',
 'Acceptera Mig',
 'Slime',
 'Seven Days in Never',
 'Maryade Ramanna',
 'Faerie Ring',
 'Can yue li hun',
 'Salam-e-Madrasa',
 'No Retreat, No Surrender',
 'Apache Street',
 'Kora Kids',
 'Hintertreppe',
 'Maya and Her Lover',
 'Bobby Fischer Live',
 'Dirty Rhetoric',
 'Suvarna Sundari',
 'The Man from the Future',
 'Warat',
 'Jack Zollo: My Life in Crime',
 'R A N D, inc.',
 'Missing Evidence',
 'Intimate Witness',
 'Find the Lad

In [None]:
if str(dff['rating'].dtype) == 'Utf8':
    dff = dff.with_columns([pl.col('rating').str.replace_all('',0)])
    dff = dff.with_columns([pl.col('rating').cast(pl.Float64), pl.col('movies_id').cast(pl.Int64) ])

if str(dff['votes'].dtype) == 'Int64':
    pass

In [None]:
dff.median()

title,movies_id,tags,votes,rating
str,f64,str,f64,f64
,142669.5,,26.0,50.01


In [None]:
df.sort(by=['rating', 'votes'], descending=True)

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id,tags
str,i64,str,str,str,str,str,str,str,str,str,str,i64,i64,str
"""Uruttu Tech""",2023,"""U""","""Comedy …","""""","""9.9""","""tech war truth…","""PcdocNandha""","""""","""""","""""","""""",7069,78580,"""uruttu tech 20…"
"""Shubh Yatra""",2023,"""UA""","""Drama, Family …","""132""","""9.9""","""young man , ad…","""ManishSaini""","""MalharThakar""","""MonalGajjar""","""DarshanJariwal…","""HeminTrivedi""",1337,153662,"""shubh yatra 20…"
"""Saachi""",2023,"""""","""Drama …","""118""","""9.9""","""Saachi high sc…","""VivekPothagoni…","""GeethikaRathan…","""MulaviratAshok…","""SanjanaReddy""","""ChellySwapna""",1226,168388,"""saachi 2023 d…"
"""Mariguddada Ga…",2023,"""""","""Thriller …","""""","""9.9""","""fictional stor…","""RChandrakant""","""PraveenRaj""","""DineshKumarD""","""Namratha""","""Avinash""",710,271126,"""mariguddada ga…"
"""Yuzuru Hanyu I…",2023,"""""","""Sport …","""""","""9.9""","""Add Plot""","""YuzuruHanyu""","""YuzuruHanyu""","""""","""""","""""",292,267309,"""yuzuru hanyu i…"
"""IRavan""",2023,"""""","""Thriller …","""""","""9.9""","""man studying a…","""RamsRanga""","""Avinash""","""KrishnaHebbale…","""KanthrajuKaddi…","""J.Karthik""",97,269687,"""iravan 2023 t…"
"""Oye Jassi Oye""",2023,"""""","""Comedy, Romanc…","""""","""9.9""","""Add Plot""","""GanyaRajput""","""AshishBhat""","""GanyaRajput""","""""","""""",68,88135,"""oye jassi oye …"
"""Mary""",2023,"""UA 13+""","""Action, Thrill…","""94""","""9.9""","""looking succes…","""ManojP.Nadalum…","""AnooshaKrishna…","""ThejaswiniShar…","""VikashUttaiah""","""ChethanVicky""",45,8936,"""mary 2023 ua 1…"
"""Bablee""",2023,"""UA""","""Romance …","""130""","""9.9""","""Satya exiled m…","""RobertMegha""","""ChauthmolAniru…","""GaggnGajarlwwa…","""DhoteKanchan""","""DadgalNilesh""",41,258423,"""bablee 2023 ua…"
"""The Next Morni…",2022,"""""","""Crime …","""109""","""9.9""","""Children simil…","""SenthilkumarAl…","""EramAli""","""SusmitaBanerje…","""KaakanDebnath""","""MinaGhosh""",32,144443,"""the next morni…"


In [None]:
def find_popular_movies( k = 5) -> List[str]:
    movie_id  = dff.filter(pl.col('title') == x)['movies_id'].to_list()[0]
    # print(movie_id)
    sim_vec  = cosine_similarity(vector, vector[movie_id])
    y = sorted(enumerate(sim_vec), key=lambda x: x[1], reverse=True)
    recommended_movies_ids = [i[0] for i in y[1:k+1]]
    # print(recommended_movies_ids)
    for i in recommended_movies_ids:
        print(dff.filter(pl.col('movies_id') == i)['title'].to_list()[0])

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.sort_values(['votes','rating' ], ascending=False)