In [116]:
import pandas as pd
import polars as pl
import re
import numpy as np
import ast, os, zipfile
import torch
from pathlib import Path
import pickle
from typing import List
from sklearn.metrics.pairwise import cosine_similarity
from nltk import download
from nltk.stem import PorterStemmer
# import concurrent.futures as cf
# import time
import string
# import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

Dataset [link here](https://www.kaggle.com/datasets/moon114/imdb-all-movies-dataset)

In [30]:
# nltk.download('punkt')
# nltk.download('stopwords')

In [31]:
if 'colab' in str(get_ipython()):
    print('Running in COlab')

    # download spacy model
    # !python -m spacy download en_core_web_lg -q
    # spacy.load("en_core_web_lg")
    download('stopwords')
    download('punkt')

    # get data
    try:
        ! git clone https://github.com/tikendraw/movie-recommender-system.git
    except:
        pass

    os.chdir('movie-recommender-system')

    with zipfile.ZipFile('imdb all movies dataset.zip') as f:
        f.extractall('dataset/')

In [32]:
# os.chdir('movie-recommender-system')

In [33]:
dataset_filepath = Path('./dataset/movies.csv')

# df = pd.read_csv(dataset_filepath)
df = pl.read_csv(dataset_filepath)

In [34]:
#lower the column names for ease of typing
df.columns = [i.lower() for i in df.columns]

# try:
#     # dropping extra columns if exists
#     df.drop('unnamed: 0', inplace =True, axis = 1)
# except:
#     pass


In [35]:
df.shape, df.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285341 entries, 0 to 285340
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0                285341 non-null  int64  
 1   title        285340 non-null  object 
 2   year         245225 non-null  object 
 3   certificate  72952 non-null   object 
 4   genre        285341 non-null  object 
 5   time         204239 non-null  object 
 6   rating       184313 non-null  float64
 7   rate         239176 non-null  object 
 8   synopsis     285341 non-null  object 
 9   content      285336 non-null  object 
 10  director     269260 non-null  object 
 11  actor 1      253716 non-null  object 
 12  actor 2      246317 non-null  object 
 13  actor 3      239798 non-null  object 
 14  actor 4      230068 non-null  object 
 15  votes        184368 non-null  object 
dtypes: float64(1), int64(1), object(14)
memory usage: 34.8+ MB


((285341, 16), None)

## Objective

We will convert all important features of movies into vectors and then find similarity between movies.

### Clean the data
0. remove Duplicates
1. Remove the alpha chars from years using regex
2. Remove \n chars from genres and synopsis, content using regex
3. Remove rate column(no info), drop content(contains director and actors name which we already have)
4. Remove spaces from names

In [36]:
df.head()

Unnamed: 0_level_0,title,year,certificate,genre,time,rating,rate,synopsis,content,director,actor 1,actor 2,actor 3,actor 4,votes
i64,str,str,str,str,str,f64,str,str,str,str,str,str,str,str,str
0,"""Indiana Jones …","""-2023""","""PG-13""",""" Action, Adven…","""154 min""",6.9,"""Rate this""",""" Archaeologist…","""  Director:…","""James Mangold""","""Harrison Ford""","""Phoebe Waller-…","""Antonio Bander…","""Karen Allen""","""41,029"""
1,"""The Flash""","""-2023""","""PG-13""",""" Action, Adven…","""144 min""",7.2,"""Rate this""",""" Barry Allen u…","""  Director:…","""Andy Muschiett…","""Ezra Miller""","""Michael Keaton…","""Sasha Calle""","""Michael Shanno…","""80,814"""
2,"""Spider-Man: Ac…","""-2023""","""PG""",""" Animation, Ac…","""140 min""",8.9,"""Rate this""",""" Miles Morales…","""  Directors…","""Joaquim Dos Sa…","""Kemp Powers""","""Justin K. Thom…","""Shameik Moore""","""Hailee Steinfe…","""1,68,355"""
3,"""Extraction II""","""-2023""","""R""",""" Action, Thril…","""122 min""",7.1,"""Rate this""",""" After barely …","""  Director:…","""Sam Hargrave""","""Chris Hemswort…","""Golshifteh Far…","""Adam Bessa""","""Tornike Gogric…","""81,123"""
4,"""Avatar: The Wa…","""-2022""","""PG-13""",""" Action, Adven…","""192 min""",7.6,"""Rate this""",""" Jake Sully li…","""  Director:…","""James Cameron""","""Sam Worthingto…","""Zoe Saldana""","""Sigourney Weav…","""Stephen Lang""","""4,24,931"""


# Preprocessing

In [37]:
# check for duplicates
df.is_duplicated().sum()

0

## Removing Nulls

In [38]:
# check for nulls
df.null_count()

Unnamed: 0_level_0,title,year,certificate,genre,time,rating,rate,synopsis,content,director,actor 1,actor 2,actor 3,actor 4,votes
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,1,40116,212389,0,81102,101028,46165,0,5,16081,31625,39024,45543,55273,100973


Here dataset sets has alot of missing values, That won't stop us from over purpose.

The Most Important factor for Movie here is to have a title and synopsis. We will fill nans with space. and drop those who doesn't have title or synopsis.

In [39]:
df.shape

(285341, 16)

In [40]:
# movie index with no title
df = df.drop_nulls(subset=["title"])

In [41]:
df.shape

(285340, 16)

## Dropping Columns

Content column only contains 4 actors name and director's name  which we already have as columns, so dropping it

In [42]:
#dropping columns
df = df.drop(['rate', 'content', ''])

In [43]:
df.shape

(285340, 13)

## Column preprocessing

In [44]:
def remove_newline_chars(text):
    """Removes `\n` characters from a text string."""
    text_str = re.sub(r"\n", "", text)
    return text_str

def remove_punctuations(text):
    text = str(text).lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.strip()
    return text

def join_names(text):
    text = str(text).lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = text.replace(' ','')
    text = text.strip()
    return text

In [45]:
a = 'heloo  909)(*)()(&3gg '
remove_punctuations(a)

'heloo  9093gg'

In [46]:
# Movies doesn't have any id, we will create id
df= df.with_columns([
    pl.Series(name="movies_id", values=range(len(df)))
])

In [47]:
# cleaning number columns
df = df.with_columns([
    pl.col('year').str.replace_all(r'[^0-9]', '').alias('year'),
    pl.col('time').str.replace_all(r'[^0-9]', '').alias('time'),
    pl.col('votes').str.replace_all(r'[^0-9]', '').alias('votes'),
])

In [48]:
# filling 0 inplace of Nans, in votes number column (we need this for popular movies)
df = df.with_columns(pl.col('votes').fill_null('0'))
df = df.with_columns(pl.col('votes').cast(pl.Float64))
df = df.with_columns(pl.col('votes').cast(pl.Int64))

In [49]:
df = df.with_columns([
    pl.col('certificate').apply(remove_punctuations),
    pl.col('synopsis').apply(remove_punctuations),

])

In [50]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
str,str,str,str,str,f64,str,str,str,str,str,str,i64,i64
"""Indiana Jones …","""2023""","""pg13""",""" Action, Adven…","""154""",6.9,"""archaeologist …","""James Mangold""","""Harrison Ford""","""Phoebe Waller-…","""Antonio Bander…","""Karen Allen""",41029,0
"""The Flash""","""2023""","""pg13""",""" Action, Adven…","""144""",7.2,"""barry allen us…","""Andy Muschiett…","""Ezra Miller""","""Michael Keaton…","""Sasha Calle""","""Michael Shanno…",80814,1
"""Spider-Man: Ac…","""2023""","""pg""",""" Animation, Ac…","""140""",8.9,"""miles morales …","""Joaquim Dos Sa…","""Kemp Powers""","""Justin K. Thom…","""Shameik Moore""","""Hailee Steinfe…",168355,2
"""Extraction II""","""2023""","""r""",""" Action, Thril…","""122""",7.1,"""after barely s…","""Sam Hargrave""","""Chris Hemswort…","""Golshifteh Far…","""Adam Bessa""","""Tornike Gogric…",81123,3
"""Avatar: The Wa…","""2022""","""pg13""",""" Action, Adven…","""192""",7.6,"""jake sully liv…","""James Cameron""","""Sam Worthingto…","""Zoe Saldana""","""Sigourney Weav…","""Stephen Lang""",424931,4


In [51]:
# filling 0 inplace of Nans, in votes number column (we need this for popular movies)
# df['year'] = pd.to_numeric(df['year'] )
# df['year']=df.year.fillna(0)
# df['year'] = df['year'].astype('int')

df = df.with_columns(pl.col('year').fill_null(0))
df = df.with_columns(pl.col('year').str.replace('', 0))
df = df.with_columns(pl.col('year').cast(pl.Float64))
df = df.with_columns(pl.col('year').cast(pl.Int64))

In [52]:
df = df.with_columns(pl.col('rating').fill_null(0))
df = df.with_columns(pl.col('rating').str.replace('', 0))
df = df.with_columns(pl.col('rating').cast(pl.Float64))

In [53]:
df = df.with_columns(pl.col('time').fill_null(100))
df = df.with_columns(pl.col('time').str.replace('', 0))
df = df.with_columns(pl.col('time').cast(pl.Int64))

In [54]:
df.null_count()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,212388,0,0,0,0,16081,31625,39024,45543,55273,0,0


In [55]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
str,i64,str,str,i64,f64,str,str,str,str,str,str,i64,i64
"""Indiana Jones …",2023,"""pg13""",""" Action, Adven…",154,6.9,"""archaeologist …","""James Mangold""","""Harrison Ford""","""Phoebe Waller-…","""Antonio Bander…","""Karen Allen""",41029,0
"""The Flash""",2023,"""pg13""",""" Action, Adven…",144,7.2,"""barry allen us…","""Andy Muschiett…","""Ezra Miller""","""Michael Keaton…","""Sasha Calle""","""Michael Shanno…",80814,1
"""Spider-Man: Ac…",2023,"""pg""",""" Animation, Ac…",140,8.9,"""miles morales …","""Joaquim Dos Sa…","""Kemp Powers""","""Justin K. Thom…","""Shameik Moore""","""Hailee Steinfe…",168355,2
"""Extraction II""",2023,"""r""",""" Action, Thril…",122,7.1,"""after barely s…","""Sam Hargrave""","""Chris Hemswort…","""Golshifteh Far…","""Adam Bessa""","""Tornike Gogric…",81123,3
"""Avatar: The Wa…",2022,"""pg13""",""" Action, Adven…",192,7.6,"""jake sully liv…","""James Cameron""","""Sam Worthingto…","""Zoe Saldana""","""Sigourney Weav…","""Stephen Lang""",424931,4


In [56]:
# df["genre"] = df["genre"].apply(remove_newline_chars)
# df["synopsis"] = df["synopsis"].apply(remove_newline_chars)


df = df.with_columns([
    pl.col('genre').apply(remove_newline_chars),
    pl.col('synopsis').apply(remove_newline_chars)
])

In [57]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
str,i64,str,str,i64,f64,str,str,str,str,str,str,i64,i64
"""Indiana Jones …",2023,"""pg13""","""Action, Advent…",154,6.9,"""archaeologist …","""James Mangold""","""Harrison Ford""","""Phoebe Waller-…","""Antonio Bander…","""Karen Allen""",41029,0
"""The Flash""",2023,"""pg13""","""Action, Advent…",144,7.2,"""barry allen us…","""Andy Muschiett…","""Ezra Miller""","""Michael Keaton…","""Sasha Calle""","""Michael Shanno…",80814,1
"""Spider-Man: Ac…",2023,"""pg""","""Animation, Act…",140,8.9,"""miles morales …","""Joaquim Dos Sa…","""Kemp Powers""","""Justin K. Thom…","""Shameik Moore""","""Hailee Steinfe…",168355,2
"""Extraction II""",2023,"""r""","""Action, Thrill…",122,7.1,"""after barely s…","""Sam Hargrave""","""Chris Hemswort…","""Golshifteh Far…","""Adam Bessa""","""Tornike Gogric…",81123,3
"""Avatar: The Wa…",2022,"""pg13""","""Action, Advent…",192,7.6,"""jake sully liv…","""James Cameron""","""Sam Worthingto…","""Zoe Saldana""","""Sigourney Weav…","""Stephen Lang""",424931,4


In [58]:
df['genre'][0]

'Action, Adventure            '

In [59]:
#removing space bw name and surname

In [60]:
df = df.with_columns([
    pl.col('director').apply(join_names),
    pl.col('actor 1' ).apply(join_names),
    pl.col('actor 2' ).apply(join_names),
    pl.col('actor 3' ).apply(join_names),
    pl.col('actor 4' ).apply(join_names),
    pl.col('genre'   ).str.replace(' ', ''),
])

df= df.with_columns(pl.col('genre'   ).str.replace(',', ', '))

In [61]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
str,i64,str,str,i64,f64,str,str,str,str,str,str,i64,i64
"""Indiana Jones …",2023,"""pg13""","""Action, Advent…",154,6.9,"""archaeologist …","""jamesmangold""","""harrisonford""","""phoebewallerbr…","""antoniobandera…","""karenallen""",41029,0
"""The Flash""",2023,"""pg13""","""Action, Advent…",144,7.2,"""barry allen us…","""andymuschietti…","""ezramiller""","""michaelkeaton""","""sashacalle""","""michaelshannon…",80814,1
"""Spider-Man: Ac…",2023,"""pg""","""Animation, Act…",140,8.9,"""miles morales …","""joaquimdossant…","""kemppowers""","""justinkthompso…","""shameikmoore""","""haileesteinfel…",168355,2
"""Extraction II""",2023,"""r""","""Action, Thrill…",122,7.1,"""after barely s…","""samhargrave""","""chrishemsworth…","""golshiftehfara…","""adambessa""","""tornikegogrich…",81123,3
"""Avatar: The Wa…",2022,"""pg13""","""Action, Advent…",192,7.6,"""jake sully liv…","""jamescameron""","""samworthington…","""zoesaldana""","""sigourneyweave…","""stephenlang""",424931,4


## Joining Names and SUrnames

In [None]:
''' I wanted to join all names in the data, but it crashes and requires more RAM,
'''

' I wanted to join all names in the data, but it crashes and requires more RAM,\n'

In [63]:
# def concatenate_names(sentence):
#     nlp = spacy.load("en_core_web_lg")
#     ner_dict = dict()

#     # Extract names from the sentence using NER
#     doc = nlp(sentence)

#     for ent in doc.ents:
#         if ent.label_ == "PERSON":
#             name = ent.text
#             new_name = name.replace(' ','')
#             ner_dict[name] = new_name

#     # Replace names with names without spaces in the sentence
#     for i, j in ner_dict.items():
#         sentence = sentence.replace(i,j)

#     return sentence

# # Test the function
# # sentence = "Ryan Holiday is an author, and Ryan is famous for his books."
# sentence = 'my name is dick greyson and this is my girifriend harley quinn'
# result = concatenate_names(sentence)
# print(result)


my name is dickgreyson and this is my girifriend harleyquinn


In [None]:
# all_synopsis = df['synopsis'].to_list()

In [None]:
# all_syn = ' <sos> '.join(all_synopsis)

In [None]:
# df  = df.with_columns(
#     pl.col('synopsis').apply(concatenate_names).alias('new')
# )

In [None]:
# start_time = time.perf_counter()
# print('Strated :',  time.strftime("%H:%M:%S", time.localtime()))

# with cf.ProcessPoolExecutor() as executer:
#     dodo = [executer.submit(concatenate_names, i) for i in all_synopsis]
#     # clean_tags = dodo.result()

# print('Took: ',time.perf_counter()-start_time)
# print('Finished :',  time.strftime("%H:%M:%S", time.localtime()))


# Create a Tag Column
Combining all data

In [64]:
df.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285340 entries, 0 to 285339
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   title        285340 non-null  object 
 1   year         285340 non-null  int64  
 2   certificate  72952 non-null   object 
 3   genre        285340 non-null  object 
 4   time         285340 non-null  int64  
 5   rating       285340 non-null  float64
 6   synopsis     285340 non-null  object 
 7   director     269259 non-null  object 
 8   actor 1      253715 non-null  object 
 9   actor 2      246316 non-null  object 
 10  actor 3      239797 non-null  object 
 11  actor 4      230067 non-null  object 
 12  votes        285340 non-null  int64  
 13  movies_id    285340 non-null  int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 30.5+ MB


In [69]:

# df = df.fill_nan('')
df = df.fill_null('')


In [81]:
df = df.with_columns(
    pl.col('year').cast(pl.Utf8).alias('year_as_str'),
    pl.col('rating').cast(pl.Utf8).alias('rating_as_str'),
    pl.col('time').cast(pl.Utf8).alias('time_as_str')
)

In [82]:
print(df.columns)

['title', 'year', 'certificate', 'genre', 'time', 'rating', 'synopsis', 'director', 'actor 1', 'actor 2', 'actor 3', 'actor 4', 'votes', 'movies_id', 'year_as_str', 'rating_as_str', 'time_as_str']


In [83]:
cols_to_combine = ['title', 'year_as_str', 'certificate', 'genre', 'time_as_str', 'rating_as_str', 'synopsis', 
                   'director', 'actor 1', 'actor 2', 'actor 3', 'actor 4', ]

In [85]:
df = df.with_columns(pl.Series('tags', values = df[cols_to_combine].apply(' '.join).to_series()))


In [86]:
df.head()

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id,year_as_str,rating_as_str,time_as_str,tags
str,i64,str,str,i64,f64,str,str,str,str,str,str,i64,i64,str,str,str,str
"""Indiana Jones …",2023,"""pg13""","""Action, Advent…",154,6.9,"""archaeologist …","""jamesmangold""","""harrisonford""","""phoebewallerbr…","""antoniobandera…","""karenallen""",41029,0,"""2023""","""6.9""","""154""","""Indiana Jones …"
"""The Flash""",2023,"""pg13""","""Action, Advent…",144,7.2,"""barry allen us…","""andymuschietti…","""ezramiller""","""michaelkeaton""","""sashacalle""","""michaelshannon…",80814,1,"""2023""","""7.2""","""144""","""The Flash 2023…"
"""Spider-Man: Ac…",2023,"""pg""","""Animation, Act…",140,8.9,"""miles morales …","""joaquimdossant…","""kemppowers""","""justinkthompso…","""shameikmoore""","""haileesteinfel…",168355,2,"""2023""","""8.9""","""140""","""Spider-Man: Ac…"
"""Extraction II""",2023,"""r""","""Action, Thrill…",122,7.1,"""after barely s…","""samhargrave""","""chrishemsworth…","""golshiftehfara…","""adambessa""","""tornikegogrich…",81123,3,"""2023""","""7.1""","""122""","""Extraction II …"
"""Avatar: The Wa…",2022,"""pg13""","""Action, Advent…",192,7.6,"""jake sully liv…","""jamescameron""","""samworthington…","""zoesaldana""","""sigourneyweave…","""stephenlang""",424931,4,"""2022""","""7.6""","""192""","""Avatar: The Wa…"


In [87]:
df = df.with_columns(pl.col('tags').apply(remove_punctuations))

In [88]:
# Create a PorterStemmer object
stemmer = PorterStemmer()

# Apply the stemming function to each word in the series
df = df.with_columns(pl.col('tags').apply(stemmer.stem))


In [89]:
dff = df[['title', 'movies_id', 'tags', 'votes', 'rating']]

In [90]:
dff.head()

title,movies_id,tags,votes,rating
str,i64,str,i64,f64
"""Indiana Jones …",0,"""indiana jones …",41029,6.9
"""The Flash""",1,"""the flash 2023…",80814,7.2
"""Spider-Man: Ac…",2,"""spiderman acro…",168355,8.9
"""Extraction II""",3,"""extraction ii …",81123,7.1
"""Avatar: The Wa…",4,"""avatar the way…",424931,7.6


In [91]:
# dff['tags'] = dff['tags'].str.split()
# dff = dff.with_columns(pl.col('tags').str.split(' '))


In [93]:
all_words = dff['tags'].to_list()

In [94]:
def flatten_nested_list(nested_list):
    return [item for sublist in nested_list for item in (flatten_nested_list(sublist) if isinstance(sublist, list) else [sublist])]


In [95]:
all_wordss = flatten_nested_list(all_words)

In [96]:
len(all_wordss), len(set(all_wordss))

(285340, 285335)

In [97]:
from sklearn.feature_extraction.text import  CountVectorizer

In [98]:
cv = CountVectorizer(lowercase=True, stop_words='english', max_features=5000, dtype=np.int16 )

In [99]:
vector = cv.fit_transform(dff['tags'].to_list())

In [100]:
vector

<285340x5000 sparse matrix of type '<class 'numpy.int16'>'
	with 4266507 stored elements in Compressed Sparse Row format>

In [101]:
vector[2]

<1x5000 sparse matrix of type '<class 'numpy.int16'>'
	with 20 stored elements in Compressed Sparse Row format>

In [102]:
# pickle.dump(movie_vector, open('movie_vector.pkl','wb'))  # creates file of 2.6 gb
pickle.dump(vector, open('vector.pkl','wb'))

In [103]:
dff.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285340 entries, 0 to 285339
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   title      285340 non-null  object 
 1   movies_id  285340 non-null  int64  
 2   tags       285340 non-null  object 
 3   votes      285340 non-null  int64  
 4   rating     285340 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 10.9+ MB


In [104]:
def find_similar_movies(x:str, k = 5) -> List[str]:
    movie_id  = dff.filter(pl.col('title') == x)['movies_id'].to_list()[0]
    # print(movie_id)
    sim_vec  = cosine_similarity(vector, vector[movie_id])
    y = sorted(enumerate(sim_vec), key=lambda x: x[1], reverse=True)
    recommended_movies_ids = [i[0] for i in y[1:k+1]]
    # print(recommended_movies_ids)
    for i in recommended_movies_ids:
        print(dff.filter(pl.col('movies_id') == i)['title'].to_list()[0])

In [110]:
find_similar_movies(
 'Killing a Traitor',
k = 10)

Elli Kelimelik Mektuplar
Viddikalude Maash
Budhiya
Caravaggio's Shadow
Botev
Samo edna zelba
Charandas Chor
Haemolymph
Chinna Sharma
Panduan Mempersiapkan Perpisahan


In [106]:
dff['title'].sample(50).to_list()

['Love Suicides',
 'Court',
 'Aroni Tokhon',
 'Untitled Charlie Mohr Film',
 'Killing a Traitor',
 'Nava Pappa',
 'The Table Read',
 'Geçerken Ugradim',
 'Crazy Moon',
 'Bestezuelas',
 'Basagan ng mukha',
 'Andar Bahar',
 'Mother Monster',
 'Fantasy·World',
 'El jinete solitario',
 'A Grand Little Lie',
 'O Thandri O Koduku',
 'Les dépossédés',
 'J.R. contraataca',
 'Rakhta Charitra 2',
 'Happy for You',
 'Spirits of the Air: Gremlins of the Clouds',
 'Hostage House',
 'The Rest Is Silent',
 'Arangetra Velai',
 'Mr. Go',
 'Oras na para lumaban',
 'The Boy Who Played with Fusion',
 'Shen mi xiao jie',
 'Wild Roses',
 'Gutland',
 'Freedom Flight',
 'Heroes Like Us',
 'Ghoomketu',
 'Strange Bedfellows',
 "The Music Goes 'Round",
 'Els camps magnètics',
 'Temps mort',
 'The Real Reason (Men Commit Crimes)',
 'Lola the Truck Driving Woman',
 'Perfect Education: Maid, for You',
 'The Hospital 2',
 'Coming of Age 2',
 'Valentines Night',
 'Four Feathers',
 'Muerte: Tales of Horror',
 'Mr. Tax

In [112]:
df.sort(by=['votes','rating'], descending=True)

title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id,year_as_str,rating_as_str,time_as_str,tags
str,i64,str,str,i64,f64,str,str,str,str,str,str,i64,i64,str,str,str,str
"""The Shawshank …",1994,"""r""","""Drama …",142,9.3,"""over the cours…","""frankdarabont""","""timrobbins""","""morganfreeman""","""bobgunton""","""williamsadler""",2761119,151549,"""1994""","""9.3""","""142""","""the shawshank …"
"""The Dark Knigh…",2008,"""pg13""","""Action, Crime,…",152,9.0,"""when the menac…","""christophernol…","""christianbale""","""heathledger""","""aaroneckhart""","""michaelcaine""",2733681,38,"""2008""","""9.0""","""152""","""the dark knigh…"
"""Inception""",2010,"""pg13""","""Action, Advent…",148,8.8,"""a thief who st…","""christophernol…","""leonardodicapr…","""josephgordonle…","""elliotpage""","""kenwatanabe""",2425939,40,"""2010""","""8.8""","""148""","""inception 2010…"
"""Fight Club""",1999,"""r""","""Drama …",139,8.8,"""an insomniac o…","""davidfincher""","""bradpitt""","""edwardnorton""","""meatloaf""","""zachgrenier""",2198501,151563,"""1999""","""8.8""","""139""","""fight club 199…"
"""Forrest Gump""",1994,"""pg13""","""Drama, Romance…",142,8.8,"""the history of…","""robertzemeckis…","""tomhanks""","""robinwright""","""garysinise""","""sallyfield""",2147674,151566,"""1994""","""8.8""","""142""","""forrest gump 1…"
"""Pulp Fiction""",1994,"""r""","""Crime, Drama …",154,8.9,"""the lives of t…","""quentintaranti…","""johntravolta""","""umathurman""","""samuelljackson…","""brucewillis""",2119742,141031,"""1994""","""8.9""","""154""","""pulp fiction 1…"
"""The Matrix""",1999,"""r""","""Action, Sci-Fi…",136,8.7,"""when a beautif…","""lanawachowski""","""lillywachowski…","""keanureeves""","""laurencefishbu…","""carrieannemoss…",1967085,75,"""1999""","""8.7""","""136""","""the matrix 199…"
"""Interstellar""",2014,"""pg13""","""Adventure, Dra…",169,8.7,"""when earth bec…","""christophernol…","""matthewmcconau…","""annehathaway""","""jessicachastai…","""mackenziefoy""",1928943,49931,"""2014""","""8.7""","""169""","""interstellar 2…"
"""The Lord of th…",2001,"""pg13""","""Action, Advent…",178,8.8,"""a meek hobbit …","""peterjackson""","""elijahwood""","""ianmckellen""","""orlandobloom""","""seanbean""",1923733,57,"""2001""","""8.8""","""178""","""the lord of th…"
"""The Godfather""",1972,"""r""","""Crime, Drama …",175,9.2,"""don vito corle…","""francisfordcop…","""marlonbrando""","""alpacino""","""jamescaan""","""dianekeaton""",1921318,141032,"""1972""","""9.2""","""175""","""the godfather …"


In [115]:
dff.write_csv(Path('./dataset/movies_cleaned.csv'))