In [5]:
import pandas as pd
import pygwalker as pyg
import re
import numpy as np
import ast
import torch
from pathlib import Path
import pickle
from typing import List
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import PorterStemmer

Dataset [link here](https://www.kaggle.com/datasets/moon114/imdb-all-movies-dataset)

In [6]:
dataset_filepath = Path('./dataset/movies.csv')
df = pd.read_csv(dataset_filepath)

In [7]:
#lower the column names for ease of typing
df.columns = df.columns.str.lower()

try:
    # dropping extra columns if exists
    df.drop('unnamed: 0', inplace =True, axis = 1)
except:
    pass


In [8]:
df.shape, df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285341 entries, 0 to 285340
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   title        285340 non-null  object 
 1   year         245225 non-null  object 
 2   certificate  72952 non-null   object 
 3   genre        285341 non-null  object 
 4   time         204239 non-null  object 
 5   rating       184313 non-null  float64
 6   rate         239176 non-null  object 
 7   synopsis     285341 non-null  object 
 8   content      285336 non-null  object 
 9   director     269260 non-null  object 
 10  actor 1      253716 non-null  object 
 11  actor 2      246317 non-null  object 
 12  actor 3      239798 non-null  object 
 13  actor 4      230068 non-null  object 
 14  votes        184368 non-null  object 
dtypes: float64(1), object(14)
memory usage: 32.7+ MB


((285341, 15), None)

## Objective 

We will convert all important features of movies into vectors and then find similarity between movies.

### Clean the data
0. remove Duplicates
1. Remove the alpha chars from years using regex
2. Remove \n chars from genres and synopsis, content using regex
3. Remove rate column(no info), drop content(contains director and actors name which we already have)
4. Remove spaces from names

In [9]:
df.head()

Unnamed: 0,title,year,certificate,genre,time,rating,rate,synopsis,content,director,actor 1,actor 2,actor 3,actor 4,votes
0,Indiana Jones and the Dial of Destiny,-2023,PG-13,"\nAction, Adventure",154 min,6.9,Rate this,\nArchaeologist Indiana Jones races against ti...,\n Director:\nJames Mangold\n ...,James Mangold,Harrison Ford,Phoebe Waller-Bridge,Antonio Banderas,Karen Allen,41029
1,The Flash,-2023,PG-13,"\nAction, Adventure, Fantasy",144 min,7.2,Rate this,\nBarry Allen uses his super speed to change t...,\n Director:\nAndy Muschietti\n ...,Andy Muschietti,Ezra Miller,Michael Keaton,Sasha Calle,Michael Shannon,80814
2,Spider-Man: Across the Spider-Verse,-2023,PG,"\nAnimation, Action, Adventure",140 min,8.9,Rate this,\nMiles Morales catapults across the Multivers...,"\n Directors:\nJoaquim Dos Santos, \nKemp P...",Joaquim Dos Santos,Kemp Powers,Justin K. Thompson,Shameik Moore,Hailee Steinfeld,168355
3,Extraction II,-2023,R,"\nAction, Thriller",122 min,7.1,Rate this,\nAfter barely surviving his grievous wounds f...,\n Director:\nSam Hargrave\n ...,Sam Hargrave,Chris Hemsworth,Golshifteh Farahani,Adam Bessa,Tornike Gogrichiani,81123
4,Avatar: The Way of Water,-2022,PG-13,"\nAction, Adventure, Fantasy",192 min,7.6,Rate this,\nJake Sully lives with his newfound family fo...,\n Director:\nJames Cameron\n ...,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang,424931


# Preprocessing

In [10]:
# check for duplicates
df.duplicated().sum()

0

## Removing Nulls

In [11]:
# check for nulls
df.isnull().sum()

title               1
year            40116
certificate    212389
genre               0
time            81102
rating         101028
rate            46165
synopsis            0
content             5
director        16081
actor 1         31625
actor 2         39024
actor 3         45543
actor 4         55273
votes          100973
dtype: int64

Here dataset sets has alot of missing values, That won't stop us from over purpose. 

The Most Important factor for Movie here is to have a title and content/synopsis. We will fill nans with space. and drop those who doesn't have title or content/synopsis.

In [12]:
idx_to_drop  = []

In [13]:
# movie index with no title
print(df.index[df.title.isnull()].values)
idx_to_drop.extend(df.index[df.title.isnull()].values)

[12708]


In [14]:
# movie index with no content
print(df.index[df.content.isnull()].values)
idx_to_drop.extend(df.index[df.content.isnull()].values)

[ 55118 210224 269574 271950 285258]


In [15]:
idx_to_drop

[12708, 55118, 210224, 269574, 271950, 285258]

In [16]:
df.shape

(285341, 15)

In [17]:
df.drop(idx_to_drop, axis = 0, inplace = True)

In [18]:
df.shape

(285335, 15)

## Dropping Columns

In [19]:
#dropping columns
df.drop(['rate', 'content'], axis = 1, inplace = True)

## Column preprocessing

In [20]:
# Movies doesn't have any id, we will create id 
df['movies_id'] = np.arange(len(df))

In [21]:
# cleaning number columns 
df['year'] = df['year'].str.replace(r'[^0-9]', '', regex=True)
df['time'] = df['time'].str.replace(r'[^0-9]', '', regex=True)
df['votes'] = df['votes'].str.replace(r'[^0-9]', '', regex=True)
# df['rating'] = df['rating'].str.replace(r'[^\D]', '', regex=True)

In [22]:
# filling 0 inplace of Nans, in votes number column (we need this for popular movies)
df['votes']=df.votes.fillna(0)
df['votes'] = df['votes'].astype('int')

In [23]:
# filling 0 inplace of Nans, in votes number column (we need this for popular movies)
df['year'] = pd.to_numeric(df['year'] )
df['year']=df.year.fillna(0)
df['year'] = df['year'].astype('int')

In [24]:
def remove_newline_chars(text):
    """Removes `\n` characters from a text string."""
    text_str = re.sub(r"\n", "", text)
    return text_str

In [25]:
df["genre"] = df["genre"].apply(remove_newline_chars)
df["synopsis"] = df["synopsis"].apply(remove_newline_chars)


In [26]:
df.head()

Unnamed: 0,title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
0,Indiana Jones and the Dial of Destiny,2023,PG-13,"Action, Adventure",154,6.9,Archaeologist Indiana Jones races against time...,James Mangold,Harrison Ford,Phoebe Waller-Bridge,Antonio Banderas,Karen Allen,41029,0
1,The Flash,2023,PG-13,"Action, Adventure, Fantasy",144,7.2,Barry Allen uses his super speed to change the...,Andy Muschietti,Ezra Miller,Michael Keaton,Sasha Calle,Michael Shannon,80814,1
2,Spider-Man: Across the Spider-Verse,2023,PG,"Animation, Action, Adventure",140,8.9,"Miles Morales catapults across the Multiverse,...",Joaquim Dos Santos,Kemp Powers,Justin K. Thompson,Shameik Moore,Hailee Steinfeld,168355,2
3,Extraction II,2023,R,"Action, Thriller",122,7.1,After barely surviving his grievous wounds fro...,Sam Hargrave,Chris Hemsworth,Golshifteh Farahani,Adam Bessa,Tornike Gogrichiani,81123,3
4,Avatar: The Way of Water,2022,PG-13,"Action, Adventure, Fantasy",192,7.6,Jake Sully lives with his newfound family form...,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang,424931,4


In [27]:
df['director'] = df['director'].str.replace(' ', '')
df['actor 1'] = df['actor 1'].str.replace(' ', '')
df['actor 2'] = df['actor 2'].str.replace(' ', '')
df['actor 3'] = df['actor 3'].str.replace(' ', '')
df['actor 4'] = df['actor 4'].str.replace(' ', '')
df['genre']   = df['genre'].str.replace(' ', '')
df['genre']   = df['genre'].str.replace(',', ', ')

In [28]:
df.head()

Unnamed: 0,title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id
0,Indiana Jones and the Dial of Destiny,2023,PG-13,"Action, Adventure",154,6.9,Archaeologist Indiana Jones races against time...,JamesMangold,HarrisonFord,PhoebeWaller-Bridge,AntonioBanderas,KarenAllen,41029,0
1,The Flash,2023,PG-13,"Action, Adventure, Fantasy",144,7.2,Barry Allen uses his super speed to change the...,AndyMuschietti,EzraMiller,MichaelKeaton,SashaCalle,MichaelShannon,80814,1
2,Spider-Man: Across the Spider-Verse,2023,PG,"Animation, Action, Adventure",140,8.9,"Miles Morales catapults across the Multiverse,...",JoaquimDosSantos,KempPowers,JustinK.Thompson,ShameikMoore,HaileeSteinfeld,168355,2
3,Extraction II,2023,R,"Action, Thriller",122,7.1,After barely surviving his grievous wounds fro...,SamHargrave,ChrisHemsworth,GolshiftehFarahani,AdamBessa,TornikeGogrichiani,81123,3
4,Avatar: The Way of Water,2022,PG-13,"Action, Adventure, Fantasy",192,7.6,Jake Sully lives with his newfound family form...,JamesCameron,SamWorthington,ZoeSaldana,SigourneyWeaver,StephenLang,424931,4


## Joining Names and SUrnames

In [3]:
import spacy

def concatenate_names(sentence):
    nlp = spacy.load("en_core_web_lg")
    ner_dict = dict()

    # Extract names from the sentence using NER
    doc = nlp(sentence)
    
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            name = ent.text
            new_name = name.replace(' ','')
            ner_dict[name] = new_name
    
    # Replace names with names without spaces in the sentence
    for i, j in ner_dict.items():
        sentence = sentence.replace(i,j)
        
    return sentence

# Test the function
# sentence = "Ryan Holiday is an author, and Ryan is famous for his books."
sentence = 'my name is dick greyson and this is my girifriend harley quinn'
result = concatenate_names(sentence)
print(result)


my name is dickgreyson and this is my girifriend harleyquinn


In [4]:
all_synopsis = df['synopsis'].tolist()

NameError: name 'df' is not defined

In [52]:
len(all_synopsis)

285335

In [1]:
import concurrent.futures as cf
import time
import string


In [None]:
start_time = time.perf_counter()
print('Strated :',  time.strftime("%H:%M:%S", time.localtime()))

with cf.ProcessPoolExecutor(max_workers=3) as executer:
    dodo = [executer.submit(concatenate_names, i) for i in all_synopsis]
    clean_tags = dodo.result()

print('Took: ',time.perf_counter()-a)
print('Finished :',  time.strftime("%H:%M:%S", time.localtime()))


Strated : 13:35:44


In [100]:

def remove_punctuations(text):
    return text.translate(str.maketrans("", "", string.punctuation))

In [101]:
df['tags'] = df['tags'].apply(remove_punctuations)

In [None]:
df.columns

In [None]:
df.info()

In [None]:
cols_to_combine = ['title', 'year', 'certificate', 'genre', 'time', 'rating', 'synopsis',
       'director', 'actor 1', 'actor 2', 'actor 3', 'actor 4',
       'movies_id']

In [None]:
df['tags'] = df[cols_to_combine].astype(str).apply(' '.join, axis=1)


In [None]:
df.head()

In [None]:
dff = df[['title', 'movies_id', 'tags', 'votes', 'rating']]

In [None]:
dff.head()

In [None]:
dff['tags'] = dff['tags'].str.lower()


In [None]:
# Create a PorterStemmer object
stemmer = PorterStemmer()

# Apply the stemming function to each word in the series
dff['tags'] = dff['tags'].apply(stemmer.stem)


In [33]:
dff['tags'] = dff['tags'].str.split()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff['tags'] = dff['tags'].str.split()


In [34]:
dff.head()

Unnamed: 0,title,movies_id,tags,votes,rating
0,Indiana Jones and the Dial of Destiny,0,"[indiana, jones, and, the, dial, of, destiny, ...",41029,6.9
1,The Flash,1,"[the, flash, 80814, pg-13, action,, adventure,...",80814,7.2
2,Spider-Man: Across the Spider-Verse,2,"[spider-man:, across, the, spider-verse, 16835...",168355,8.9
3,Extraction II,3,"[extraction, ii, 81123, r, action,, thriller, ...",81123,7.1
4,Avatar: The Way of Water,4,"[avatar:, the, way, of, water, 424931, pg-13, ...",424931,7.6


In [35]:
all_words = dff['tags'].values.tolist()

In [36]:
def flatten_nested_list(nested_list):
    return [item for sublist in nested_list for item in (flatten_nested_list(sublist) if isinstance(sublist, list) else [sublist])]


In [37]:
all_wordss = flatten_nested_list(all_words)

In [38]:
len(all_wordss), len(set(all_wordss))

(11069853, 1105181)

In [39]:
from sklearn.feature_extraction.text import  CountVectorizer

In [40]:
cv = CountVectorizer(lowercase=True, stop_words='english', max_features=5000, dtype=np.int16 )

In [41]:
vector = cv.fit_transform(df['tags'])

In [42]:
vector

<285335x5000 sparse matrix of type '<class 'numpy.int16'>'
	with 4124550 stored elements in Compressed Sparse Row format>

In [43]:
vector[2]

<1x5000 sparse matrix of type '<class 'numpy.int16'>'
	with 22 stored elements in Compressed Sparse Row format>

In [44]:
# pickle.dump(movie_vector, open('movie_vector.pkl','wb'))  # creates file of 2.6 gb
pickle.dump(vector, open('vector.pkl','wb'))

In [45]:
def find_similar_movies(x:str, k = 5) -> List[str]:
    movie_id  = dff[dff.title == x].movies_id.values[0]
    movie_idx = dff[dff.title == x].index.values[0]
    # movie_vector  = vector.toarray()
    sim_vec  = cosine_similarity(vector, vector[movie_idx])
    y = sorted(enumerate(sim_vec), key=lambda x: x[1], reverse=True)
    # print(movie_id, movie_idx)
    # print(y[:4])
    recommended_movies_ids = [i[0] for i in y[1:k+1]]
    print(recommended_movies_ids)
    for i in recommended_movies_ids:
        print(df.iloc[i].title)

In [46]:
find_similar_movies(
    'Iron Man',

k = 10)

[298, 291, 1297, 1024, 53293, 3290, 52, 947, 100, 5]
Iron Man 3
Iron Man 2
Spectral
Bloodshot
Centipede!
Wing Commander
Transformers
Gemini Man
The Fifth Element
Transformers: Rise of the Beasts


In [47]:
dff.title[:4]

0    Indiana Jones and the Dial of Destiny
1                                The Flash
2      Spider-Man: Across the Spider-Verse
3                            Extraction II
Name: title, dtype: object

In [48]:
dff.title.sample(50).values.tolist()

['La Secta',
 'If Nothing Changes',
 'The Legend Trip',
 'Dutta Vs. Dutta',
 'Suicide Love',
 'Werner - Volles Rooäää!!!',
 'Road to Hell',
 'Das blinde Ohr der Oper',
 'Just Like Weather',
 'Lulu on the Bridge',
 'Disappearance at Clifton Hill',
 'The Protomen - Live in Nashville',
 'Katana',
 'Luen oi hang sing',
 'Kamay ni Cain',
 'My Crazy Mother',
 'The Leaving',
 'The Battle of Algiers',
 'Private High Musical',
 'The Secret of the Urn',
 'Krishna Mantras',
 'Super Monkey Returns',
 'Devils to Worry',
 'Mission: Love',
 'Otra vuelta de tuerca',
 'Exquisite Sinner',
 'The Story of Hearts',
 "Maskeli Süvari'nin Dönüsü",
 'The Real Us',
 'Na putu za Katangu',
 'A Man of No Importance',
 'Addicts',
 'Taurins Senior',
 'The Private',
 'The Little Crazy Thing',
 'Asene Kunuba Hiyat',
 'Horns',
 'Losing Addison',
 'School for Suicide',
 'Intoxicated by Love',
 'Butterflies',
 'Aline',
 'All Men Are Wicked',
 'DeMonD the movie',
 'Kadaisi Bench Karthi',
 'Eli Regrets',
 'Il ritorno dei P

In [49]:
dff[dff.title=='Iron Man']

Unnamed: 0,title,movies_id,tags,votes,rating
148,Iron Man,148,"[iron, man, 1086969, pg-13, action,, adventure...",1086969,7.9


In [50]:
def find_popular_movies( k = 5) -> List[str]:
    movie_id  = dff[dff.title == x].movies_id.values[0]
    movie_idx = dff[dff.title == x].index.values[0]
    # movie_vector  = vector.toarray()
    sim_vec  = cosine_similarity(vector, vector[movie_idx])
    y = sorted(enumerate(sim_vec), key=lambda x: x[1], reverse=True)
    # print(movie_id, movie_idx)
    # print(y[:4])
    recommended_movies_ids = [i[0] for i in y[1:k+1]]
    print(recommended_movies_ids)
    for i in recommended_movies_ids:
        print(df.iloc[i].title)

In [51]:
df.columns

Index(['title', 'year', 'certificate', 'genre', 'time', 'rating', 'synopsis',
       'director', 'actor 1', 'actor 2', 'actor 3', 'actor 4', 'votes',
       'movies_id', 'tags'],
      dtype='object')

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 285335 entries, 0 to 285340
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   title        285335 non-null  object 
 1   year         285335 non-null  int64  
 2   certificate  72950 non-null   object 
 3   genre        285335 non-null  object 
 4   time         204235 non-null  object 
 5   rating       184308 non-null  float64
 6   synopsis     285335 non-null  object 
 7   director     269259 non-null  object 
 8   actor 1      253715 non-null  object 
 9   actor 2      246316 non-null  object 
 10  actor 3      239797 non-null  object 
 11  actor 4      230067 non-null  object 
 12  votes        285335 non-null  int64  
 13  movies_id    285335 non-null  int64  
 14  tags         285335 non-null  object 
dtypes: float64(1), int64(3), object(11)
memory usage: 34.8+ MB


In [53]:
df.sort_values(['votes','rating' ], ascending=False)

Unnamed: 0,title,year,certificate,genre,time,rating,synopsis,director,actor 1,actor 2,actor 3,actor 4,votes,movies_id,tags
151550,The Shawshank Redemption,2761119,R,Drama,142,9.3,"Over the course of several years, two convicts...",FrankDarabont,TimRobbins,MorganFreeman,BobGunton,WilliamSadler,2761119,151548,The Shawshank Redemption 2761119 R Drama 142 9...
38,The Dark Knight,2733681,PG-13,"Action, Crime, Drama",152,9.0,When the menace known as the Joker wreaks havo...,ChristopherNolan,ChristianBale,HeathLedger,AaronEckhart,MichaelCaine,2733681,38,"The Dark Knight 2733681 PG-13 Action, Crime, D..."
40,Inception,2425939,PG-13,"Action, Adventure, Sci-Fi",148,8.8,A thief who steals corporate secrets through t...,ChristopherNolan,LeonardoDiCaprio,JosephGordon-Levitt,ElliotPage,KenWatanabe,2425939,40,"Inception 2425939 PG-13 Action, Adventure, Sci..."
151564,Fight Club,2198501,R,Drama,139,8.8,An insomniac office worker and a devil-may-car...,DavidFincher,BradPitt,EdwardNorton,MeatLoaf,ZachGrenier,2198501,151562,Fight Club 2198501 R Drama 139 8.8 An insomnia...
151567,Forrest Gump,2147674,PG-13,"Drama, Romance",142,8.8,The history of the United States from the 1950...,RobertZemeckis,TomHanks,RobinWright,GarySinise,SallyField,2147674,151565,"Forrest Gump 2147674 PG-13 Drama, Romance 142 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285336,Imposition,0,,Western,,,A mentally ill killer causes mayhem during a m...,,,,,,0,285330,Imposition 0 nan Western nan nan A mentally il...
285337,Man from the West,0,,Western,,,1912. The Old West is dying when an aging outl...,Jean-ChristopheJeauffre,,,,,0,285331,Man from the West 0 nan Western nan nan 1912. ...
285338,Horizon: An American Saga 4,0,,Western,,,Plot under wraps.,KevinCostner,KevinCostner,AustinArcher,,,0,285332,Horizon: An American Saga 4 0 nan Western nan ...
285339,Horizon: An American Saga 3,0,,Western,,,Plot under wraps.,KevinCostner,KevinCostner,AustinArcher,,,0,285333,Horizon: An American Saga 3 0 nan Western nan ...
