## **Movie Recommander System**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
movie = pd.read_csv("imdb-movies-dataset.csv")
movie.head()

Unnamed: 0,Poster,Title,Year,Certificate,Duration (min),Genre,Rating,Metascore,Director,Cast,Votes,Description,Review Count,Review Title,Review
0,https://m.media-amazon.com/images/M/MV5BYWRkZj...,The Idea of You,2023.0,R,115.0,"Comedy, Drama, Romance",6.4,67.0,Michael Showalter,"Anne Hathaway, Nicholas Galitzine, Ella Rubin,...",28744,"Solène, a 40-year-old single mom, begins an un...",166,Hypocrisy as an idea,"This film, as well as the reaction to it, is a..."
1,https://m.media-amazon.com/images/M/MV5BZGI4NT...,Kingdom of the Planet of the Apes,2023.0,PG-13,145.0,"Action, Adventure, Sci-Fi",7.3,66.0,Wes Ball,"Owen Teague, Freya Allan, Kevin Durand, Peter ...",22248,"Many years after the reign of Caesar, a young ...",183,A phenomenal start to another trilogy!,"I'm a big fan of all the planet of the apes, a..."
2,https://m.media-amazon.com/images/M/MV5BZjIyOT...,Unfrosted,2023.0,PG-13,97.0,"Biography, Comedy, History",5.5,42.0,Jerry Seinfeld,"Isaac Bae, Jerry Seinfeld, Chris Rickett, Rach...",18401,"In 1963 Michigan, business rivals Kellogg's an...",333,not funny,Pretty much the worst criticism you can lay on...
3,https://m.media-amazon.com/images/M/MV5BMjA5Zj...,The Fall Guy,2023.0,PG-13,126.0,"Action, Comedy, Drama",7.3,73.0,David Leitch,"Ryan Gosling, Emily Blunt, Aaron Taylor-Johnso...",38953,A down-and-out stuntman must find the missing ...,384,Everything you needed and more!,Just got out of the Austin premier at SXSW and...
4,https://m.media-amazon.com/images/M/MV5BNTk1MT...,Challengers,2023.0,R,131.0,"Drama, Romance, Sport",7.7,82.0,Luca Guadagnino,"Zendaya, Mike Faist, Josh O'Connor, Darnell Ap...",32517,"Tashi, a former tennis prodigy turned coach, t...",194,"Watch ""Match Point"" instead",This is a tough one. I liked the concept and t...


## **Data preprocessing**

In [4]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Poster          10000 non-null  object 
 1   Title           10000 non-null  object 
 2   Year            9850 non-null   float64
 3   Certificate     7370 non-null   object 
 4   Duration (min)  9664 non-null   float64
 5   Genre           9993 non-null   object 
 6   Rating          9596 non-null   float64
 7   Metascore       7555 non-null   float64
 8   Director        9995 non-null   object 
 9   Cast            9961 non-null   object 
 10  Votes           9596 non-null   object 
 11  Description     10000 non-null  object 
 12  Review Count    9999 non-null   object 
 13  Review Title    9483 non-null   object 
 14  Review          9484 non-null   object 
dtypes: float64(4), object(11)
memory usage: 1.1+ MB


In [5]:
movie_new = movie.drop(columns=["Certificate","Metascore","Review Count"])

In [6]:
movie_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Poster          10000 non-null  object 
 1   Title           10000 non-null  object 
 2   Year            9850 non-null   float64
 3   Duration (min)  9664 non-null   float64
 4   Genre           9993 non-null   object 
 5   Rating          9596 non-null   float64
 6   Director        9995 non-null   object 
 7   Cast            9961 non-null   object 
 8   Votes           9596 non-null   object 
 9   Description     10000 non-null  object 
 10  Review Title    9483 non-null   object 
 11  Review          9484 non-null   object 
dtypes: float64(3), object(9)
memory usage: 937.6+ KB


In [7]:
movie_new.dropna(inplace=True)

In [8]:
movie_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9336 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Poster          9336 non-null   object 
 1   Title           9336 non-null   object 
 2   Year            9336 non-null   float64
 3   Duration (min)  9336 non-null   float64
 4   Genre           9336 non-null   object 
 5   Rating          9336 non-null   float64
 6   Director        9336 non-null   object 
 7   Cast            9336 non-null   object 
 8   Votes           9336 non-null   object 
 9   Description     9336 non-null   object 
 10  Review Title    9336 non-null   object 
 11  Review          9336 non-null   object 
dtypes: float64(3), object(9)
memory usage: 948.2+ KB


In [9]:
def convert_time(duration):
    h = str(int(duration//60))
    if len(str(h)) == 1:
        h = "0"+str(h)
    m = str(int(duration % 60))
    if len(str(m)) == 1:
        m = "0" + str(m)
    s = str(duration).split(".")[-1]

    return h+":"+m+":"+s


print(convert_time(143.20))
    

02:23:2


In [10]:
movie_new["Duration"] = movie_new["Duration (min)"].apply(convert_time)

In [11]:
def convert_year(data):
    return str(data).split(".")[0]

convert_year(2023.0)

'2023'

In [12]:
movie_new["Year"] = movie_new["Year"].apply(convert_year)

In [13]:
movie_new["info"] = movie_new["Genre"]+movie_new["Cast"]+[" "]+ movie_new["Year"].apply(str)+[" "]+[" "]+ movie_new["Rating"].apply(str)+[" "]+movie_new["Director"]+movie_new["Description"]

In [14]:
movie_new.head()

Unnamed: 0,Poster,Title,Year,Duration (min),Genre,Rating,Director,Cast,Votes,Description,Review Title,Review,Duration,info
0,https://m.media-amazon.com/images/M/MV5BYWRkZj...,The Idea of You,2023,115.0,"Comedy, Drama, Romance",6.4,Michael Showalter,"Anne Hathaway, Nicholas Galitzine, Ella Rubin,...",28744,"Solène, a 40-year-old single mom, begins an un...",Hypocrisy as an idea,"This film, as well as the reaction to it, is a...",01:55:0,"Comedy, Drama, RomanceAnne Hathaway, Nicholas ..."
1,https://m.media-amazon.com/images/M/MV5BZGI4NT...,Kingdom of the Planet of the Apes,2023,145.0,"Action, Adventure, Sci-Fi",7.3,Wes Ball,"Owen Teague, Freya Allan, Kevin Durand, Peter ...",22248,"Many years after the reign of Caesar, a young ...",A phenomenal start to another trilogy!,"I'm a big fan of all the planet of the apes, a...",02:25:0,"Action, Adventure, Sci-FiOwen Teague, Freya Al..."
2,https://m.media-amazon.com/images/M/MV5BZjIyOT...,Unfrosted,2023,97.0,"Biography, Comedy, History",5.5,Jerry Seinfeld,"Isaac Bae, Jerry Seinfeld, Chris Rickett, Rach...",18401,"In 1963 Michigan, business rivals Kellogg's an...",not funny,Pretty much the worst criticism you can lay on...,01:37:0,"Biography, Comedy, HistoryIsaac Bae, Jerry Sei..."
3,https://m.media-amazon.com/images/M/MV5BMjA5Zj...,The Fall Guy,2023,126.0,"Action, Comedy, Drama",7.3,David Leitch,"Ryan Gosling, Emily Blunt, Aaron Taylor-Johnso...",38953,A down-and-out stuntman must find the missing ...,Everything you needed and more!,Just got out of the Austin premier at SXSW and...,02:06:0,"Action, Comedy, DramaRyan Gosling, Emily Blunt..."
4,https://m.media-amazon.com/images/M/MV5BNTk1MT...,Challengers,2023,131.0,"Drama, Romance, Sport",7.7,Luca Guadagnino,"Zendaya, Mike Faist, Josh O'Connor, Darnell Ap...",32517,"Tashi, a former tennis prodigy turned coach, t...","Watch ""Match Point"" instead",This is a tough one. I liked the concept and t...,02:11:0,"Drama, Romance, SportZendaya, Mike Faist, Josh..."


In [15]:
movie_new.drop(columns=["Cast","Description","Genre","Director","Review Title","Review","Duration (min)"],inplace=True)

In [16]:
movie_new

Unnamed: 0,Poster,Title,Year,Rating,Votes,Duration,info
0,https://m.media-amazon.com/images/M/MV5BYWRkZj...,The Idea of You,2023,6.4,28744,01:55:0,"Comedy, Drama, RomanceAnne Hathaway, Nicholas ..."
1,https://m.media-amazon.com/images/M/MV5BZGI4NT...,Kingdom of the Planet of the Apes,2023,7.3,22248,02:25:0,"Action, Adventure, Sci-FiOwen Teague, Freya Al..."
2,https://m.media-amazon.com/images/M/MV5BZjIyOT...,Unfrosted,2023,5.5,18401,01:37:0,"Biography, Comedy, HistoryIsaac Bae, Jerry Sei..."
3,https://m.media-amazon.com/images/M/MV5BMjA5Zj...,The Fall Guy,2023,7.3,38953,02:06:0,"Action, Comedy, DramaRyan Gosling, Emily Blunt..."
4,https://m.media-amazon.com/images/M/MV5BNTk1MT...,Challengers,2023,7.7,32517,02:11:0,"Drama, Romance, SportZendaya, Mike Faist, Josh..."
...,...,...,...,...,...,...,...
9995,https://m.media-amazon.com/images/M/MV5BMzg5MW...,The Greatest Show on Earth,2020,6.5,16078,02:32:0,"Drama, Family, RomanceJames Stewart, Charlton ..."
9996,https://m.media-amazon.com/images/M/MV5BYzA0ZG...,Berserk: Ougon Jidai-hen I - Haou no Tamago,2020,7.5,14300,01:16:0,"Animation, Action, AdventureHiroaki Iwanaga, C..."
9997,https://m.media-amazon.com/images/M/MV5BM2U1Mj...,Is-slottet,2020,6.5,740,01:18:0,"Mystery, DramaLine Storesund, Hilde Nyeggen Ma..."
9998,https://m.media-amazon.com/images/M/MV5BMTAwOD...,Loving Pablo,2020,6.4,22447,02:03:0,"Biography, Crime, DramaJavier Bardem, Penélope..."


In [17]:
movie_new.drop_duplicates(inplace=True)

In [18]:
movie_new.reset_index(inplace=True)

In [19]:
movie_new["info"][0]

'Comedy, Drama, RomanceAnne Hathaway, Nicholas Galitzine, Ella Rubin, Annie Mumolo 2023  6.4 Michael ShowalterSolène, a 40-year-old single mom, begins an unexpected romance with 24-year-old Hayes Campbell, the lead singer of August Moon, the hottest boy band on the planet.'

In [20]:
# def define_shape(data):
#     new_data = data.split()
#     if len(new_data)<50:
#         return None
#     return new_data[:50]

In [21]:
# movie_new["info"] = movie_new["info"].apply(define_shape)

In [22]:
movie_new.dropna(inplace=True)

In [23]:

movie_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9336 entries, 0 to 9335
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   index     9336 non-null   int64  
 1   Poster    9336 non-null   object 
 2   Title     9336 non-null   object 
 3   Year      9336 non-null   object 
 4   Rating    9336 non-null   float64
 5   Votes     9336 non-null   object 
 6   Duration  9336 non-null   object 
 7   info      9336 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 583.6+ KB


In [24]:
def votes_convert(data):
    ans = ""
    for i in data:
        if i in "1234567890":
            ans+=i
    return float(ans)
    

In [25]:
movie_new["Votes"] = movie_new["Votes"].apply(votes_convert)

In [26]:
movie_new.describe()

Unnamed: 0,index,Rating,Votes
count,9336.0,9336.0,9336.0
mean,4912.202978,6.442063,98691.4
std,2858.832153,1.043144,183940.5
min,0.0,1.3,25.0
25%,2437.75,5.8,12242.75
50%,4881.5,6.5,38476.0
75%,7344.25,7.2,103582.5
max,9999.0,9.7,2894940.0


In [27]:
# Select top 100  movie with highest rating and votes movie
top_100 = movie_new.sort_values(by=["Rating","Votes"],ascending=False).iloc[:100,:]

In [28]:
# selection top 100 for viewing home page
top_100

Unnamed: 0,index,Poster,Title,Year,Rating,Votes,Duration,info
6112,6401,https://m.media-amazon.com/images/M/MV5BOGMyZT...,August,1985,9.7,2299.0,00:52:0,DocumentaryZhehan Zhang 1985 9.7 Zhehan Zhang...
64,71,https://m.media-amazon.com/images/M/MV5BNDE3OD...,The Shawshank Redemption,2023,9.3,2894940.0,02:22:0,"DramaTim Robbins, Morgan Freeman, Bob Gunton, ..."
49,55,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,2023,9.2,2017131.0,02:55:0,"Crime, DramaMarlon Brando, Al Pacino, James Ca..."
97,106,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2002,9.0,2876586.0,02:32:0,"Action, Crime, DramaChristian Bale, Heath Ledg..."
106,116,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,The Lord of the Rings: The Return of the King,2002,9.0,1982971.0,03:21:0,"Action, Adventure, DramaElijah Wood, Viggo Mor..."
...,...,...,...,...,...,...,...,...
633,674,https://m.media-amazon.com/images/M/MV5BMjMwND...,Spider-Man: Into the Spider-Verse,2012,8.4,673811.0,01:57:0,"Animation, Action, AdventurePeter Ramsey, Rodn..."
669,712,https://m.media-amazon.com/images/M/MV5BYjQ5Nj...,Coco,1996,8.4,592504.0,01:45:0,"Animation, Adventure, DramaAdrian Molina, Anth..."
1006,1059,https://m.media-amazon.com/images/M/MV5BMWMxYj...,Dr. Strangelove or: How I Learned to Stop Worr...,2018,8.4,518982.0,01:35:0,"Comedy, WarPeter Sellers, George C. Scott, Ste..."
2290,2393,https://m.media-amazon.com/images/M/MV5BNTkyOG...,3 Idiots,2013,8.4,435460.0,02:50:0,"Comedy, DramaAamir Khan, Madhavan, Mona Singh,..."


In [29]:
movie_new.drop(columns="Votes",inplace=True)
top_100.drop(columns="Votes",inplace=True)

In [30]:
movie_new.head()

Unnamed: 0,index,Poster,Title,Year,Rating,Duration,info
0,0,https://m.media-amazon.com/images/M/MV5BYWRkZj...,The Idea of You,2023,6.4,01:55:0,"Comedy, Drama, RomanceAnne Hathaway, Nicholas ..."
1,1,https://m.media-amazon.com/images/M/MV5BZGI4NT...,Kingdom of the Planet of the Apes,2023,7.3,02:25:0,"Action, Adventure, Sci-FiOwen Teague, Freya Al..."
2,2,https://m.media-amazon.com/images/M/MV5BZjIyOT...,Unfrosted,2023,5.5,01:37:0,"Biography, Comedy, HistoryIsaac Bae, Jerry Sei..."
3,3,https://m.media-amazon.com/images/M/MV5BMjA5Zj...,The Fall Guy,2023,7.3,02:06:0,"Action, Comedy, DramaRyan Gosling, Emily Blunt..."
4,4,https://m.media-amazon.com/images/M/MV5BNTk1MT...,Challengers,2023,7.7,02:11:0,"Drama, Romance, SportZendaya, Mike Faist, Josh..."


In [31]:
top_100

Unnamed: 0,index,Poster,Title,Year,Rating,Duration,info
6112,6401,https://m.media-amazon.com/images/M/MV5BOGMyZT...,August,1985,9.7,00:52:0,DocumentaryZhehan Zhang 1985 9.7 Zhehan Zhang...
64,71,https://m.media-amazon.com/images/M/MV5BNDE3OD...,The Shawshank Redemption,2023,9.3,02:22:0,"DramaTim Robbins, Morgan Freeman, Bob Gunton, ..."
49,55,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,2023,9.2,02:55:0,"Crime, DramaMarlon Brando, Al Pacino, James Ca..."
97,106,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2002,9.0,02:32:0,"Action, Crime, DramaChristian Bale, Heath Ledg..."
106,116,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,The Lord of the Rings: The Return of the King,2002,9.0,03:21:0,"Action, Adventure, DramaElijah Wood, Viggo Mor..."
...,...,...,...,...,...,...,...
633,674,https://m.media-amazon.com/images/M/MV5BMjMwND...,Spider-Man: Into the Spider-Verse,2012,8.4,01:57:0,"Animation, Action, AdventurePeter Ramsey, Rodn..."
669,712,https://m.media-amazon.com/images/M/MV5BYjQ5Nj...,Coco,1996,8.4,01:45:0,"Animation, Adventure, DramaAdrian Molina, Anth..."
1006,1059,https://m.media-amazon.com/images/M/MV5BMWMxYj...,Dr. Strangelove or: How I Learned to Stop Worr...,2018,8.4,01:35:0,"Comedy, WarPeter Sellers, George C. Scott, Ste..."
2290,2393,https://m.media-amazon.com/images/M/MV5BNTkyOG...,3 Idiots,2013,8.4,02:50:0,"Comedy, DramaAamir Khan, Madhavan, Mona Singh,..."


#### **Stem the data**

In [32]:
# here we need stem the words
import nltk

# function for steming the word
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [33]:
def stem(data):
    new = []
    for i in data.split():
        new.append(stemmer.stem(i))
    return " ".join(new)

In [34]:
stem(movie_new["info"][0])

'comedy, drama, romanceann hathaway, nichola galitzine, ella rubin, anni mumolo 2023 6.4 michael showaltersolène, a 40-year-old singl mom, begin an unexpect romanc with 24-year-old hay campbell, the lead singer of august moon, the hottest boy band on the planet.'

In [35]:
stem("Dancing created")

'danc creat'

In [36]:
movie_new["info"] = movie_new["info"].apply(stem)

## **Model Building**

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words="english")

In [38]:
vector = cv.fit_transform(movie_new["info"]).toarray()

In [39]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [41]:
vector[0]

array([0, 0, 0, ..., 0, 0, 0])

In [42]:
cv.get_feature_names_out()

array(['000', '10', '11', ..., 'zooey', 'zoë', 'zucker'], dtype=object)

In [97]:
# here we need stem the words

In [43]:
vector.shape

(9336, 5000)

In [44]:
stem("zombie zombi")

'zombi zombi'

In [45]:
# new calculate cosing similarity 
from sklearn.metrics.pairwise import cosine_similarity
similerity = cosine_similarity(vector)

In [46]:
similerity

array([[1.        , 0.14089399, 0.08466675, ..., 0.22718473, 0.        ,
        0.04233338],
       [0.14089399, 1.        , 0.04622502, ..., 0.06201737, 0.04756515,
        0.04622502],
       [0.08466675, 0.04622502, 1.        , ..., 0.        , 0.0571662 ,
        0.        ],
       ...,
       [0.22718473, 0.06201737, 0.        , ..., 1.        , 0.153393  ,
        0.1490712 ],
       [0.        , 0.04756515, 0.0571662 , ..., 0.153393  , 1.        ,
        0.11433239],
       [0.04233338, 0.04622502, 0.        , ..., 0.1490712 , 0.11433239,
        1.        ]])

In [47]:
# now function for fatch top five movie
def fatch_movie(movie):
    ans = []
    ind = movie_new[movie_new["Title"] == movie].index[0]
    distance = vector[ind]
    top_5_index = sorted(list(enumerate(distance)),key= (lambda x:x[1]),reverse=True)[1:6]

    for i in top_5_index:
        movies = (movie_new.iloc[i[0]].Title)
        ans.append(movies)
        print(movies)
    return ans


In [48]:
fatch_movie("August")

Swamp Thing
The Godfather
Killers of the Flower Moon
Bottoms
Talk to Me


['Swamp Thing',
 'The Godfather',
 'Killers of the Flower Moon',
 'Bottoms',
 'Talk to Me']

In [49]:
import pickle
pickle.dump(similerity, open("similarity.pkl","wb"))
pickle.dump(movie_new,open("movies.pkl","wb"))
pickle.dump(vector,open("vector.pkl","wb"))
pickle.dump(top_100,open("top_100.pkl","wb"))


In [50]:
movie_new.shape

(9336, 7)

In [51]:
movie_new.iloc[1265]

index                                                    1328
Poster      https://m.media-amazon.com/images/M/MV5BNzM1OD...
Title                            The School for Good and Evil
Year                                                     2017
Rating                                                    5.9
Duration                                              02:27:0
info        action, comedy, dramakit young, sophia ann car...
Name: 1265, dtype: object