# **Movie Recommendation System**

## **importing neccessary modules**

In [79]:
import pandas as pd
import numpy as np
import re
from nltk.stem.porter import PorterStemmer

## **importing dataset**

In [80]:
# importing dataset
df=pd.read_csv('data.csv')

In [81]:
# checking shape of data
df.shape

(1000, 16)

In [82]:
# taking 5 samples
df.sample(5)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
715,https://m.media-amazon.com/images/M/MV5BMmVkOT...,Bringing Up Baby,1938,Passed,102 min,"Comedy, Family, Romance",7.8,While trying to secure a $1 million donation f...,91.0,Howard Hawks,Katharine Hepburn,Cary Grant,Charles Ruggles,Walter Catlett,55163,
649,https://m.media-amazon.com/images/M/MV5BODg0Yj...,The Insider,1999,UA,157 min,"Biography, Drama, Thriller",7.8,A research chemist comes under personal and pr...,84.0,Michael Mann,Russell Crowe,Al Pacino,Christopher Plummer,Diane Venora,159886,28965197.0
569,https://m.media-amazon.com/images/M/MV5BMTlkMm...,The Gentlemen,2019,A,113 min,"Action, Comedy, Crime",7.8,An American expat tries to sell off his highly...,51.0,Guy Ritchie,Matthew McConaughey,Charlie Hunnam,Michelle Dockery,Jeremy Strong,237392,
641,https://m.media-amazon.com/images/M/MV5BMTI1ND...,Gongdong gyeongbi guyeok JSA,2000,,110 min,"Action, Drama, Thriller",7.8,After a shooting incident at the North/South K...,58.0,Chan-wook Park,Lee Yeong-ae,Lee Byung-Hun,Kang-ho Song,Kim Tae-Woo,26518,
441,https://m.media-amazon.com/images/M/MV5BMDE5Zj...,The Killing,1956,Approved,84 min,"Crime, Drama, Film-Noir",8.0,Crook Johnny Clay assembles a five man team to...,91.0,Stanley Kubrick,Sterling Hayden,Coleen Gray,Vince Edwards,Jay C. Flippen,81702,


In [83]:
# checking null values
df.isna().sum()

Poster_Link        0
Series_Title       0
Released_Year      0
Certificate      101
Runtime            0
Genre              0
IMDB_Rating        0
Overview           0
Meta_score       157
Director           0
Star1              0
Star2              0
Star3              0
Star4              0
No_of_Votes        0
Gross            169
dtype: int64

In [85]:
# feature selection
# selected features Poster_Link , Series_Title , Released_Year , Genre  , Overview , Star1 , Star2 , Star3 , Star4 , Director
df = df[['Poster_Link', 'Series_Title', 'Released_Year', 'Genre', 'Overview', 'Star1', 'Star2', 'Star3', 'Star4', 'Director']]

In [84]:
# checking for duplicates rows
df.duplicated().sum() # no duplicate rows

0

## **Data preprocessing**

In [86]:
#  Combines features into a single tag column.
def combine_features(data):
  data['Genre'] = data['Genre'].str.replace(',', '')
  data['tag'] = data['Overview'] + ' ' + data['Star1'] + ' ' + data['Star2'] + ' ' + data['Star3'] + ' ' + data['Star4'] + ' ' + data['Director'] + ' ' + data['Released_Year']
  return data

In [87]:
# applying above method
df = combine_features(df)
df = df[["Poster_Link","Series_Title","tag"]]

In [88]:
df.sample(5)

Unnamed: 0,Poster_Link,Series_Title,tag
138,https://m.media-amazon.com/images/M/MV5BMTgwNz...,Mandariinid,"In 1992, war rages in Abkhazia, a breakaway re..."
856,https://m.media-amazon.com/images/M/MV5BM2Y1ZT...,The Dirty Dozen,"During World War II, a rebellious U.S. Army Ma..."
649,https://m.media-amazon.com/images/M/MV5BODg0Yj...,The Insider,A research chemist comes under personal and pr...
634,https://m.media-amazon.com/images/M/MV5BOWE2MD...,Serenity,The crew of the ship Serenity try to evade an ...
810,https://m.media-amazon.com/images/M/MV5BMTQzMz...,Shine,"Pianist David Helfgott, driven by his father a..."


In [89]:
# downloading stopwords from nltk
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [90]:
# printing stopwords
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [91]:
# stemming the words
port_stem = PorterStemmer()
def stemming(text):
  stem_text=re.sub('[^a-zA-Z0-9 ]',' ',text).lower().split()
  stem_text=[port_stem.stem(word) for word in stem_text if not word in stopwords.words('english')]
  stem_text = ' '.join(stem_text)
  return stem_text

In [92]:
# apply method
df['tag']=df['tag'].apply(stemming)

In [73]:
df[:3]['tag']

0    1000
1    1000
2    1000
Name: tag, dtype: int64

In [100]:
# vectorization of text
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)
vectorized_data = vectorizer.fit_transform(df['tag']).toarray()

(1000, 1000)


In [97]:
vectorizer.vocabulary_

{'two': 930,
 'imprison': 456,
 'men': 612,
 'bond': 154,
 'year': 995,
 'find': 355,
 'redempt': 752,
 'act': 69,
 'tim': 904,
 'morgan': 630,
 'freeman': 375,
 'bob': 151,
 'william': 974,
 'frank': 371,
 '1994': 37,
 'crime': 252,
 'age': 74,
 'empir': 319,
 'son': 842,
 'al': 80,
 'pacino': 679,
 'jame': 473,
 'dian': 293,
 'keaton': 517,
 'franci': 370,
 'ford': 361,
 'coppola': 244,
 '1972': 18,
 'known': 533,
 'peopl': 694,
 'must': 642,
 'one': 672,
 'physic': 701,
 'fight': 350,
 'christian': 216,
 'bale': 126,
 'michael': 614,
 'cain': 180,
 'christoph': 218,
 'nolan': 659,
 '2008': 51,
 'earli': 307,
 'life': 564,
 'career': 189,
 'new': 653,
 'york': 997,
 'citi': 219,
 'famili': 342,
 'robert': 775,
 'de': 271,
 'niro': 658,
 '1974': 20,
 'attempt': 119,
 'prevent': 722,
 'forc': 360,
 'henri': 426,
 'fonda': 359,
 'lee': 553,
 'martin': 597,
 'john': 496,
 'sidney': 829,
 '1957': 7,
 'lead': 548,
 'world': 989,
 'armi': 109,
 'sam': 796,
 'ring': 770,
 'wood': 985,
 'mort

## **Model training**

In [113]:
# calculating cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(vectorized_data)

In [115]:
class MovieRecommender:
    def __init__(self, cosine_matrix, df):
        self.cosine_matrix = cosine_matrix
        self.df = df

    def predict_movies(self, series_title):
        try:
            index = self.df[self.df['Series_Title'] == series_title].index[0]
            movies = sorted(list(enumerate(self.cosine_matrix[index])), reverse=True, key=lambda x: x[1])[:5]

            for item in movies:
                print(self.df.iloc[item[0]].Series_Title)

        except IndexError:
            print(f"Movie not found with the title: {series_title}")

In [123]:
model = MovieRecommender(cosine_matrix,df)

In [122]:
model.predict_movies("The Lord of the Rings: The Return of the King")

The Lord of the Rings: The Return of the King
The Lord of the Rings: The Two Towers
The Lord of the Rings: The Fellowship of the Ring
The Hobbit: The Desolation of Smaug
The Hobbit: An Unexpected Journey


## **exporting model**

In [124]:
import pickle
with open('model.pkl', 'wb') as file:
  pickle.dump(model, file)