# There are three types of recommender systems

*   Content-based
*   Collaborative filtering
*   Hybrid
---
Our model will be mostly content-based.

In [18]:
import numpy as np
import pandas as pd
import difflib
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Read the raw movie metadata into a DataFrame, then preview five random rows.
# NOTE(review): the absolute '/content/...' path looks environment-specific —
# confirm it exists before running this notebook elsewhere.
movies_csv = '/content/movies.csv'
df = pd.read_csv(movies_csv)
df.sample(n=5)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
3061,3061,10000000,Drama,,404,brother brother relationship mississippi wisco...,en,The Straight Story,"""The Straight Story"" chronicles a trip made by...",15.280261,...,112.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,The Straight Story,7.7,306,Richard Farnsworth Sissy Spacek Jane Galloway ...,"[{'name': 'David Lynch', 'gender': 2, 'departm...",David Lynch
436,436,80000000,Comedy,http://www.sonypictures.com/movies/grownups2,109418,,en,Grown Ups 2,The all-star comedy cast from Grown Ups return...,45.589568,...,100.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Just because they're a little older doesn't me...,Grown Ups 2,5.8,1155,Adam Sandler Kevin James Chris Rock David Spad...,"[{'name': 'Rupert Gregson-Williams', 'gender':...",Dennis Dugan
2243,2243,20000000,Drama Music,http://www.imnotthere-movie.com/,3902,rock and roll music style success john f. kenn...,en,I'm Not There.,Six actors portray six personas of music legen...,13.032308,...,135.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,All I Can Do Is Be Me Whoever That Is,I'm Not There.,6.6,195,Cate Blanchett Heath Ledger Julianne Moore Ben...,"[{'name': 'Jay Rabinowitz', 'gender': 2, 'depa...",Todd Haynes
1130,1130,42000000,Comedy,,18480,gay fame mockumentary hollywood lgbt,en,Brüno,Flamboyantly gay Austrian television reporter ...,16.122168,...,83.0,"[{""iso_639_1"": ""cs"", ""name"": ""\u010cesk\u00fd""...",Released,Borat was so 2006,Brüno,5.4,518,Sacha Baron Cohen Richard Bey Ron Paul Gustaf ...,"[{'name': 'Sacha Baron Cohen', 'gender': 2, 'd...",Larry Charles
2677,2677,0,Comedy Drama Romance,http://www.gooddeedsmovie.com/index.html,62008,african american single mother fianc\u00e9 fia...,en,Good Deeds,Businessman Wesley Deeds is jolted out of his ...,4.448494,...,111.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Wesley Deeds is About to Discover the Person H...,Good Deeds,6.2,45,Tyler Perry Phylicia Rash\u0101d Thandie Newto...,"[{'name': 'Kim Coleman', 'gender': 1, 'departm...",Tyler Perry


In [20]:
# Dimensions of the freshly loaded frame as (rows, columns).
df.shape

(4803, 24)

In [21]:
# List every column name available in the loaded frame.
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [22]:
# Narrow the frame to the columns the rest of the notebook works with:
# the text fields, the id/title pair, and the two vote statistics.
selected_columns = [
    'genres', 'id', 'keywords', 'overview', 'title',
    'vote_average', 'vote_count', 'cast', 'director',
]
df = df[selected_columns]

In [23]:
# Spot-check two random rows of the trimmed frame.
# No random_state is passed, so the rows shown change between runs.
df.sample(2)

Unnamed: 0,genres,id,keywords,overview,title,vote_average,vote_count,cast,director
3591,Action Crime Thriller,21610,,An international terrorist has New York in a g...,Nighthawks,6.4,86,Sylvester Stallone Rutger Hauer Billy Dee Will...,Bruce Malmuth
2281,Comedy,11247,jealousy cinderella work step mother high school,"Sam Montgomery is a tomboyish, unpopular girl ...",A Cinderella Story,6.1,713,Hilary Duff Jennifer Coolidge Chad Michael Mur...,Mark Rosman


In [24]:
# Dimensions of the frame after the column selection, as (rows, columns).
df.shape

(4803, 9)

In [25]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0,0
genres,28
id,0
keywords,412
overview,3
title,0
vote_average,0
vote_count,0
cast,43
director,30


# Joining our main content

In [26]:
# Text columns that feed the combined content string. Missing values in them
# are replaced with empty strings so they add no text when the fields are joined.
combined_lst = ['genres','keywords','overview','cast','director']
df[combined_lst] = df[combined_lst].fillna('')
def join_content(row):
    """Concatenate a movie's text fields into one space-separated string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the keys
            'genres', 'keywords', 'overview', 'cast' and 'director'.

    Returns:
        The five field values formatted as text and joined with single spaces,
        in the order genres, keywords, overview, cast, director. Empty fields
        are kept, so consecutive or leading spaces can appear.
    """
    fields = ('genres', 'keywords', 'overview', 'cast', 'director')
    return " ".join(f"{row[name]}" for name in fields)
# Build one text string per movie by calling join_content on every row (axis=1).
df['combined_content'] = df.apply(join_content, axis=1)

In [27]:
# Re-count missing values per column after the empty-string fill.
df.isna().sum()

Unnamed: 0,0
genres,0
id,0
keywords,0
overview,0
title,0
vote_average,0
vote_count,0
cast,0
director,0
combined_content,0


In [28]:
# Display the combined text column.
df['combined_content']

Unnamed: 0,combined_content
0,Action Adventure Fantasy Science Fiction cultu...
1,Adventure Fantasy Action ocean drug abuse exot...
2,Action Adventure Crime spy based on novel secr...
3,Action Crime Drama Thriller dc comics crime fi...
4,Action Adventure Science Fiction based on nove...
...,...
4798,Action Crime Thriller united states\u2013mexic...
4799,Comedy Romance A newlywed couple's honeymoon ...
4800,Comedy Drama Romance TV Movie date love at fir...
4801,When ambitious New York attorney Sam is sent...


# Handling Scoring

In [29]:
# Inputs to the weighted rating:
#   m - 70th percentile of vote_count; in the formula it is the weight given to C
#   C - mean of vote_average over every movie in the frame
# Nothing is filtered out here: movies with few votes are pulled toward C
# rather than dropped.
m = df['vote_count'].quantile(0.70)
C = df['vote_average'].mean()

def weighted_rating(x, m=m, C=C):
    """Blend a movie's own average rating with the global mean rating.

    Implements the IMDb weighted rating formula
    ``(v / (v + m)) * R + (m / (v + m)) * C``.

    Args:
        x: Row-like object providing 'vote_count' (v) and 'vote_average' (R).
        m: Weight given to the global mean. The default is the module-level
            ``m`` as it was when this function was defined.
        C: Global mean rating. The default is the module-level ``C`` as it
            was when this function was defined.

    Returns:
        The weighted rating for the row.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_weight = votes / (votes + m)
    prior_weight = m / (votes + m)
    return own_weight * rating + prior_weight * C

# Score every movie row by row.
df['score'] = df.apply(weighted_rating, axis=1)

In [30]:
# Display the computed weighted scores.
df['score']

Unnamed: 0,score
0,7.148013
1,6.807627
2,6.276075
3,7.509565
4,6.098319
...,...
4798,6.239746
4799,6.090532
4800,6.101451
4801,6.087503


In [31]:
# Renumber the rows 0..n-1 and discard the old index (drop=True).
# Rebinding `df` instead of using inplace=True avoids mutating the frame
# that earlier cells already displayed and keeps the cell chainable.
df = df.reset_index(drop=True)

In [32]:
# Keep only the columns that are exported: identifier, title, weighted score
# and the combined text.
export_columns = ['id', 'title', 'score', 'combined_content']
movie_data = df[export_columns]

In [33]:
# Dimensions of the export table as (rows, columns).
movie_data.shape

(4803, 4)

In [34]:
# Display the export table for a final visual check.
movie_data

Unnamed: 0,id,title,score,combined_content
0,19995,Avatar,7.148013,Action Adventure Fantasy Science Fiction cultu...
1,285,Pirates of the Caribbean: At World's End,6.807627,Adventure Fantasy Action ocean drug abuse exot...
2,206647,Spectre,6.276075,Action Adventure Crime spy based on novel secr...
3,49026,The Dark Knight Rises,7.509565,Action Crime Drama Thriller dc comics crime fi...
4,49529,John Carter,6.098319,Action Adventure Science Fiction based on nove...
...,...,...,...,...
4798,9367,El Mariachi,6.239746,Action Crime Thriller united states\u2013mexic...
4799,72766,Newlyweds,6.090532,Comedy Romance A newlywed couple's honeymoon ...
4800,231617,"Signed, Sealed, Delivered",6.101451,Comedy Drama Romance TV Movie date love at fir...
4801,126186,Shanghai Calling,6.087503,When ambitious New York attorney Sam is sent...


In [35]:
# Write the export table to 'processed.csv' (a relative path) and leave out
# the row index (index=False).
movie_data.to_csv('processed.csv',index=False)