In [10]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.model_selection import train_test_split

In [11]:
# Load the movie table, discard rows with any missing value, renumber the rows
# and drop the leftover 'Unnamed: 0' column.
movie_db = (
    pd.read_csv('./final_data.csv')
    .dropna(how='any')
    .reset_index(drop=True)
    .drop(['Unnamed: 0'], axis=1)
)


def strip(feature):
    """Trim surrounding whitespace from every value of ``movie_db[feature]``.

    The column of the module-level ``movie_db`` frame is overwritten with the
    stripped values.
    """
    movie_db[feature] = movie_db[feature].apply(lambda value: value.strip())


def _join_lowercase(frame, columns, sep=", "):
    """Return the lower-cased ``columns`` of ``frame`` joined row-wise with ``sep``."""
    lowered = [frame[name].str.lower() for name in columns]
    joined = lowered[0]
    for piece in lowered[1:]:
        joined = joined + sep + piece
    return joined


strip_features = ['director_name', 'actor_1_name', 'actor_2_name', 'actor_3_name']
for feature_name in strip_features:
    strip(feature_name)

# 'comb' is the text later fed to the TF-IDF vectorizer:
# "actor 1, actor 2, actor 3, director, genres", all lower-cased.
movie_db['comb'] = _join_lowercase(
    movie_db,
    ['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name', 'genres'],
)
movie_db

Unnamed: 0,id,title,vote_average,revenue,genres,release_date,year,actor_1_name,actor_2_name,actor_3_name,director_name,comb
0,157336,Interstellar,8.417,701729206,"Adventure, Drama, Science Fiction",2014-11-05,2014,Matthew McConaughey,Anne Hathaway,Michael Caine,Christopher Nolan,"matthew mcconaughey, anne hathaway, michael ca..."
1,24428,The Avengers,7.710,1518815515,"Science Fiction, Action, Adventure",2012-04-25,2012,Robert Downey Jr.,Chris Evans,Mark Ruffalo,Joss Whedon,"robert downey jr., chris evans, mark ruffalo, ..."
2,293660,Deadpool,7.606,783100000,"Action, Adventure, Comedy",2016-02-09,2016,Ryan Reynolds,Morena Baccarin,Ed Skrein,Tim Miller,"ryan reynolds, morena baccarin, ed skrein, tim..."
3,299536,Avengers: Infinity War,8.255,2052415039,"Adventure, Action, Science Fiction",2018-04-25,2018,Robert Downey Jr.,Chris Evans,Chris Hemsworth,Joe Russo,"robert downey jr., chris evans, chris hemswort..."
4,118340,Guardians of the Galaxy,7.906,772776600,"Action, Science Fiction, Adventure",2014-07-30,2014,Chris Pratt,Zoe Saldaña,Dave Bautista,James Gunn,"chris pratt, zoe saldaña, dave bautista, james..."
...,...,...,...,...,...,...,...,...,...,...,...,...
6192,427036,A Family On Edge,7.000,200,"Adventure, Crime, Drama, Family, Romance, Thri...",2015-12-22,2015,Sean White,Jules Nobles,Gabriel Davis,Hakim Robinson,"sean white, jules nobles, gabriel davis, hakim..."
6193,416280,Kamen Rider 1,6.000,4986103,"Action, Adventure, Science Fiction, Fantasy, D...",2016-03-26,2016,Shun Nishime,Hiroshi Fujioka,Ryosuke Yamamoto,Osamu Kaneda,"shun nishime, hiroshi fujioka, ryosuke yamamot..."
6194,436104,Kamen Rider Ghost: The 100 Eyecons and Ghost’s...,7.000,7740006,"Science Fiction, Action, Adventure, Drama, Fan...",2016-08-06,2016,Shun Nishime,Ryosuke Yamamoto,Hayato Isomura,Satoshi Morota,"shun nishime, ryosuke yamamoto, hayato isomura..."
6195,429674,Oye Kuch Kar Guzar,10.000,5000,"Adventure, Comedy",2016-12-03,2016,Ali Safina,Uzair Jaswal,Ushna Shah,Harris Rasheed,"ali safina, uzair jaswal, ushna shah, harris r..."


In [12]:
from collections import Counter, defaultdict

class TFIDFVectorizer:
    """Minimal dense TF-IDF vectorizer for ``", "``-separated documents.

    Attributes:
        vocab: Maps each term seen during ``fit`` to its column index. Columns
            are numbered in first-seen order, so the layout is reproducible
            across interpreter sessions.
        idf_values: Maps each term to ``log(N / (1 + df))`` where ``N`` is the
            number of fitted documents and ``df`` the number of documents that
            contain the term.
    """

    def __init__(self):
        self.vocab = {}
        self.idf_values = {}

    def fit(self, documents):
        """Learn the vocabulary and IDF weights from ``documents``.

        Args:
            documents: Iterable of strings; each is split by ``tokenize``.
        """
        # Materialise once so any iterable (list, Series, generator) can be
        # both counted and iterated.
        documents = list(documents)
        doc_count = len(documents)
        doc_frequency = defaultdict(int)

        for doc in documents:
            # dict.fromkeys de-duplicates (a term counts once per document)
            # while keeping first-seen order. Iterating a set() here would make
            # the column order depend on per-process string hash randomisation.
            for token in dict.fromkeys(self.tokenize(doc)):
                doc_frequency[token] += 1

        self.vocab = {term: idx for idx, term in enumerate(doc_frequency)}

        # The +1 in the denominator is the smoothing term. Note that it makes
        # the weight 0 for a term present in N-1 documents and negative for a
        # term present in all N documents.
        self.idf_values = {
            term: np.log(doc_count / (1 + freq))
            for term, freq in doc_frequency.items()
        }

    def transform(self, documents):
        """Return the dense TF-IDF matrix for ``documents``.

        Args:
            documents: Iterable of strings.

        Returns:
            ``numpy.ndarray`` of shape ``(len(documents), len(self.vocab))``.
            Terms not seen during ``fit`` get no column, but they still count
            towards the document length used for the term frequency.
        """
        documents = list(documents)
        tfidf_matrix = np.zeros((len(documents), len(self.vocab)))

        for row, doc in enumerate(documents):
            tokens = self.tokenize(doc)
            doc_length = len(tokens)

            for term, count in Counter(tokens).items():
                column = self.vocab.get(term)
                if column is None:
                    continue
                tfidf_matrix[row, column] = (count / doc_length) * self.idf_values[term]

        return tfidf_matrix

    def fit_transform(self, documents):
        """Fit on ``documents`` and return their TF-IDF matrix."""
        documents = list(documents)
        self.fit(documents)
        return self.transform(documents)

    def tokenize(self, document):
        """Split ``document`` on the literal separator ``", "``."""
        return document.split(", ")

In [13]:
# Fit the TF-IDF vectorizer on each movie's combined text (the 'comb' column).
# X is the dense TF-IDF matrix returned by fit_transform: one row per document,
# one column per vocabulary term.
tf_idf_vectorize = TFIDFVectorizer()
documents = movie_db['comb']
X = tf_idf_vectorize.fit_transform(documents)

In [15]:
class TMDB:
    """Small client for the TMDB v3 REST endpoints used by this notebook.

    The bearer token is never stored in the source: pass it as ``token`` or
    export it in the ``TMDB_BEARER_TOKEN`` environment variable.

    Attributes:
        headers: HTTP headers sent with every request.
    """

    BASE_URL = "https://api.themoviedb.org/3"
    TOKEN_ENV_VAR = "TMDB_BEARER_TOKEN"
    REQUEST_TIMEOUT = 10  # seconds; avoids hanging the kernel on a stalled connection

    def __init__(self, token=None):
        """Build the request headers.

        Args:
            token: TMDB API read-access token. When omitted, the value of the
                ``TMDB_BEARER_TOKEN`` environment variable is used.

        Raises:
            RuntimeError: If no token is supplied and the variable is unset.
        """
        if token is None:
            token = os.environ.get(self.TOKEN_ENV_VAR)
        if not token:
            raise RuntimeError(
                f"TMDB bearer token missing: pass token=... or set {self.TOKEN_ENV_VAR}"
            )
        self.headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {token}",
        }

    def _get_json(self, path):
        """GET ``BASE_URL + path`` and return the decoded JSON body.

        Raises the ``requests`` HTTP error for 4xx/5xx responses instead of
        letting a later key lookup fail on an error payload.
        """
        response = requests.get(
            f"{self.BASE_URL}{path}",
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def get_genres(self, movie_id):
        """Return the movie's genre names joined by ``", "`` (``""`` if none)."""
        data = self._get_json(f"/movie/{movie_id}?language=en-US")
        return ", ".join(genre["name"] for genre in data["genres"])

    def get_director_cast(self, movie_id):
        """Return ``"actor1, actor2, actor3, director"`` for the movie.

        Uses the first three ``cast`` entries (fewer if the movie lists fewer)
        followed by the director. The director is the first ``crew`` entry
        whose ``job`` is ``"Director"``; when no entry is tagged that way the
        first crew member is used, which was the previous behaviour. Missing
        parts are omitted from the string.
        """
        credits = self._get_json(f"/movie/{movie_id}/credits?language=en-US")
        names = [member["name"] for member in credits["cast"][:3]]

        crew = credits["crew"]
        director = next(
            (member["name"] for member in crew if member.get("job") == "Director"),
            None,
        )
        if director is None and crew:
            director = crew[0]["name"]
        if director is not None:
            names.append(director)

        return ", ".join(names)

In [16]:
from tmdbv3api import TMDb, Movie

# Configure the tmdbv3api client used for title search. The API key is read
# from the environment so it is not committed with the notebook; a missing
# variable fails here with a KeyError rather than later inside a search call.
tmdb = TMDb()
tmdb.api_key = os.environ["TMDB_API_KEY"]
tmdb_movie = Movie()

# The name TMDB is rebound from the class to a shared instance, and the cells
# below call TMDB.get_director_cast / TMDB.get_genres on that instance. The
# isinstance guard keeps this cell re-runnable: once TMDB is already an
# instance, calling it again would raise "object is not callable".
if isinstance(TMDB, type):
    TMDB = TMDB()

class Suggestion:
    """Content-based recommender over the notebook's TF-IDF matrix.

    Relies on these module-level names being defined before use:
    ``tmdb_movie`` (title search), ``TMDB`` (credits/genres client),
    ``tf_idf_vectorize`` (fitted vectorizer), ``X`` (TF-IDF rows) and
    ``movie_db`` (frame with a ``'title'`` column, row-aligned with ``X``).
    """

    def distance(self, vector):
        """Return the Euclidean (L2) norm of ``vector``."""
        return np.linalg.norm(vector)

    def get_suggestion(self, movie_title, n_suggestions=20):
        """Return titles of the movies closest to ``movie_title``.

        The first TMDB search hit for ``movie_title`` is turned into the same
        "cast, director, genres" text as the ``comb`` column, vectorised, and
        compared against every row of ``X``. That text is also printed.

        Args:
            movie_title: Title to look up on TMDB.
            n_suggestions: Maximum number of titles to return (default 20).

        Returns:
            Up to ``n_suggestions`` titles ordered by increasing distance. The
            single closest row is always skipped, as before, on the assumption
            that it is the queried movie itself.

        Raises:
            IndexError: If the TMDB search returns no result.
        """
        result = tmdb_movie.search(movie_title)
        try:
            movie_id = result[0].id
        except IndexError as exc:
            raise IndexError(f"No TMDB search result for {movie_title!r}") from exc

        comb = (TMDB.get_director_cast(movie_id) + ', ' + TMDB.get_genres(movie_id)).lower()
        print(comb)
        vector = np.asarray(tf_idf_vectorize.transform([comb])[0])

        # Pair rows with titles by position, so the result no longer depends on
        # movie_db carrying a default 0..n-1 index.
        titles = list(movie_db['title'])
        ranked = sorted(
            (self.distance(X[row] - vector), title)
            for row, title in enumerate(titles)
        )

        # Slicing (rather than indexing 1..20) tolerates catalogues with fewer
        # than n_suggestions + 1 rows.
        limit = max(n_suggestions, 0)
        return [title for _, title in ranked[1:1 + limit]]

In [17]:
# Demo: recommend movies similar to a given title.
# get_suggestion looks the title up on TMDB, prints the combined
# cast/director/genre text it built, and returns a list of titles from movie_db.
suggest = Suggestion()
suggest.get_suggestion('Interstellar') # input is a movie title

matthew mcconaughey, anne hathaway, michael caine, christopher nolan, adventure, drama, science fiction


['Serenity',
 'The Dark Knight Rises',
 'Rio',
 'The Incident',
 "Roald Dahl's The Witches",
 'The Croods',
 'Army of the Dead',
 'Operation Seawolf',
 'Rio 2',
 'Cars 2',
 'Sherlock Gnomes',
 'The Addams Family 2',
 'How to Train Your Dragon 2',
 'Creators: The Past',
 'Alice Through the Looking Glass',
 'Dunkirk',
 'Pokémon the Movie: Kyurem vs. the Sword of Justice',
 'A Field in England',
 'Tenet',
 'Sing']

In [18]:
class KNeighborsRegressor:
    """k-nearest-neighbour regressor that predicts a score for a movie title.

    The query movie is looked up on TMDB and vectorised with the module-level
    ``tmdb_movie``, ``TMDB`` and ``tf_idf_vectorize`` objects, then compared by
    Euclidean distance against the fitted feature rows.

    Attributes:
        n_neighbors: Number of nearest rows averaged by ``predict``.
        X_train: Feature rows given to ``fit`` (``None`` before fitting).
        y_train: Target values given to ``fit``, row-aligned with ``X_train``.
    """

    def __init__(self, n_neighbors=3):
        self.n_neighbors = n_neighbors
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training features ``X`` and targets ``y``."""
        self.X_train = X
        self.y_train = y

    def predict(self, movie_title):
        """Return the mean target of the ``n_neighbors`` rows closest to the movie.

        Args:
            movie_title: Title to look up on TMDB; the first search hit is used.

        Returns:
            Mean of the targets of the nearest training rows. Rows at equal
            distance are ordered by target value.
        """
        result = tmdb_movie.search(movie_title)
        movie_id = result[0].id
        comb = (TMDB.get_director_cast(movie_id) + ', ' + TMDB.get_genres(movie_id)).lower()
        vector = np.asarray(tf_idf_vectorize.transform([comb])[0])

        if self.X_train is None or self.y_train is None:
            # Historical behaviour: predict() used to read the notebook globals
            # regardless of fit(); keep that only for an unfitted model.
            features, targets = X, movie_db['vote_average']
        else:
            features, targets = self.X_train, self.y_train

        # Positional pairing: targets may be a pandas Series whose labels are
        # not 0..n-1, while feature rows are always addressed by position.
        targets = np.asarray(targets, dtype=float)
        ranked = sorted(
            (np.linalg.norm(features[row] - vector), target)
            for row, target in enumerate(targets)
        )
        nearest_targets = [target for _, target in ranked[:self.n_neighbors]]
        return np.mean(nearest_targets)

In [19]:
# Demo: predict a movie's score with the k-nearest-neighbour regressor defined
# above (default n_neighbors=3), fitted on the TF-IDF matrix X and the
# vote_average column.
kmr = KNeighborsRegressor()
kmr.fit(X, movie_db['vote_average'])
y_pred = kmr.predict('Toy Story') # input is a movie title
y_pred

6.831