## Mounting Google Drive

## Reading the datasets

In [1]:
import pandas as pd
import numpy as np
# Load the two TMDB exports. The credits file keys movies by "movie_id", so it
# is renamed to "id" to line up with the movies file before joining.
dfTMDBMovies = pd.read_csv("TMDB/tmdb_5000_movies.csv")
dfTMDBCredits = pd.read_csv("TMDB/tmdb_5000_credits.csv").rename(columns={"movie_id": "id"})

# Both inputs carry a "title" column, so the merge yields title_x / title_y;
# keep the movies-side copy under its plain name.
dfTMDB = (
    pd.merge(dfTMDBMovies, dfTMDBCredits, on="id")
    .drop(columns="title_y")
    .rename(columns={"title_x": "title"})
)

## Demographic recommender
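Add a `score` column using IMDb's weighted rating, $WR = \frac{v}{v+m}\,R + \frac{m}{v+m}\,C$, where $v$ is the movie's vote count, $R$ its average vote, $m$ the minimum vote count required to qualify (here the 90th percentile of `vote_count`), and $C$ the mean vote across all movies.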

In [None]:
# C in the weighted-rating formula: the mean vote over the whole catalogue.
meanVote = dfTMDB["vote_average"].mean()
# m in the weighted-rating formula: the 90th percentile of vote counts.
minVotes = dfTMDB["vote_count"].quantile(0.9)

# Only movies with at least m votes are ranked; copy so that adding columns
# later does not touch dfTMDB.
hasEnoughVotes = dfTMDB["vote_count"].ge(minVotes)
q_movies = dfTMDB[hasEnoughVotes].copy()

def imdbWeightedRating(df, voteThreshold=None, priorMean=None):
    """Return the IMDb-style weighted rating for one movie or a whole frame.

    The score is ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the vote count,
    ``R`` the average vote, ``m`` the vote threshold and ``C`` the prior mean.

    Parameters
    ----------
    df : mapping, Series or DataFrame
        Anything indexable by ``"vote_count"`` and ``"vote_average"``. The
        arithmetic is element-wise, so a full DataFrame yields one score per row.
    voteThreshold : number, optional
        ``m`` in the formula. Defaults to the notebook-level ``minVotes``.
    priorMean : number, optional
        ``C`` in the formula. Defaults to the notebook-level ``meanVote``.

    Returns
    -------
    The weighted rating, in the same shape as ``df["vote_count"]``.
    """
    # Fall back to the notebook globals only when the caller did not override
    # them; the globals are looked up at call time, as before.
    m = minVotes if voteThreshold is None else voteThreshold
    C = meanVote if priorMean is None else priorMean

    votes = df["vote_count"]
    avgRating = df["vote_average"]
    totalWeight = votes + m
    return ((votes / totalWeight) * avgRating) + ((m / totalWeight) * C)

# The rating formula is plain column arithmetic, so it is evaluated on the
# whole frame in one vectorised call instead of row by row via DataFrame.apply.
q_movies["score"] = imdbWeightedRating(q_movies)
q_movies = q_movies.sort_values("score", ascending=False)
# Ten best-scoring titles, shown with the inputs that produced each score.
q_movies[["title", "vote_count", "vote_average", "score"]].head(10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectoriser receives text for
# every movie; the cleaned column is written back to the frame.
overviewText = dfTMDB["overview"].fillna("")
dfTMDB["overview"] = overviewText

# TF-IDF over the plot overviews with English stop words removed.
tfidf = TfidfVectorizer(stop_words="english")
tfidfMatrix = tfidf.fit_transform(overviewText)

In [None]:
from sklearn.metrics.pairwise import linear_kernel
import joblib 
import os

# Title -> row position lookup. Series.drop_duplicates() would de-duplicate the
# *values* (row positions, which are already unique) and leave repeated titles
# in the index, so a lookup could return several rows. De-duplicate on the
# index instead and keep the first movie for each title.
movieListIndexes = pd.Series(dfTMDB.index, index=dfTMDB['title'])
movieListIndexes = movieListIndexes[~movieListIndexes.index.duplicated(keep="first")]

# Reuse the cached similarity matrix when it exists; otherwise compute it once
# and cache it, so the notebook also runs from a clean checkout.
COSINE_SIM1_CACHE = "CosineSim1.joblib"
if os.path.exists(COSINE_SIM1_CACHE):
    # NOTE(review): joblib.load unpickles the file - only load caches you created.
    cosineSimilarity = joblib.load(COSINE_SIM1_CACHE)
else:
    # linear_kernel is a plain dot product; it equals cosine similarity assuming
    # the TF-IDF rows are L2-normalised (confirm if the vectoriser's norm changes).
    cosineSimilarity = linear_kernel(tfidfMatrix, tfidfMatrix)
    joblib.dump(cosineSimilarity, COSINE_SIM1_CACHE)

# A cache built from a different dataset would silently misalign recommendations.
assert cosineSimilarity.shape == (len(dfTMDB), len(dfTMDB)), "CosineSim1 cache does not match dfTMDB"

In [None]:
def getRecommendations(title, cosineSim = cosineSimilarity, topN = 10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title as it appears in ``dfTMDB['title']``.
    cosineSim : 2-D array-like
        Pairwise similarity matrix whose rows/columns follow the row order of
        ``dfTMDB``. Defaults to the matrix bound when this cell was run.
    topN : int
        Number of recommendations to return (10 by default).

    Returns
    -------
    pandas.Series
        Up to ``topN`` titles, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``movieListIndexes``.
    """
    index = movieListIndexes[title]
    if isinstance(index, pd.Series):
        # Several movies share this title; use the first one rather than
        # indexing the matrix with a whole Series.
        index = index.iloc[0]

    scores = np.asarray(cosineSim[index]).ravel()
    # Stable descending sort: ties keep their original row order, matching
    # sorted(..., reverse=True) on (position, score) pairs.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the queried movie explicitly. Slicing off the first entry is wrong
    # whenever another movie ties with it and sorts ahead of it.
    ranked = ranked[ranked != index][:topN]
    return dfTMDB.iloc[ranked]["title"]
getRecommendations('The Dark Knight Rises')

In [None]:
from ast import literal_eval

# Snapshot of the frame as it is when this cell runs, kept for reference.
dfOG = dfTMDB.copy()


def _parseLiteral(value):
    """Parse a stringified Python literal; pass through anything already parsed."""
    return literal_eval(value) if isinstance(value, str) else value


# These columns are read from the CSV as strings holding Python literals.
# Parsing only string cells makes the cell safe to re-run: literal_eval would
# raise on values that are already lists.
features = ["cast","crew","keywords","genres"]
for feature in features:
    dfTMDB[feature] = dfTMDB[feature].apply(_parseLiteral)

In [None]:
# Snapshot of the frame before the cast/keywords/genres columns are rewritten below.
dfEval = dfTMDB.copy()
# To undo the transformations in this cell, restore from the snapshot:
# dfTMDB = dfEval.copy()
def getDirector(x):
    """Return the name of the first crew member whose job is "Director".

    Parameters
    ----------
    x : list of dict
        Crew entries, each with at least ``"job"`` and ``"name"`` keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when no entry has the job
        "Director" or when ``x`` is not a list (e.g. a missing value).
    """
    # Same guard as getList: a missing crew cell must not crash the column-wide apply.
    if not isinstance(x, list):
        return np.nan
    return next((member["name"] for member in x if member["job"] == "Director"), np.nan)

# One director name (or NaN) per movie, derived element-wise from the crew lists.
dfTMDB['director'] = dfTMDB['crew'].map(getDirector)
# Columns whose dict entries are reduced to plain name lists next.
features = ["cast", "keywords", "genres"]
def getList(feature, limit=5):
    """Return up to ``limit`` names from a list of ``{"name": ...}`` entries.

    Parameters
    ----------
    feature : list
        Entries are normally dicts with a ``"name"`` key. Entries that are not
        dicts are kept as they are, so re-running the cell on a column that has
        already been reduced to names does not fail.
    limit : int
        Maximum number of names to keep (5 by default).

    Returns
    -------
    list
        The first ``limit`` names, or ``[]`` when ``feature`` is not a list.
    """
    if not isinstance(feature, list):
        return []
    names = [entry["name"] if isinstance(entry, dict) else entry for entry in feature]
    # Slicing already handles lists shorter than the limit.
    return names[:limit]
# Replace each column of dict entries with its (truncated) list of names.
for feature in features:
    dfTMDB[feature] = dfTMDB[feature].map(getList)


In [None]:
def cleanData(x):
    """Lower-case ``x`` and strip its spaces so multi-word names become one token.

    A list is cleaned element by element, a string is cleaned directly, and
    any other value (such as a missing director) becomes the empty string.
    """
    if isinstance(x, list):
        return [entry.replace(" ", "").lower() for entry in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

# Normalise every metadata column that feeds the soup string.
features = ["cast", "keywords", "director", "genres"]
for feature in features:
    dfTMDB[feature] = dfTMDB[feature].map(cleanData)

In [None]:
def create_soup(x):
    """Join a movie's keywords, cast, director and genres into one string.

    ``x`` must provide list values under ``'keywords'``, ``'cast'`` and
    ``'genres'`` and a string under ``'director'``. The four groups are
    separated by single spaces, in that order.
    """
    groups = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(groups)
# Row-wise apply: create_soup receives each movie's row and returns its combined metadata string.
dfTMDB['soup'] = dfTMDB.apply(create_soup, axis=1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib

import os

# Raw term counts over the metadata soup, with English stop words removed.
count = CountVectorizer(stop_words='english')
countMatrix = count.fit_transform(dfTMDB['soup'])

# Reuse the cached similarity matrix when it exists; otherwise compute it once
# and cache it, so the notebook also runs from a clean checkout.
COSINE_SIM2_CACHE = "CosineSim2.joblib"
if os.path.exists(COSINE_SIM2_CACHE):
    # NOTE(review): joblib.load unpickles the file - only load caches you created.
    cosineSimilarity2 = joblib.load(COSINE_SIM2_CACHE)
else:
    cosineSimilarity2 = cosine_similarity(countMatrix, countMatrix)
    joblib.dump(cosineSimilarity2, COSINE_SIM2_CACHE)

# A cache built from a different dataset would silently misalign recommendations.
assert cosineSimilarity2.shape == (len(dfTMDB), len(dfTMDB)), "CosineSim2 cache does not match dfTMDB"

# reset_index() returns a new frame, so the bare call had no effect. Assign the
# result, and drop the old index so no extra "index" column is added.
dfTMDB = dfTMDB.reset_index(drop=True)
indices = pd.Series(dfTMDB.index, index=dfTMDB['title'])

In [None]:
# Same lookup, but ranked with the keywords/cast/director/genres similarity matrix instead of the default one.
getRecommendations('The Godfather', cosineSimilarity2)

In [8]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from collections import defaultdict
import joblib

import os

# `trainset` and `svd` are used below, but every statement that defined them
# was commented out, so the cell raised NameError on a fresh kernel. Load both
# from their joblib caches when present; otherwise train from the raw ratings
# and write the caches for the next run.
TRAINSET_CACHE = "trainset.joblib"
SVD_CACHE = "svd.joblib"
if os.path.exists(TRAINSET_CACHE) and os.path.exists(SVD_CACHE):
    # NOTE(review): joblib.load unpickles the files - only load caches you created.
    trainset = joblib.load(TRAINSET_CACHE)
    svd = joblib.load(SVD_CACHE)
else:
    reader = Reader()
    dfRatings = pd.read_csv("TheMovies/ratings.csv")
    # NOTE(review): this keeps ratings whose movieId equals a TMDB id; confirm
    # that ratings.csv really uses TMDB ids and not a different id scheme.
    tmdb_movie_ids = dfTMDBMovies["id"].tolist()
    filtered_ratings = dfRatings[dfRatings["movieId"].isin(tmdb_movie_ids)]

    data = Dataset.load_from_df(filtered_ratings[["userId", "movieId", "rating"]], reader)
    svd = SVD()
    # 5-fold cross-validation reports RMSE/MAE before the final fit on all data.
    cross_validate(svd, data, measures=["RMSE", "MAE"], cv=5)
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    joblib.dump(trainset, TRAINSET_CACHE)
    joblib.dump(svd, SVD_CACHE)
def get_unseen_items(trainset, user_id):
    """Return the set of items in ``trainset`` that ``user_id`` has not rated.

    ``trainset.ur[user_id]`` is read as an iterable of ``(item, rating)``
    pairs and ``trainset.all_items()`` as the iterable of every item; the
    result is everything in the latter that does not appear in the former.
    """
    rated = {item for item, _rating in trainset.ur[user_id]}
    return set(trainset.all_items()).difference(rated)


# Inner (trainset) id of the user to recommend for.
user_id = 0
# The raw id is the same for every prediction, so convert it once.
raw_user_id = trainset.to_raw_uid(user_id)

unseen_items = get_unseen_items(trainset, user_id)
# (inner item id, estimated rating) for every item the user has not rated;
# svd.predict is called with raw ids, hence the conversions.
predictions = [
    (item_id, svd.predict(raw_user_id, trainset.to_raw_iid(item_id)).est)
    for item_id in unseen_items
]
# Ten highest estimates. top_n is built directly here; the earlier
# defaultdict initialisation was overwritten before use.
top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:10]

print(top_n)
print(f"User {raw_user_id}: {[trainset.to_raw_iid(item_id) for item_id, _ in top_n]}")


[(48, 4.363708112190022), (495, 4.308767696907775), (577, 4.304889381370242), (321, 4.277609922317406), (523, 4.2732277021333624), (66, 4.253710979303136), (602, 4.244685606545067), (586, 4.236220241317096), (320, 4.225369874986146), (408, 4.213584344271131)]
User 1: [1262, 1949, 214, 928, 951, 4995, 2069, 8456, 905, 954]


In [None]:
# Estimate one (user, item) pair by raw ids; r_ui is the known rating passed along with the prediction.
svd.predict(uid=2, iid=302, r_ui=5)