In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df1 = pd.read_csv("tmdb_5000_credits.csv")
df2 = pd.read_csv("tmdb_5000_movies.csv")


In [4]:
main_df = df2
main_df['cast'] = df1["cast"]
main_df['crew'] = df1['crew']
main_df = main_df[["overview","popularity","title","vote_average","vote_count","cast","crew","keywords","genres"]]
main_df.fillna(" ",inplace= True)

In [5]:
demodf = main_df[main_df["vote_count"]>main_df['vote_count'].quantile(0.95)]


In [6]:
##weighted rating formula
def weighted_rating(df):
    v=df['vote_count']
    m=main_df['vote_count'].quantile(q=0.95)
    r=df['vote_average']
    c=main_df['vote_average'].mean()
    wr = (r*v+c*m)/(v+m)
    return wr

In [7]:
demodf["weighted_rating"] = demodf.apply(weighted_rating,axis =1)
## scaling popularity to same range
demodf["popularity"] = 10*(demodf["popularity"]/max(demodf["popularity"]))

In [8]:
def show_by_wr(n):
    dft = demodf.sort_values(by = 'weighted_rating',ascending = False).head(min(n,len(demodf)))
    toshow = list(dft['title'])
    for i in range(len(toshow)):
        print(i+1,toshow[i])

def show_by_wr_and_pop(n):
    demodf1 = demodf
    demodf1["sums"] = demodf["popularity"]+demodf["weighted_rating"]
    dft = demodf1.sort_values(by = 'sums',ascending = False).head(min(n,len(demodf)))
    toshow = list(dft['title'])
    for i in range(len(toshow)):
        print(i+1,toshow[i])

In [9]:

tf_vect = TfidfVectorizer(stop_words ="english") ##to remove words like and the etc
tf_matrix = tf_vect.fit_transform(main_df['overview'])

similarities = cosine_similarity(tf_matrix)

In [10]:
keys = {} ## to keep track of the indexes
for i in range(len(main_df['title'])):
    titl = main_df['title'][i]
    keys[titl.lower().replace(' ','')] = i

In [11]:

def contentbasedreco(title,n):
    idx = keys[title.lower().replace(' ','')]
    check = list(enumerate(similarities[idx]))
    check.sort(key = lambda x:x[1],reverse = True)
    print("the movie recommendations according to the title searched are:")
    for i in range(1,min(n+1,len(check)),1):
        print(i,main_df.iloc[check[i][0]]['title'])




In [12]:
new = ['cast' ,"crew","keywords","genres"]
for col in new:
    main_df[col] = main_df[col].apply(json.loads)


## functions for crew mebers fetch and normal name check
def director(x):
    for a in x:
        if a['job']=='Director':
            return a['name'].lower().replace(' ','')
    return 'NaN'.lower().replace(' ','')

def somecrew(x):
    new=[]
    for a in x[:min(5,len(x))]:
        new.append(a['name'].lower().replace(' ','')) 
    return new

    return []


main_df['director']=main_df['crew'].apply(lambda x: director(x))
main_df['actor']=main_df['cast'].apply(lambda x:somecrew(x))
main_df['genres']=main_df['genres'].apply(lambda x:somecrew(x))
main_df['keywords']=main_df['keywords'].apply(lambda x:somecrew(x))


In [13]:
def metadata(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['actor']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

main_df['meta'] = main_df.apply(metadata, axis=1)

In [14]:
tf_matrix_2 = tf_vect.fit_transform(main_df['meta'])
similarities_2 = cosine_similarity(tf_matrix_2)

In [15]:
def crewbasedreco(title,n):
    idx = keys[title.lower().replace(' ','')]
    check = list(enumerate(similarities_2[idx]))
    check.sort(key = lambda x:x[1],reverse = True)
    print("the movie recommendations according to the related crews,casts are:")
    for i in range(1,min(n+1,len(check)),1):
        print(i,main_df.iloc[check[i][0]]['title'])

In [16]:
def overallreco(title,n):
    if(title=="-1"):
        print("according to ratings")
        show_by_wr(n)
        print(" ")
        print("according to both ratings and popularity")
        show_by_wr_and_pop(n)
    else:
        try:
            contentbasedreco(title,n)           
            crewbasedreco(title,n)
        except Exception as e:
            print("not much movies related to this movie! try something else")

In [17]:
colldf = pd.read_csv("ratings_small.csv")
colldf.head()
ar = colldf['movieId'].value_counts().index
di = {}
ide = {}
for i in range(len(ar)):
    di[ar[i]] = i+1
    ide[i+1] = ar[i]
    

In [18]:
for i in range(len(colldf['movieId'])):
    if(i%10000==0):
        print(i)
    colldf['movieId'][i] = di[colldf['movieId'][i]]

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [19]:
r_m = np.ndarray(shape = (np.max(colldf['movieId'].values),np.max(colldf['userId'])))
r_m[colldf['movieId'].values-1,colldf['userId'].values-1] = colldf['rating'].values

In [20]:
r_m = r_m - np.asarray([np.mean(r_m,1)]).T

In [21]:
u,s,v = np.linalg.svd(r_m)

In [22]:
checkdf = pd.read_csv("links_small.csv")

In [23]:
def collabreco(movie_id,n):
    rm = v.T[:,:50]
    dt = rm[movie_id-1,:]
    
    sim = np.dot(dt,rm.T)
    idxs = np.argsort(-sim)
    print("some movies that are liked by fans of this movie are")
    i =0
    for idx in idxs[:min(n,len(idxs))]:
        
        
        try:
            chidx = ide[idx]
            dfidx = checkdf[checkdf.movieId==chidx].tmdbId.values[0]
            print(i+1,df1[df1.movie_id==dfidx].title.values[0])
            i =i +1
        except Exception as e:
            pass
        

In [24]:
def recommend(movie_name,n):
    try:
        overallreco(movie_name,n)
    except Exception as e:
        print("not much data about the movie")
    try:
        id1 = keys[movie_name.lower().replace(' ','')]
        id2 = df1.iloc[id1]['movie_id']
        id3 = checkdf[checkdf.tmdbId==id2].movieId.values[0]
        id4 = di[id3]
        collabreco(id4,n)
    except Exception as e:
        pass


In [25]:
recommend("the shawshank redemption",2)

the movie recommendations according to the title searched are:
1 Civil Brand
2 Prison
the movie recommendations according to the related crews,casts are:
1 Catch a Fire
2 The Hudsucker Proxy
some movies that are liked by fans of this movie are
1 Dumb and Dumber
2 Big


In [27]:
##code taken from surprise documentation for performance measures
from surprise import Dataset, SVD, Reader
from surprise.model_selection import cross_validate

reader = Reader()
data = Dataset.load_from_df(colldf[['userId', 'movieId', 'rating']], reader)

algo = SVD()

cross_validate(algo, data, measures=["RMSE", "MAE"], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8891  0.8928  0.8971  0.8981  0.9102  0.8975  0.0071  
MAE (testset)     0.6855  0.6885  0.6938  0.6916  0.7000  0.6919  0.0050  
Fit time          1.01    1.14    0.90    0.92    1.01    0.99    0.08    
Test time         0.12    0.09    0.19    0.11    0.12    0.13    0.03    


{'test_rmse': array([0.88913321, 0.89277222, 0.89709493, 0.89806644, 0.91023508]),
 'test_mae': array([0.68546352, 0.68846075, 0.69378735, 0.6915959 , 0.7000305 ]),
 'fit_time': (1.0077953338623047,
  1.1374835968017578,
  0.8998796939849854,
  0.9151120185852051,
  1.0079929828643799),
 'test_time': (0.11514139175415039,
  0.09123444557189941,
  0.18954753875732422,
  0.10870146751403809,
  0.12476491928100586)}