In [2]:
import surprise

In [1]:
# load required libraries
from surprise import SVD, NMF
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

import pandas as pd
import numpy as np

In [3]:
# read the user comments / ratings table stored at data/recommender/user-comments.csv
data = pd.read_csv(filepath_or_buffer='../data/recommender/user-comments.csv')

In [6]:
# preview the loaded frame; pandas abbreviates long frames to their first and last rows
data

Unnamed: 0,qid,rating,sentiment,comment
0,http://www.wikidata.org/entity/Q824389,1.0,0.0,I dont know who wrote the script for this movi...
1,http://www.wikidata.org/entity/Q603448,4.0,0.0,SPOILERS THROUGHOUTThis could have been a lot ...
2,http://www.wikidata.org/entity/Q4378426,7.0,1.0,As a long time fan of Peter ODonnells greatest...
3,http://www.wikidata.org/entity/Q1420651,1.0,0.0,After finally viewing this movie in its entire...
4,http://www.wikidata.org/entity/Q780951,10.0,1.0,..especially by Lambert. This is the essential...
...,...,...,...,...
26486,http://www.wikidata.org/entity/Q621565,7.0,1.0,This is a nice movie with good performances by...
26487,http://www.wikidata.org/entity/Q1334314,9.0,1.0,when my sister said this movie was gonna be go...
26488,http://www.wikidata.org/entity/Q813097,9.0,1.0,I guess when Beat Street made a national appea...
26489,http://www.wikidata.org/entity/Q1538224,4.0,0.0,I will never forget the wit and great comedy o...


In [9]:
# read the movie plot summaries stored at data/recommender/plots.csv and preview them
data = pd.read_csv(filepath_or_buffer='../data/recommender/plots.csv')
data

Unnamed: 0,qid,plot
0,http://www.wikidata.org/entity/Q532372,The film opens at a school where a boy is pick...
1,http://www.wikidata.org/entity/Q2362333,"Billy Tully (Keach), a boxer past his prime, g..."
2,http://www.wikidata.org/entity/Q1757747,"In the far reaches of outer space, two spacesh..."
3,http://www.wikidata.org/entity/Q2479210,Teenage lovers Jordan White and Amy Blue pick ...
4,http://www.wikidata.org/entity/Q1114179,Set in the South Pacific Ocean in the year 189...
...,...,...
10362,http://www.wikidata.org/entity/Q4313927,Deven Yuvvraaj (Salman Khan) is a struggling s...
10363,http://www.wikidata.org/entity/Q51668,The film opens in Gethsemane at night as Jesus...
10364,http://www.wikidata.org/entity/Q1111758,"Two white cops, Bob 'Uncle Bob' Hodges (Robert..."
10365,http://www.wikidata.org/entity/Q40071,The film follows various plot arcs all occurri...


In [11]:
# Lookup table: movie qid -> row position in `data`.
# Series.drop_duplicates() compares the *values* (the row positions, which are
# always unique), so it never removes a repeated qid. De-duplicate on the index
# instead, keeping the first row for each qid, so that indices[qid] is always a
# single integer rather than a Series.
indices = pd.Series(data.index, index=data['qid'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

qid
http://www.wikidata.org/entity/Q532372          0
http://www.wikidata.org/entity/Q2362333         1
http://www.wikidata.org/entity/Q1757747         2
http://www.wikidata.org/entity/Q2479210         3
http://www.wikidata.org/entity/Q1114179         4
                                            ...  
http://www.wikidata.org/entity/Q4313927     10362
http://www.wikidata.org/entity/Q51668       10363
http://www.wikidata.org/entity/Q1111758     10364
http://www.wikidata.org/entity/Q40071       10365
http://www.wikidata.org/entity/Q15270932    10366
Length: 10367, dtype: int64

In [12]:
# Build a content-based recommender from the plot summaries: one TF-IDF vector
# per plot, then the pairwise similarity between all plots.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# load the dataset from data/recommender/plots.csv
data = pd.read_csv('../data/recommender/plots.csv')

# replace NaN plots with an empty string. Assign the result back to the column
# instead of calling fillna(inplace=True) on the `data['plot']` selection: that
# chained in-place form is deprecated by pandas and, under copy-on-write, only
# modifies a temporary copy, leaving the NaN values in `data`.
data['plot'] = data['plot'].fillna('')

# build a TF-IDF vectorizer that ignores English stop words
tfidf = TfidfVectorizer(stop_words='english')

# construct the TF-IDF matrix (one row per plot) by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data['plot'])

# TfidfVectorizer L2-normalises every row by default, so the plain dot product
# computed by linear_kernel is the cosine similarity between plots
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# lookup table: movie qid -> row position in `data`.
# Series.drop_duplicates() would compare the values (the unique row positions)
# and remove nothing, so de-duplicate on the qid index instead and keep the
# first row of each qid; indices[qid] is then always a single integer.
indices = pd.Series(data.index, index=data['qid'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# dump the cosine similarity matrix and indices into pickle files
import pickle

# `with` closes each file as soon as its dump finishes, so the bytes are flushed
# to disk and no file handle stays open for the lifetime of the kernel
with open('../data/recommender/cosine_sim.pickle', 'wb') as cosine_sim_file:
    pickle.dump(cosine_sim, cosine_sim_file)
with open('../data/recommender/indices.pickle', 'wb') as indices_file:
    pickle.dump(indices, indices_file)

In [None]:
# load the cosine similarity matrix and indices from pickle files
import pickle

# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this notebook, never files from an untrusted source.
# `with` closes each file handle as soon as the object has been read.
with open('../data/recommender/cosine_sim.pickle', 'rb') as cosine_sim_file:
    cosine_sim = pickle.load(cosine_sim_file)
with open('../data/recommender/indices.pickle', 'rb') as indices_file:
    indices = pickle.load(indices_file)

In [17]:
# function that takes in a movie qid as input and outputs the most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the qids of the movies whose plots are most similar to a given movie.

    Parameters
    ----------
    title : str
        Identifier of the query movie as it appears in the index of the
        module-level `indices` Series (a Wikidata entity URL in this notebook).
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row i holds the similarity of movie i to
        every movie. Defaults to the matrix that exists when this cell is run.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        The `qid` values of the `top_n` most similar movies, most similar
        first, indexed by their row position in `data`. The query movie itself
        is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    ValueError
        If `top_n` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # get the row position of the movie that matches the title; a qid that
    # occurs more than once in `indices` yields a Series, so keep its first row
    idx = indices[title]
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # get the pairwise similarity scores of all *other* movies with that movie.
    # The query movie is excluded by position rather than by assuming it sorts
    # first: a movie with an identical plot ties with it, and a movie with an
    # empty plot scores 0 against everything including itself.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # sort the movies based on the similarity scores, most similar first
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # keep the top_n most similar movies
    sim_scores = sim_scores[:top_n]

    # get the movie row positions
    movie_indices = [i[0] for i in sim_scores]

    # return the qids of the most similar movies
    return data['qid'].iloc[movie_indices]

In [18]:
get_recommendations('http://www.wikidata.org/entity/Q15270932')

6745     http://www.wikidata.org/entity/Q2417231
291      http://www.wikidata.org/entity/Q3794003
4455     http://www.wikidata.org/entity/Q6692284
8346     http://www.wikidata.org/entity/Q1143310
4068     http://www.wikidata.org/entity/Q1360651
2104     http://www.wikidata.org/entity/Q3616724
5511      http://www.wikidata.org/entity/Q859131
382     http://www.wikidata.org/entity/Q14704171
4628    http://www.wikidata.org/entity/Q11187871
9251     http://www.wikidata.org/entity/Q1477823
Name: qid, dtype: object