In [2]:
import surprise


In [1]:
# load required libraries
from surprise import SVD, NMF
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

import pandas as pd
import numpy as np


# Recommender based on Ratings


In [11]:
# load the dataset from data/recommender/user_comments.csv
comments = pd.read_csv('../data/recommender/user-comments.csv')

In [12]:
comments

Unnamed: 0,qid,rating,sentiment,comment
0,http://www.wikidata.org/entity/Q824389,1.0,0.0,I dont know who wrote the script for this movi...
1,http://www.wikidata.org/entity/Q603448,4.0,0.0,SPOILERS THROUGHOUTThis could have been a lot ...
2,http://www.wikidata.org/entity/Q4378426,7.0,1.0,As a long time fan of Peter ODonnells greatest...
3,http://www.wikidata.org/entity/Q1420651,1.0,0.0,After finally viewing this movie in its entire...
4,http://www.wikidata.org/entity/Q780951,10.0,1.0,..especially by Lambert. This is the essential...
...,...,...,...,...
26486,http://www.wikidata.org/entity/Q621565,7.0,1.0,This is a nice movie with good performances by...
26487,http://www.wikidata.org/entity/Q1334314,9.0,1.0,when my sister said this movie was gonna be go...
26488,http://www.wikidata.org/entity/Q813097,9.0,1.0,I guess when Beat Street made a national appea...
26489,http://www.wikidata.org/entity/Q1538224,4.0,0.0,I will never forget the wit and great comedy o...


In [13]:
# append index column to data as user_id
comments['user_id'] = comments.index

In [14]:
comments


Unnamed: 0,qid,rating,sentiment,comment,user_id
0,http://www.wikidata.org/entity/Q824389,1.0,0.0,I dont know who wrote the script for this movi...,0
1,http://www.wikidata.org/entity/Q603448,4.0,0.0,SPOILERS THROUGHOUTThis could have been a lot ...,1
2,http://www.wikidata.org/entity/Q4378426,7.0,1.0,As a long time fan of Peter ODonnells greatest...,2
3,http://www.wikidata.org/entity/Q1420651,1.0,0.0,After finally viewing this movie in its entire...,3
4,http://www.wikidata.org/entity/Q780951,10.0,1.0,..especially by Lambert. This is the essential...,4
...,...,...,...,...,...
26486,http://www.wikidata.org/entity/Q621565,7.0,1.0,This is a nice movie with good performances by...,26486
26487,http://www.wikidata.org/entity/Q1334314,9.0,1.0,when my sister said this movie was gonna be go...,26487
26488,http://www.wikidata.org/entity/Q813097,9.0,1.0,I guess when Beat Street made a national appea...,26488
26489,http://www.wikidata.org/entity/Q1538224,4.0,0.0,I will never forget the wit and great comedy o...,26489


## Surprise

In [7]:
# prepare data for surprise
reader = Reader(rating_scale=(0,1))
movie_surprise = Dataset.load_from_df(data[['user_id', 'qid', 'sentiment']], reader)


In [150]:
# train using cross validation on SVD
algo = SVD()
cross_validate(algo, movie_surprise, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3529  0.3528  0.3542  0.3515  0.3507  0.3524  0.0012  
MAE (testset)     0.3023  0.3025  0.3039  0.3015  0.3020  0.3024  0.0008  
Fit time          0.19    0.17    0.22    0.19    0.33    0.22    0.06    
Test time         0.02    0.02    0.02    0.02    0.03    0.02    0.00    


{'test_rmse': array([0.35290013, 0.35277226, 0.35416095, 0.35145073, 0.35074155]),
 'test_mae': array([0.30230697, 0.30246422, 0.3039157 , 0.30151303, 0.3019711 ]),
 'fit_time': (0.19182491302490234,
  0.17084431648254395,
  0.21894526481628418,
  0.1931912899017334,
  0.3287191390991211),
 'test_time': (0.015185117721557617,
  0.015334606170654297,
  0.0169217586517334,
  0.02030801773071289,
  0.027342557907104492)}

In [8]:
# predict using SVD
algo = SVD()
trainset, testset = train_test_split(movie_surprise, test_size=.2, random_state=42)
algo.fit(trainset)
predictions_svd = algo.test(testset)
accuracy.rmse(predictions_svd)


RMSE: 0.3553


0.35531364904452173

In [23]:
predictions_svd[:10]

[Prediction(uid=9259, iid='http://www.wikidata.org/entity/Q313659', r_ui=1.0, est=0.8786273036836345, details={'was_impossible': False}),
 Prediction(uid=22146, iid='http://www.wikidata.org/entity/Q2698384', r_ui=1.0, est=0.8437773370731496, details={'was_impossible': False}),
 Prediction(uid=1717, iid='http://www.wikidata.org/entity/Q4378426', r_ui=0.0, est=0.7226972282302915, details={'was_impossible': False}),
 Prediction(uid=2696, iid='http://www.wikidata.org/entity/Q3989319', r_ui=0.0, est=0.4223618172309451, details={'was_impossible': False}),
 Prediction(uid=1848, iid='http://www.wikidata.org/entity/Q1214303', r_ui=1.0, est=0.8723642368596253, details={'was_impossible': False}),
 Prediction(uid=20896, iid='http://www.wikidata.org/entity/Q1198096', r_ui=1.0, est=0.8105581729384415, details={'was_impossible': False}),
 Prediction(uid=10623, iid='http://www.wikidata.org/entity/Q2699724', r_ui=1.0, est=0.9195302607146698, details={'was_impossible': False}),
 Prediction(uid=21476, ii

In [21]:
for pred in predictions_svd:
    if pred.uid == 6622:
        print(pred)

user: 6622       item: http://www.wikidata.org/entity/Q1366560 r_ui = 1.00   est = 0.75   {'was_impossible': False}


In [153]:
preds = {}
for pred in predictions_svd:
    if pred.est < 0.5:
        pred.est = 0
    else:
        pred.est = 1
predictions_svd[:20]

AttributeError: can't set attribute

In [13]:
from collections import defaultdict


def get_top_n(predictions, n=10):
    """
    Return the top-N recommendation for each user from a set of predictions.
    Args:
    predictions(list of Prediction objects): The list of predictions, as
    returned by the test method of an algorithm.
    n(int): The number of recommendation to output for each user. Default
    is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
    [(raw item id, rating estimation), ...] of size n.
    """
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n


In [26]:
# SVD Recommendations
top_n = get_top_n(predictions_svd, n=10)
top_n[6622]

[('http://www.wikidata.org/entity/Q1366560', 0.750930414146896)]

In [36]:
preds = []
for item in data['qid'].unique():
    pred = algo.predict(6622, item)
    if pred.est > 0.5:
        preds.append(pred)

In [38]:
# train using cross validation on NMF
algo = NMF()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)


Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    3.4531  3.4603  3.4470  3.4456  3.4451  3.4502  0.0058  
MAE (testset)     3.2491  3.2550  3.2385  3.2361  3.2350  3.2427  0.0079  
Fit time          1.15    1.12    1.18    1.19    1.17    1.16    0.02    
Test time         0.02    0.02    0.02    0.02    0.02    0.02    0.00    


{'test_rmse': array([3.45308395, 3.46034599, 3.44697498, 3.4456411 , 3.44514903]),
 'test_mae': array([3.24908661, 3.25498906, 3.2385198 , 3.23610842, 3.23496969]),
 'fit_time': (1.1528708934783936,
  1.117828130722046,
  1.176422357559204,
  1.1874279975891113,
  1.1736862659454346),
 'test_time': (0.020592689514160156,
  0.019394397735595703,
  0.019087791442871094,
  0.01924443244934082,
  0.020799636840820312)}

# Recommender based on Plots


In [2]:
# load the dataset from data/recommender/plots.csv
data = pd.read_csv('../data/recommender/plots.csv')
data


Unnamed: 0,qid,plot
0,http://www.wikidata.org/entity/Q532372,The film opens at a school where a boy is pick...
1,http://www.wikidata.org/entity/Q2362333,"Billy Tully (Keach), a boxer past his prime, g..."
2,http://www.wikidata.org/entity/Q1757747,"In the far reaches of outer space, two spacesh..."
3,http://www.wikidata.org/entity/Q2479210,Teenage lovers Jordan White and Amy Blue pick ...
4,http://www.wikidata.org/entity/Q1114179,Set in the South Pacific Ocean in the year 189...
...,...,...
10362,http://www.wikidata.org/entity/Q4313927,Deven Yuvvraaj (Salman Khan) is a struggling s...
10363,http://www.wikidata.org/entity/Q51668,The film opens in Gethsemane at night as Jesus...
10364,http://www.wikidata.org/entity/Q1111758,"Two white cops, Bob 'Uncle Bob' Hodges (Robert..."
10365,http://www.wikidata.org/entity/Q40071,The film follows various plot arcs all occurri...


In [27]:
# get imdb id for each movie from graph and append to dataframe
for qid in data['qid']:
    imdb = movie2imdb[URIRef(qid)]
    data.loc[data['qid'] == qid, 'imdb id'] = imdb

In [31]:
data

Unnamed: 0,qid,plot,imdb id
0,http://www.wikidata.org/entity/Q532372,The film opens at a school where a boy is pick...,tt0212346
1,http://www.wikidata.org/entity/Q2362333,"Billy Tully (Keach), a boxer past his prime, g...",tt0068575
2,http://www.wikidata.org/entity/Q1757747,"In the far reaches of outer space, two spacesh...",tt0059792
3,http://www.wikidata.org/entity/Q2479210,Teenage lovers Jordan White and Amy Blue pick ...,tt0112887
4,http://www.wikidata.org/entity/Q1114179,Set in the South Pacific Ocean in the year 189...,tt0102782
...,...,...,...
10362,http://www.wikidata.org/entity/Q4313927,Deven Yuvvraaj (Salman Khan) is a struggling s...,tt1105747
10363,http://www.wikidata.org/entity/Q51668,The film opens in Gethsemane at night as Jesus...,tt0335345
10364,http://www.wikidata.org/entity/Q1111758,"Two white cops, Bob 'Uncle Bob' Hodges (Robert...",tt0094894
10365,http://www.wikidata.org/entity/Q40071,The film follows various plot arcs all occurri...,tt0137338


In [147]:
# scrape IMDb ratigns for each movie and append to pandas dataframe
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re

def get_imdb_rating(movie):
    url = 'https://www.imdb.com/search/title/?title=' + movie.replace(' ', '+')
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup)
    try:
        rating = soup.find('div', class_='inline-block ratings-imdb-rating').text
        return float(rating)
    except:
        print(movie)
        return None

In [13]:
# load the graph from pickle
import pickle
with open('fixed_graph_crowd.pickle', 'rb') as f:
    graph = pickle.load(f)

In [15]:
# load the embeddings
import os
os.chdir('../data')
os.chdir('./ddis-graph-embeddings')
entity_emb = np.load('./entity_embeds.npy')
relation_emb = np.load('./relation_embeds.npy')
entity_file = './entity_ids.del'
relation_file = './relation_ids.del'

In [16]:
# define some prefixes
import rdflib
from rdflib import Graph, URIRef, Literal

WD = rdflib.Namespace('http://www.wikidata.org/entity/')
WDT = rdflib.Namespace('http://www.wikidata.org/prop/direct/')
DDIS = rdflib.Namespace('http://ddis.ch/atai/')
RDFS = rdflib.namespace.RDFS
SCHEMA = rdflib.Namespace('http://schema.org/')

In [17]:
ent2lbl = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}
rel2lbl = {rel: str(lbl) for rel, lbl in graph.subject_objects(RDFS.label)}
lbl2rel = {lbl: rel for rel, lbl in rel2lbl.items()}

In [22]:
# dictionary for movie entity to imdb id
movie2imdb = {ent: o for ent, o in graph.subject_objects(WDT.P345)}

In [26]:
movie2imdb[URIRef('http://www.wikidata.org/entity/Q532372')]

rdflib.term.Literal('tt0212346', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))

In [119]:
for qid in data['qid'][:10]:
    qid_s = qid.split('/')[-1]
    movie = ent2lbl[WD[qid_s]]
    rating = get_imdb_rating(movie)
    print(movie, rating)
    data.loc[data['qid'] == qid, 'imdb_rating'] = rating

Miss Congeniality 6.3
Fat City 7.3
Planet of the Vampires 6.2
The Doom Generation 6.0
Return to the Blue Lagoon 5.1
The Descendants 7.3
Don't Be a Menace to South Central While Drinking Your Juice in the Hood 6.5
Largo Winch II 6.1
The Wedding Ringer 6.6
Next Friday 6.1


In [120]:
data

Unnamed: 0,qid,plot,imdb_rating
0,http://www.wikidata.org/entity/Q532372,The film opens at a school where a boy is pick...,6.3
1,http://www.wikidata.org/entity/Q2362333,"Billy Tully (Keach), a boxer past his prime, g...",7.3
2,http://www.wikidata.org/entity/Q1757747,"In the far reaches of outer space, two spacesh...",6.2
3,http://www.wikidata.org/entity/Q2479210,Teenage lovers Jordan White and Amy Blue pick ...,6.0
4,http://www.wikidata.org/entity/Q1114179,Set in the South Pacific Ocean in the year 189...,5.1
...,...,...,...
10362,http://www.wikidata.org/entity/Q4313927,Deven Yuvvraaj (Salman Khan) is a struggling s...,
10363,http://www.wikidata.org/entity/Q51668,The film opens in Gethsemane at night as Jesus...,
10364,http://www.wikidata.org/entity/Q1111758,"Two white cops, Bob 'Uncle Bob' Hodges (Robert...",
10365,http://www.wikidata.org/entity/Q40071,The film follows various plot arcs all occurri...,


In [None]:
indices = pd.Series(data.index, index=data['qid']).drop_duplicates()
indices

qid
http://www.wikidata.org/entity/Q532372          0
http://www.wikidata.org/entity/Q2362333         1
http://www.wikidata.org/entity/Q1757747         2
http://www.wikidata.org/entity/Q2479210         3
http://www.wikidata.org/entity/Q1114179         4
                                            ...  
http://www.wikidata.org/entity/Q4313927     10362
http://www.wikidata.org/entity/Q51668       10363
http://www.wikidata.org/entity/Q1111758     10364
http://www.wikidata.org/entity/Q40071       10365
http://www.wikidata.org/entity/Q15270932    10366
Length: 10367, dtype: int64

In [None]:
# build a recommender system based on dataset and TF-IDF vectorizer
# Path: Jupyter Notebooks/Recommender.ipynb
# load required libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# load the dataset from data/recommender/plots.csv
data = pd.read_csv('../data/recommender/plots.csv')

# build a TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# replace NaN with empty string
data['plot'].fillna('', inplace=True)

# construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data['plot'])

# compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# build a 1-dimensional array with movie titles
indices = pd.Series(data.index, index=data['qid']).drop_duplicates()


In [None]:
# dump the cosine similarity matrix and indices into pickle files
import pickle
pickle.dump(cosine_sim, open('../data/recommender/cosine_sim.pickle', 'wb'))
pickle.dump(indices, open('../data/recommender/indices.pickle', 'wb'))

In [56]:
# build a recommender system based on dataset and TF-IDF vectorizer
# Path: Jupyter Notebooks/Recommender.ipynb
# load required libraries
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# load the dataset from data/recommender/plots.csv
data = pd.read_csv('../data/recommender/plots_expanded.csv')

# replace NaN with empty string
data.fillna('', inplace=True)

features = []
for feat in [data['Genres'], data['Directors']]:
    # build a TF-IDF vectorizer
    tfidf = TfidfVectorizer(stop_words='english')
    # construct the required TF-IDF matrices by fitting and transforming the features
    tfidf_matrix = tfidf.fit_transform(feat)
    # compute the cosine similarity matrix
    feature = linear_kernel(tfidf_matrix, tfidf_matrix)
    features.append(feature)

for feature in features:
    print(feature.shape)

(10367, 10367)
(10367, 10367)


In [62]:
# learn the hyperparameters
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# split the dataset into train and test set
X_train, X_test, y_train, y_test = train_test_split(
    cosine_sim, data['imdb_rating'], test_size=0.2, random_state=42)

# define the loss function
def loss(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)

# define the evaluation function
def evaluate(y_true, y_pred):
    print('MSE: ', mean_squared_error(y_true, y_pred))
    print('MAE: ', mean_absolute_error(y_true, y_pred))
    print('R2: ', r2_score(y_true, y_pred))

# define the hyperparameter search space
h1 = np.linspace(0, 1, 11)
h2 = np.linspace(0, 1, 11)
h3 = np.linspace(0, 1, 11)

# define the best hyperparameters
best_h1 = 0
best_h2 = 0
best_h3 = 0
best_loss = np.inf

# iterate over all hyperparameter combinations
for h1 in h1:
    for h2 in h2:
        for h3 in h3:
            # compute the cosine similarity matrix
            cosine_sim = (plot_feature * h1) + (features[0] * h2) + (features[1] * h3)
            # split the dataset into train and test set
            X_train, X_test, y_train, y_test = train_test_split(
                cosine_sim, data['imdb_rating'], test_size=0.2, random_state=42)
            # train the model
            model.fit(X_train, y_train)
            # predict the test set
            y_pred = model.predict(X_test)
            # compute the loss
            l = loss(y_test, y_pred)
            # check if the loss is better than the best loss
            if l < best_loss:
                # update the best loss
                best_loss = l
                # update the best hyperparameters
                best_h1 = h1
                best_h2 = h2
                best_h3 = h3

# print the best hyperparameters
print('Best h1: ', best_h1)
print('Best h2: ', best_h2)
print('Best h3: ', best_h3)

# compute the cosine similarity matrix
cosine_sim = (plot_feature * best_h1) + (features[0] * best_h2) + (features[1] * best_h3)

In [67]:
# compute the cosine similarity matrix
cosine_sim = (plot_feature * 0.5) + (features[0] * 0.3) + (features[1] * 0.2)

In [49]:
# load the cosine similarity matrix and indices from pickle files
import pickle
plot_feature = pickle.load(open('../data/recommender/cosine_sim.pickle', 'rb'))
indices = pickle.load(open('../data/recommender/indices.pickle', 'rb'))


In [73]:
# load the dataset from data/recommender/plots.csv
data = pd.read_csv('../data/recommender/plots.csv')

In [63]:
# function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # get the index of the movie that matches the title
    idx = indices[title]

    # get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # return the top 10 most similar movies
    return data['qid'].iloc[movie_indices]

In [68]:
get_recommendations('http://www.wikidata.org/entity/Q4941')

6384    http://www.wikidata.org/entity/Q18602670
5272      http://www.wikidata.org/entity/Q106571
5072      http://www.wikidata.org/entity/Q204398
572       http://www.wikidata.org/entity/Q181540
6550      http://www.wikidata.org/entity/Q107894
4899      http://www.wikidata.org/entity/Q180279
6810      http://www.wikidata.org/entity/Q102754
4299      http://www.wikidata.org/entity/Q212145
2989      http://www.wikidata.org/entity/Q332330
2677      http://www.wikidata.org/entity/Q151904
Name: qid, dtype: object