In [1]:
import pandas as pd
import numpy as np
import random
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from collections import defaultdict #data colector
from sklearn.utils import shuffle
from surprise.reader import Reader
from surprise.model_selection import train_test_split



In [2]:
filters = pd.read_csv('imbd_amazon_movie_vectors.csv')
movies_repository_df = filters[['title_clean']]
ratings_app = pd.read_csv('ratings_app.csv')
history = pd.read_csv('NetflixViewingHistory.csv')

In [3]:
filters[filters['title_clean']=='Nocturna']

Unnamed: 0,title_origin_imbd,rating,year,users_rating,votes,metascore,img_url,countries,languages,actors,...,isin_deep_profound,isin_entertaining_music,isin_realistic_settings,isin_experience_excitement,isin_fun,deep_profound,entertaining_music,realistic_settings,experience_excitement,fun
806,Nocturna,Not Rated,2015.0,3.6,340,,https://m.media-amazon.com/images/M/MV5BMTUzNT...,['USA'],['English'],"['Mike Doyle', 'Estella Warren', 'Massimo Dobr...",...,10,20,10,10,0,1.0,1.0,1.0,1.0,0.0


# SOMEONE'S OWN NETFLIX HISTORY

In [4]:
def clean_titles_external_history(history):
    '''
    Desc.:
    Input:
    Output:
    '''
    external_history_df = history.copy()
    # cleaning title
    external_history_df['title_clean'] = external_history_df.Title.str.replace(
        '[^a-zA-Z0-9]', ' ').replace(regex=r'\Season.*$', value='').replace('   ', ' ').replace(
        '  ', ' ').str.strip(
    )

    #variable: seen
    external_history_df['overall'] = 1

    # clening dataframe
    external_history_df.drop_duplicates(subset=['title_clean'], inplace=True)
    external_history_df.drop('Date', axis=1, inplace=True)
    print('Titles in the History: {0}'.format(
        external_history_df['title_clean'].nunique()))
    return external_history_df

In [5]:
external_history_df = clean_titles_external_history(history)
external_history_df

Titles in the History: 600


Unnamed: 0,Title,title_clean,overall
0,The Predator,The Predator,1
1,The Matrix,The Matrix,1
2,The Ritual,The Ritual,1
3,Beverly Hills Ninja,Beverly Hills Ninja,1
4,MINDHUNTER: Season 2: Episode 1,MINDHUNTER,1
...,...,...,...
1888,Valley Uprising,Valley Uprising,1
1889,Human Planet: Oceans - Into the Blue,Human Planet Oceans Into the Blue,1
1890,Maze Runner: The Scorch Trials,Maze Runner The Scorch Trials,1
1891,Hell on Wheels: Season 1: Pilot,Hell on Wheels,1


# COMMON TITLES IN OUR MOVIE REPOSITORY

In [6]:
def construct_common_prediction(movies_repository, external_history, user_name = 'appuser'):
    '''
    Desc. Makes two DFs: oNe with matching titles in our repository the user watched on Netflix
            and one with titles for prediction
    Input:
    Output:
    '''

    common_titles_df = pd.merge(movies_repository_df, external_history,
                                how='inner', left_on='title_clean', right_on='title_clean')
    common_titles_df['reviewerName'] = user_name
    common_titles_df = common_titles_df.loc[:, [
        "reviewerName", "title_clean", "overall"]]
    print('Titles matching our repository: {0}'.format(
        common_titles_df['title_clean'].nunique()))

    titles_for_prections_df = movies_repository_df[~movies_repository_df['title_clean'].isin(
        common_titles_df['title_clean'])]
    titles_for_prections_df['overall'] = 0
    titles_for_prections_df['reviewerName'] = user_name
    titles_for_prections_df = titles_for_prections_df.loc[:, [
        "reviewerName", "title_clean", "overall"]]
    print('Titles to be predictedy: {0}'.format(
        titles_for_prections_df['title_clean'].nunique()))

    return common_titles_df, titles_for_prections_df

In [7]:
common_titles_df, titles_for_prections_df  = construct_common_prediction(movies_repository_df, external_history_df)
common_titles_df



Titles matching our repository: 113
Titles to be predictedy: 4191


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,reviewerName,title_clean,overall
0,appuser,Harry Potter and the Order of the Phoenix,1
1,appuser,21 Jump Street,1
2,appuser,Grown Ups 2,1
3,appuser,Police Academy,1
4,appuser,Total Recall,1
...,...,...,...
108,appuser,Super 8,1
109,appuser,The Matrix Reloaded,1
110,appuser,The Bourne Identity,1
111,appuser,The Girl with the Dragon Tattoo,1


In [8]:
titles_for_prections_df

Unnamed: 0,reviewerName,title_clean,overall
0,appuser,For Love of the Game,0
1,appuser,The Lord of the Rings The Return of the King,0
2,appuser,Her,0
4,appuser,Heat,0
5,appuser,The Circle,0
...,...,...,...
4298,appuser,Indiana Jones and the Temple of Doom,0
4299,appuser,Thor,0
4300,appuser,12 Years a Slave,0
4301,appuser,Donnie Darko,0


In [9]:
def get_app_user_vect(common_titles_df, titles_for_prections_df, melt = True):
    
    app_user_vector = pd.pivot_table(
        pd.concat([common_titles_df, titles_for_prections_df]), values='overall', 
        index='reviewerName', columns='title_clean')
    app_user_vector = app_user_vector.replace(0, np.nan)
    
    if melt == True:
        app_user_vector = pd.melt(app_user_vector.reset_index(), value_vars=app_user_vector.columns,
                          id_vars='reviewerName', value_name='overall')
        print(app_user_vector.overall.value_counts(dropna =False))
    return app_user_vector


In [10]:
app_user_vector = get_app_user_vect(common_titles_df, titles_for_prections_df, melt = False)
app_user_df = get_app_user_vect(common_titles_df, titles_for_prections_df, melt = True)


NaN    4191
1.0     113
Name: overall, dtype: int64


In [11]:
app_user_df

Unnamed: 0,reviewerName,title_clean,overall
0,appuser,10 Attitudes,
1,appuser,10 Cloverfield Lane,
2,appuser,10 Items or Less,
3,appuser,10 Things I Hate About You,
4,appuser,10 Years,
...,...,...,...
4299,appuser,Zookeeper,
4300,appuser,Zootopia,
4301,appuser,Zorba the Greek,
4302,appuser,Zulu Dawn,


In [12]:
def melt_ratings(ratings):
    """
    Desc.: Makes all combinations between users and titles with values 0 if nor rated (no roview - 
    we assume the user'didnt watch), 1 if review so user watched th movie
    """
    
    ratings = ratings[ratings['title_clean'].isin(movies_repository_df['title_clean'].unique())]
    ratings.drop_duplicates(subset=['reviewerName', 'title_clean'], inplace = True)
    # IF DOING PROBABILISTIC 
    ratings['overall'] = 1
    
    rating_pivot = pd.pivot_table(
    ratings, values='overall', index='reviewerName', columns='title_clean',  fill_value=0)
    rating_melt = pd.melt(rating_pivot.reset_index(), value_vars=rating_pivot.columns,
                      id_vars='reviewerName', value_name='overall')
    
    
    return rating_melt

In [13]:
ratings = melt_ratings(ratings = ratings_app)
ratings

Unnamed: 0,reviewerName,title_clean,overall
0,Oreo Cookie Chocolate Peanut Butter Pie 2018,10 Cloverfield Lane,0
1,! MR. KNOW IT ALL ;-b,10 Cloverfield Lane,0
2,...,10 Cloverfield Lane,0
3,007,10 Cloverfield Lane,0
4,1,10 Cloverfield Lane,0
...,...,...,...
3195131,william,Zulu Dawn,0
3195132,wiredweird,Zulu Dawn,0
3195133,xfiler93,Zulu Dawn,0
3195134,zombie phreak,Zulu Dawn,0


In [14]:
ratings.nunique(dropna = False)

reviewerName    1783
title_clean     1792
overall            2
dtype: int64

def make_predictions(algo_SVD, titles_for_prections_df, n = 100):
    pred_app = titles_for_prections_df['title_clean'].apply(
        lambda x: algo_SVD.predict(uid='appuser', iid=x))
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in pred_app:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[: n]

    pred_app_df = pd.DataFrame([(id, pair[0], pair[1]) for id, row in top_n.items() for pair in row], columns=[
                               "reviewerName", "title_clean", "rat_pred"]).merge(filters, on='title_clean', how='left')
    return pred_app_df

In [15]:
make_predictions(algo_SVD, titles_for_prections_df, n = 1000)

NameError: name 'make_predictions' is not defined

### Similarity Modules

In [16]:
def get_and_convert_ids(ratings):
    
    reviewer_id = ratings.loc[:,['reviewerName']]
    reviewer_id['reviewer_id'] = reviewer_id['reviewerName'].astype('category').cat.codes
    
    title_id = ratings.loc[:,['title_clean']]
    title_id['title_id'] =title_id['title_clean'].astype('category').astype('category').cat.codes

    ratings_ids = ratings.copy()
    ratings_ids['reviewer_id'] = reviewer_id['reviewer_id']
    ratings_ids['title_id'] = title_id['title_id'] 
    ratings_ids = ratings_ids.loc[:,['reviewer_id','title_id','overall' ]]
    
    
    reviewer_id.drop_duplicates(inplace = True)
    title_id.drop_duplicates(inplace = True)
    
    return reviewer_id, title_id, ratings_ids

In [17]:
reviewer_id, title_id, ratings_ids = get_and_convert_ids(ratings)

In [18]:
ratings_ids

Unnamed: 0,reviewer_id,title_id,overall
0,0,0,0
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
...,...,...,...
3195131,1778,1791,0
3195132,1779,1791,0
3195133,1780,1791,0
3195134,1781,1791,0


In [19]:
def ids_user_watched(common_titles_df, title_id):

    titles_name_id = pd.merge(common_titles_df, title_id,
                              how='inner', left_on='title_clean', right_on='title_clean')
    titles_name_id.drop('overall', axis = 1, inplace = True)
    return titles_name_id


In [20]:
ids_user_watched_df = ids_user_watched(common_titles_df, title_id)
ids_user_watched_df

Unnamed: 0,reviewerName,title_clean,title_id
0,appuser,Harry Potter and the Order of the Phoenix,574
1,appuser,21 Jump Street,10
2,appuser,Grown Ups 2,553
3,appuser,Total Recall,1644
4,appuser,Minions,841
...,...,...,...
80,appuser,Super 8,1230
81,appuser,The Matrix Reloaded,1476
82,appuser,The Bourne Identity,1295
83,appuser,The Girl with the Dragon Tattoo,1380


In [21]:
# Surprise (matrix factorization package for RecSys.) data loading
# reader = Reader(rating_scale=(0, 1)) #line_format by default order of the fields
# data = Dataset.load_from_df(ratings_ids[['reviewer_id','title_id','overall' ]], reader=reader)

# trainset, testset = train_test_split(data, test_size=0.2)

# app_user_df_shuffle = app_user_df.copy()
# app_user_df_shuffle['overall'] = shuffle(app_user_df_shuffle['overall'])

# data_appuser = Dataset.load_from_df(app_user_df[["reviewerName","title_clean","overall"]], reader=reader)
# trainset_appuser = data_appuser.build_full_trainset()
# testset_appuser = trainset.build_anti_testset()



In [22]:
ratings_ids.nunique()

reviewer_id    1783
title_id       1792
overall           2
dtype: int64

In [33]:
from surprise import dump
from surprise.reader import Reader
from surprise import Dataset
from surprise import dump

In [24]:
# Surprise (matrix factorization package for RecSys.) data loading
reader = Reader(rating_scale=(0, 1)) #line_format by default order of the fields
data = Dataset.load_from_df(ratings_ids[['reviewer_id','title_id','overall' ]], reader=reader)

trainset = data.build_full_trainset()

In [None]:
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate


In [133]:
# %%time

# my_k = 5
# my_min_k = 5


# algo_knn_means = KNNWithMeans(k = my_k, min_k = my_min_k, 
#     sim_options = {
#         'name':'pearson', 'user_based': False}, verbose = True
#                              )
    
# results = cross_validate(
#     algo = algo_knn_means, data = data, measures=['RMSE'], 
#     cv= 3 , return_train_measures=True
#     )



Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


KeyboardInterrupt: 

In [25]:
results['test_rmse'].mean()

NameError: name 'results' is not defined

In [26]:
# algo_knn_means.fit(trainset)


In [25]:
from surprise import KNNBasic

algo_cosine = KNNBasic(sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               } 
                 )

algo_msd = KNNBasic(sim_options = {'name': 'msd',
               'user_based': False  # compute  similarities between items
               } 
                 )

algo_pearson = KNNBasic(sim_options = {'name': 'pearson',
               'user_based': False  # compute  similarities between items
               } 
                 )


In [34]:
%%time
# algo_cosine.fit(trainset)
# dump.dump('algo_cosine_trained', algo= algo_cosine)
_, algo_cosine = dump.load('algo_cosine_trained')


CPU times: user 741 ms, sys: 222 ms, total: 964 ms
Wall time: 964 ms


In [36]:
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
ratings_ids.pivot_table(index = 'reviewer_id')ratings_ids[['reviewer_id','title_id','overall' ]]

ValueError: No group keys passed!

In [35]:
ids_user_watched_df['title_id'].apply(
    lambda x: algo_cosine.get_neighbors(x, 20))

0     [572, 573, 575, 1653, 1204, 1150, 1464, 676, 5...
1     [1250, 11, 712, 1475, 805, 699, 847, 1734, 780...
2     [1734, 742, 1422, 1405, 1250, 1297, 712, 61, 1...
3     [1008, 1328, 780, 805, 1209, 1161, 345, 386, 1...
4     [100, 1475, 509, 712, 556, 121, 793, 983, 179,...
                            ...                        
80    [1782, 1008, 847, 1187, 1260, 980, 1629, 780, ...
81    [1195, 981, 1680, 1463, 68, 1465, 1464, 575, 1...
82    [1463, 1445, 295, 139, 256, 829, 954, 1046, 16...
83    [1332, 1161, 1394, 853, 370, 1567, 106, 847, 1...
84    [1069, 518, 1674, 789, 1180, 1648, 439, 1463, ...
Name: title_id, Length: 85, dtype: object

In [27]:
%%time
# algo_msd.fit(trainset)
# dump.dump('algo_msd_trained', algo= algo_msd)
_, algo_msd = dump.load('algo_msd_trained')


CPU times: user 778 ms, sys: 254 ms, total: 1.03 s
Wall time: 1.03 s


In [28]:
# %%time
# algo_pearson.fit(trainset)
# dump.dump('algo_pearson_trained', algo= algo_pearson)
_, algo_pearson = dump.load('algo_pearson_trained')



Computing the pearson similarity matrix...
Done computing similarity matrix.
CPU times: user 5min 29s, sys: 2.45 s, total: 5min 31s
Wall time: 5min 31s


In [146]:
ids_user_watched_df['knn_cosine'] = ids_user_watched_df['title_id'].apply(
    lambda x: algo_cosine.get_neighbors(x, 20))
ids_user_watched_df['knn_msd'] = ids_user_watched_df['title_id'].apply(
    lambda x: algo_msd.get_neighbors(x, 20))
ids_user_watched_df['knn_pearson'] = ids_user_watched_df['title_id'].apply(
    lambda x: algo_pearson.get_neighbors(x, 20))

In [147]:
cosine_exploded = ids_user_watched_df.loc[:, [
    'reviewerName', 'title_clean', 'title_id', 'knn_cosine']].explode('knn_cosine')

msd_exploded = ids_user_watched_df.loc[:, [
    'reviewerName', 'title_clean', 'title_id', 'knn_msd']].explode('knn_msd')

pearson_exploded = ids_user_watched_df.loc[:, [
    'reviewerName', 'title_clean', 'title_id', 'knn_pearson']].explode('knn_pearson')



In [148]:
exploded_df =  pd.concat([cosine_exploded, msd_exploded,pearson_exploded], axis=1, join='inner')
exploded_df = exploded_df.loc[:,~exploded_df.columns.duplicated()]
exploded_df


Unnamed: 0,reviewerName,title_clean,title_id,knn_cosine,knn_msd,knn_pearson
0,appuser,Harry Potter and the Order of the Phoenix,574,572,573,572
0,appuser,Harry Potter and the Order of the Phoenix,574,573,572,573
0,appuser,Harry Potter and the Order of the Phoenix,574,575,1304,575
0,appuser,Harry Potter and the Order of the Phoenix,574,1653,1278,1150
0,appuser,Harry Potter and the Order of the Phoenix,574,1204,585,1653
...,...,...,...,...,...,...
84,appuser,Snatch,1168,3,369,770
84,appuser,Snatch,1168,685,714,1102
84,appuser,Snatch,1168,1291,963,1188
84,appuser,Snatch,1168,1227,1093,941


In [149]:
cosine_df = pd.DataFrame(exploded_df['knn_cosine'].value_counts(normalize=True)).reset_index().rename(columns={'index': 'title_id',
                                                                                                               'knn_cosine': 'cosine_freq'})
cosine_df

Unnamed: 0,title_id,cosine_freq
0,712,0.015882
1,1345,0.015882
2,699,0.014706
3,556,0.012941
4,793,0.012353
...,...,...
669,529,0.000588
670,531,0.000588
671,532,0.000588
672,534,0.000588


In [150]:
msd_df = pd.DataFrame(exploded_df['knn_msd'].value_counts(normalize = True)).reset_index().rename(columns={'index': 'title_id',
                                                                                     'knn_msd': 'msd_freq'})

msd_df


Unnamed: 0,title_id,msd_freq
0,1224,0.044118
1,1177,0.042353
2,1439,0.040588
3,463,0.037647
4,196,0.035882
...,...,...
237,1267,0.000588
238,1266,0.000588
239,1257,0.000588
240,1720,0.000588


In [151]:
pearson_df = pd.DataFrame(exploded_df['knn_pearson'].value_counts(normalize = True)).reset_index().rename(columns={'index': 'title_id',
                                                                                     'knn_pearson': 'pearson_freq'})
pearson_df


Unnamed: 0,title_id,pearson_freq
0,712,0.007059
1,1345,0.007059
2,725,0.006471
3,793,0.006471
4,699,0.006471
...,...,...
817,1263,0.000588
818,1261,0.000588
819,718,0.000588
820,719,0.000588


In [164]:
similarities_df.columns[similarities_df.columns.str.contains('_freq')]


Index(['cosine_freq', 'msd_freq', 'pearson_freq'], dtype='object')

In [176]:
similarities_df = pd.merge(cosine_df, msd_df).merge(pearson_df)
similarities_df['mean_sim'] = similarities_df.loc[:,['cosine_freq', 'msd_freq', 'pearson_freq']].mean(axis = 1)
similarities_df.sort_values('mean_sim',ascending=False, inplace = True)
similarities_df

Unnamed: 0,title_id,cosine_freq,msd_freq,pearson_freq,mean_sim
59,1483,0.000588,0.014118,0.001176,0.005294
48,911,0.000588,0.014118,0.000588,0.005098
44,1762,0.000588,0.012941,0.001176,0.004902
2,1073,0.004118,0.000588,0.005294,0.003333
63,359,0.000588,0.008824,0.000588,0.003333
...,...,...,...,...,...
56,627,0.000588,0.000588,0.000588,0.000588
55,978,0.000588,0.000588,0.000588,0.000588
47,1689,0.000588,0.000588,0.000588,0.000588
52,830,0.000588,0.000588,0.000588,0.000588


In [178]:
pd.merge(similarities_df, title_id)[:30]

Unnamed: 0,title_id,cosine_freq,msd_freq,pearson_freq,mean_sim,title_clean
0,1483,0.000588,0.014118,0.001176,0.005294,The Million Dollar Hotel
1,911,0.000588,0.014118,0.000588,0.005098,Nocturna
2,1762,0.000588,0.012941,0.001176,0.004902,Wildflower
3,1073,0.004118,0.000588,0.005294,0.003333,Run All Night
4,359,0.000588,0.008824,0.000588,0.003333,Deuces Wild
5,51,0.004706,0.000588,0.004706,0.003333,A Walk Among the Tombstones
6,415,0.000588,0.007059,0.001765,0.003137,Emmet Otter s Jug Band Christmas
7,1195,0.005294,0.000588,0.003529,0.003137,Spider Man 2
8,1266,0.003529,0.000588,0.003529,0.002549,The Amityville Horror
9,182,0.001765,0.004118,0.001765,0.002549,Big Trouble


In [118]:
title_id[title_id['title_id'].isin([699,556,1475,
                                   1177, 1224, 1439,
                                  1195,1248,1209])]

Unnamed: 0,title_clean,title_id
991348,Guardians of the Galaxy,556
1246317,John Wick,699
2098591,Somewhere Tomorrow,1177
2130685,Spider Man 2,1195
2155647,Star Trek Into Darkness,1209
2182392,Straightman,1224
2225184,Taxi Driver,1248
2565737,The Last Brickmaker in America,1439
2629925,The Martian,1475


In [88]:
title_id

Unnamed: 0,title_clean,title_id
0,10 Cloverfield Lane,0
1783,10 Things I Hate About You,1
3566,101 Dalmatians,2
5349,102 Dalmatians,3
7132,12 Angry Men,4
...,...,...
3186221,Young Frankenstein,1787
3188004,Young Sherlock Holmes,1788
3189787,Zookeeper,1789
3191570,Zootopia,1790


In [2]:
%%time
import pandas as pd
from  predict_external_user import main as recommend
user_hist = pd.read_csv('NetflixViewingHistory.csv')
recommend(user_hist)

Titles in the History: 600
Titles matching our repository: 113
Titles to be predictedy: 4191


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles_for_prections_df['overall'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles_for_prections_df['reviewerName'] = user_name


CPU times: user 4.9 s, sys: 1.18 s, total: 6.08 s
Wall time: 6.1 s


Unnamed: 0,title_id,cosine_freq,msd_freq,pearson_freq,mean_sim,title_clean
0,1483,0.000588,0.014118,0.001176,0.005294,The Million Dollar Hotel
1,911,0.000588,0.014118,0.000588,0.005098,Nocturna
2,1762,0.000588,0.012941,0.001176,0.004902,Wildflower
3,1073,0.004118,0.000588,0.005294,0.003333,Run All Night
4,359,0.000588,0.008824,0.000588,0.003333,Deuces Wild
...,...,...,...,...,...,...
68,627,0.000588,0.000588,0.000588,0.000588,Hustle Flow
69,978,0.000588,0.000588,0.000588,0.000588,Pieces of April
70,1689,0.000588,0.000588,0.000588,0.000588,Unthinkable
71,830,0.000588,0.000588,0.000588,0.000588,Memorial Day


In [3]:
filters = pd.read_csv('imbd_amazon_movie_vectors.csv')
filters

Unnamed: 0,title_origin_imbd,rating,year,users_rating,votes,metascore,img_url,countries,languages,actors,...,isin_deep_profound,isin_entertaining_music,isin_realistic_settings,isin_experience_excitement,isin_fun,deep_profound,entertaining_music,realistic_settings,experience_excitement,fun
0,For Love of the Game,PG-13,1999.0,6.6,31110,43.0,https://m.media-amazon.com/images/M/MV5BZDgzY2...,['USA'],['English'],"['Kevin Costner', 'Kelly Preston', 'John C. Re...",...,0,0,0,0,10,0.0,0.0,0.0,0.0,0.0
1,The Lord of the Rings: The Return of the King,PG-13,2003.0,8.9,1596400,94.0,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,"['New Zealand', 'USA']","['English', 'Quenya', 'Old English', 'Sindarin']","['Noel Appleby', 'Ali Astin', 'Sean Astin', 'D...",...,0,0,0,20,0,0.0,0.0,0.0,1.0,0.0
2,Her,R,2013.0,8.0,519781,90.0,https://m.media-amazon.com/images/M/MV5BMjA1Nz...,['USA'],['English'],"['Joaquin Phoenix', 'Lynn Adrianna Freedman', ...",...,0,0,0,20,0,0.0,0.0,0.0,1.0,0.0
3,Harry Potter and the Order of the Phoenix,PG-13,2007.0,7.5,490882,71.0,https://m.media-amazon.com/images/M/MV5BMTM0NT...,"['UK', 'USA']",['English'],"['Daniel Radcliffe', 'Harry Melling', 'Jason B...",...,0,10,0,10,0,0.0,1.0,0.0,1.0,0.0
4,Heat,R,1995.0,8.2,558892,76.0,https://m.media-amazon.com/images/M/MV5BMDJjNW...,['USA'],"['English', 'Spanish']","['Al Pacino', 'Robert De Niro', 'Val Kilmer', ...",...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4299,Thor,PG-13,2011.0,7.0,727860,57.0,https://m.media-amazon.com/images/M/MV5BOGE4Nz...,['USA'],['English'],"['Chris Hemsworth', 'Natalie Portman', 'Tom Hi...",...,0,0,0,30,20,0.0,0.0,0.0,2.0,1.0
4300,12 Years a Slave,R,2013.0,8.1,621019,96.0,https://m.media-amazon.com/images/M/MV5BMjExMT...,"['UK', 'USA']",['English'],"['Chiwetel Ejiofor', 'Dwight Henry', 'Dickie G...",...,0,0,10,0,0,0.0,0.0,1.0,0.0,0.0
4301,Donnie Darko,R,2001.0,8.0,725828,88.0,https://m.media-amazon.com/images/M/MV5BZjZlZD...,['USA'],['English'],"['Jake Gyllenhaal', 'Holmes Osborne', 'Maggie ...",...,0,0,10,10,10,0.0,0.0,1.0,1.0,0.0
4302,Jurassic World,PG-13,2015.0,7.0,567087,59.0,https://m.media-amazon.com/images/M/MV5BNzQ3OT...,['USA'],['English'],"['Chris Pratt', 'Bryce Dallas Howard', 'Irrfan...",...,0,0,0,0,10,0.0,0.0,0.0,0.0,0.0


# How filters for the users may help:

In [291]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

    Let's say the user is in the user makes a query for these filters:
    'deep_profound'  = 0
    'entertaining_music' = 0
    'realistic_settings', = 5
    'experience_excitement' = 0
    'fun' = 2
We cen the use nearest neigbour algorithm to find the closest neighbour movies (by the asscociated filters and score) to the user's query

In [302]:
user_query = [0,0,5,5]
user_query

[0, 0, 5, 5, 1]

In [303]:
def apply_filters_predictions(user_query, pred_, n_neighbors = 50):
    '''
    Desc.: -Apply the user's query for filters
        - scales and transform the filterss
        - transform the query
        - finds by default 50 closes neigbours (by the filters) to the users query
    
    '''
    
    scaler = StandardScaler()
    filters_array = np.array(pred_.iloc[:, -5: ])
    
    filters_scaled = scaler.fit_transform(filters_array)
    nbrs = NearestNeighbors(n_neighbors = n_neighbors, algorithm='ball_tree')
    nbrs.fit(filters_scaled)
    
    user_query = np.array(user_query).reshape(1, -1)
    user_query_scaled = scaler.transform(user_query.reshape(1, -1))
    distances, indices = nbrs.kneighbors(user_query_scaled)
    
    filtered_recommendations = pred_.iloc[np.array(indices[0]),:]
    return filtered_recommendations

In [307]:


filtered_recommendations = apply_filters_predictions(user_query = user_query , pred_ = pred_, n_neighbors = 50)
filtered_recommendations #50 nearest neighbours to the user's query

Unnamed: 0,reviewerName,title_clean,rat_pred,users_rating,genre,tagline,imdb_url,deep_profound,entertaining_music,realistic_settings,experience_excitement,fun
87839,appuser,The Ex,1.0,5.5,"['Comedy', 'Romance']","On January 19, it's not business, it's personal.",https://www.imdb.com/title/tt0458364/,0.0,0.0,2.0,2.0,2.0
87291,appuser,Kingdom Come,1.0,8.2,[],,https://www.imdb.com/title/tt0156709/,1.0,0.0,2.0,2.0,2.0
87210,appuser,The Proposal,1.0,5.4,"['Crime', 'Drama', 'Thriller']",Their Mistake Was Trusting Her.,https://www.imdb.com/title/tt0179435/,0.0,1.0,4.0,1.0,2.0
87703,appuser,The Sweetest Thing,1.0,5.2,"['Comedy', 'Romance']",First came the rules of love. Now comes the fun.,https://www.imdb.com/title/tt0253867/,1.0,0.0,1.0,2.0,4.0
87793,appuser,Elektra,1.0,4.7,"['Action', 'Adventure', 'Crime', 'Fantasy']",Before she can find peace she will wage war.,https://www.imdb.com/title/tt0357277/,0.0,1.0,1.0,3.0,2.0
87801,appuser,The Alamo,1.0,6.9,"['Adventure', 'Drama', 'History', 'War', 'West...",They stood firing until they could stand no lo...,https://www.imdb.com/title/tt0053580/,0.0,0.0,4.0,2.0,0.0
87383,appuser,Learning to Drive,1.0,6.4,"['Comedy', 'Drama', 'Romance']",It's never too late to begin a new adventure.,https://www.imdb.com/title/tt3062976/,1.0,0.0,4.0,2.0,0.0
87595,appuser,Space Cowboys,1.0,6.4,"['Action', 'Adventure', 'Thriller']",Space will never be the same,https://www.imdb.com/title/tt0186566/,0.0,1.0,4.0,1.0,1.0
87478,appuser,Lake Placid,1.0,5.7,"['Action', 'Comedy', 'Horror']",Part Mystery Part Thriller Parts Missing,https://www.imdb.com/title/tt0139414/,0.0,1.0,1.0,3.0,1.0
87176,appuser,Ant Man,1.0,7.3,"['Action', 'Adventure', 'Comedy', 'Sci-Fi']",No shield. No armor. No problem.,https://www.imdb.com/title/tt0478970/,0.0,0.0,0.0,2.0,4.0


# We may even recommend shows to users out of the rating matrix!!!

Let's say the *cold-start* user is in the user makes a query for these filters:
    * 'deep_profound'  = 0
    * 'entertaining_music' = 0
    * 'realistic_settings', = 5
    * 'experience_excitement' = 0
    * 'fun' = 2
    
 We will just query that in the df with movie vectors

In [305]:
def apply_filters_cold_user(user_query, filters, n_neighbors = 50):
    '''
    Desc.: -Apply the user's query for filters
        - scales and transform the filterss
        - transform the query
        - finds by default 50 closes neigbours (by the filters) to the users query
    
    '''
    
    scaler = StandardScaler()
    filters_array = np.array(pred_.loc[:,'deep_profound':'fun'])
    
    filters_scaled = scaler.fit_transform(filters_array)
    nbrs = NearestNeighbors(n_neighbors = n_neighbors, algorithm='ball_tree')
    nbrs.fit(filters_scaled)
    
    user_query = np.array(user_query).reshape(1, -1)
    user_query_scaled = scaler.transform(user_query.reshape(1, -1))
    distances, indices = nbrs.kneighbors(user_query_scaled)
    
    filtered_recommendations = filters.iloc[np.array(indices[0]),:]
    return filtered_recommendations

In [306]:
user_query = [0,0,5,5,5]
user_query

[0, 0, 5, 5, 5]

# How filters for the users may help:

In [117]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

    Let's say the user is in the user makes a query for these filters:
    'deep_profound'  = 0
    'entertaining_music' = 0
    'realistic_settings', = 5
    'experience_excitement' = 0
    'fun' = 2
We cen the use nearest neigbour algorithm to find the closest neighbour movies (by the asscociated filters and score) to the user's query

In [198]:
user_query = [0,0,5,0,2]
user_query

[0, 0, 5, 0, 2]

In [71]:
def apply_filters_predictions(user_query, pred_, n_neighbors = 50):
    '''
    Desc.: -Apply the user's query for filters
        - scales and transform the filterss
        - transform the query
        - finds by default 50 closes neigbours (by the filters) to the users query
    
    '''
    
    scaler = StandardScaler()
    filters_array = np.array(pred_.iloc[:, -5: ])
    
    filters_scaled = scaler.fit_transform(filters_array)
    nbrs = NearestNeighbors(n_neighbors = n_neighbors, algorithm='ball_tree')
    nbrs.fit(filters_scaled)
    
    user_query = np.array(user_query).reshape(1, -1)
    user_query_scaled = scaler.transform(user_query.reshape(1, -1))
    distances, indices = nbrs.kneighbors(user_query_scaled)
    
    filtered_recommendations = pred_.iloc[np.array(indices[0]),:]
    return filtered_recommendations

In [209]:


filtered_recommendations = apply_filters_predictions(user_query = user_query , pred_ = pred_, n_neighbors = 50)
filtered_recommendations #50 nearest neighbours to the user's query

Unnamed: 0,reviewerName,title_clean,rat_pred,users_rating,genre,tagline,imdb_url,deep_profound,entertaining_music,realistic_settings,experience_excitement,fun
1973936,Christopher Beck,Heaven Knows Mr Allison,4.866923,7.3,"['Adventure', 'Drama', 'War']",They were alone on this Pacific Island... trap...,https://www.imdb.com/title/tt0050490/,1.0,0.0,4.0,0.0,0.0
1973918,Christopher Beck,Maria Full of Grace,4.892121,7.4,"['Crime', 'Drama']",How far will she go before she's gone too far?,https://www.imdb.com/title/tt0390221/,0.0,0.0,4.0,1.0,0.0
1974137,Christopher Beck,Gladiator,4.65789,6.5,"['Action', 'Drama', 'Sport']","Against all odds, they became friends. Agains...",https://www.imdb.com/title/tt0104346/,0.0,0.0,4.0,1.0,0.0
1973945,Christopher Beck,61,4.851283,7.8,"['Biography', 'Drama', 'History', 'Sport']",Mickey Mantle and Roger Maris. Why did America...,https://www.imdb.com/title/tt0250934/,0.0,1.0,4.0,0.0,0.0
1974019,Christopher Beck,How to Steal a Million,4.751196,7.6,"['Comedy', 'Crime', 'Romance']",S-S-S-H-H-H-H-H - Meet a couple of smart opera...,https://www.imdb.com/title/tt0060522/,0.0,0.0,2.0,0.0,2.0
1973986,Christopher Beck,Return to Me,4.788072,6.9,"['Comedy', 'Drama', 'Romance']",For the strength to be strong; for the will to...,https://www.imdb.com/title/tt0122459/,0.0,0.0,2.0,1.0,1.0
1974135,Christopher Beck,The Breakfast Club,4.660897,7.9,"['Comedy', 'Drama']","They were five total strangers, with nothing i...",https://www.imdb.com/title/tt0088847/,0.0,0.0,2.0,0.0,0.0
1973968,Christopher Beck,The Rookie,4.813316,5.9,"['Action', 'Crime', 'Drama', 'Thriller']",,https://www.imdb.com/title/tt0100514/,0.0,0.0,2.0,0.0,0.0
1973966,Christopher Beck,The Longest Day,4.815613,7.8,"['Action', 'Drama', 'History', 'War']",42 STARS IN THE LONGEST DAY,https://www.imdb.com/title/tt0056197/,0.0,0.0,2.0,0.0,0.0
1974182,Christopher Beck,True Grit,4.613134,7.4,"['Adventure', 'Drama', 'Western']",The strangest trio ever to track a killer. A f...,https://www.imdb.com/title/tt0065126/,0.0,0.0,2.0,0.0,0.0


# We may even recommend shows to users out of the rating matrix!!!

Let's say the *cold-start* user is in the user makes a query for these filters:
    * 'deep_profound'  = 0
    * 'entertaining_music' = 0
    * 'realistic_settings', = 5
    * 'experience_excitement' = 0
    * 'fun' = 2
    
 We will just query that in the df with movie vectors

In [211]:
def apply_filters_cold_user(user_query, filters, n_neighbors = 50):
    '''
    Desc.: -Apply the user's query for filters
        - scales and transform the filterss
        - transform the query
        - finds by default 50 closes neigbours (by the filters) to the users query
    
    '''
    
    scaler = StandardScaler()
    filters_array = np.array(pred_.loc[:,'deep_profound':'fun'])
    
    filters_scaled = scaler.fit_transform(filters_array)
    nbrs = NearestNeighbors(n_neighbors = n_neighbors, algorithm='ball_tree')
    nbrs.fit(filters_scaled)
    
    user_query = np.array(user_query).reshape(1, -1)
    user_query_scaled = scaler.transform(user_query.reshape(1, -1))
    distances, indices = nbrs.kneighbors(user_query_scaled)
    
    filtered_recommendations = filters.iloc[np.array(indices[0]),:]
    return filtered_recommendations

In [220]:
user_query = [0,0,2,0,2]
user_query

[0, 0, 2, 0, 2]

In [221]:
apply_filters_cold_user(user_query, filters = , n_neighbors = 50)

Unnamed: 0,title_clean,users_rating,genre,tagline,imdb_url,deep_profound,entertaining_music,realistic_settings,experience_excitement,fun
158,Battle Los Angeles,5.7,"['Action', 'Sci-Fi']",It's not war. It's survival.,https://www.imdb.com/title/tt1217613/,0.0,1.0,0.0,0.0,0.0
219,American Gigolo,6.3,"['Crime', 'Drama', 'Mystery', 'Romance', 'Thri...",Richard Gere is American Gigolo. His business ...,https://www.imdb.com/title/tt0080365/,1.0,0.0,1.0,1.0,0.0
275,Bachelor Mother,7.6,"['Comedy', 'Romance']","""And a little child shall lead them""--into the...",https://www.imdb.com/title/tt0031067/,2.0,0.0,0.0,0.0,2.0
372,Mackenna s Gold,6.8,"['Action', 'Adventure', 'Romance', 'Western']",Mackenna's Gold is beyond reach! Mackenna's Go...,https://www.imdb.com/title/tt0064615/,0.0,0.0,0.0,1.0,0.0
314,Necessary Roughness,6.2,"['Comedy', 'Sport']","This gang of loners, loonies and losers are ab...",https://www.imdb.com/title/tt0102517/,1.0,0.0,0.0,0.0,0.0
293,Very Bad Things,6.4,"['Comedy', 'Crime', 'Thriller']",For boys who should know better...,https://www.imdb.com/title/tt0124198/,1.0,0.0,0.0,0.0,2.0
125,The Omen,7.5,['Horror'],It is the greatest mystery of all because no h...,https://www.imdb.com/title/tt0075005/,0.0,0.0,0.0,1.0,0.0
111,What Women Want,6.4,"['Comedy', 'Fantasy', 'Romance']",He has the power to hear everything women are ...,https://www.imdb.com/title/tt0207201/,0.0,0.0,0.0,0.0,2.0
215,Dangerous Minds,6.5,"['Biography', 'Drama']",She Broke The Rules... And Changed Their Lives.,https://www.imdb.com/title/tt0112792/,0.0,0.0,0.0,2.0,0.0
83,Red Dawn,5.4,"['Action', 'Drama']",Welcome To the Home of the Brave,https://www.imdb.com/title/tt1234719/,1.0,1.0,0.0,1.0,0.0
