# 51-yes-votes-evaluation

#### This notebook will be used to evaluate the models used on datasets that contain user ratings.

### Install packages & run necessary notebooks

In [1]:
# import packages
import pandas as pd
import numpy as np

In [None]:
# run notebook to get pre-defined functions
%%capture
%run 50-get-5recommendation-using-euclidian-distance.ipynb

### Read in datasets

In [59]:
# read in data

path = 'data/'

kmodes = 'yesVotes_kmode_final_df.csv'
kmodes_kmeans = 'yesVotes_kmode_kmeans_final_df.csv'


kmodes_data = pd.read_csv(path + kmodes)
kmodesKmeans_data = pd.read_csv(path + kmodes_kmeans)

imdb_recs = 'imdb_recommendations_tid.csv'
imdb_data = pd.read_csv(path + imdb_recs)

## MODEL 1: KMODES ONLY

### Evaluating for one movie id as test

In [116]:
# test movie id
test = 'tt0374180'

In [119]:
# run model to get predictions
_, test1 = get_5recommendation_noKmeans(kmodes_data, test)
test1

['tt0041776', 'tt0300015', 'tt0847877', 'tt0116683', 'tt0041182']

In [120]:
# look up movie id in the imdb recommendations dataset
test2 = imdb_data.loc[imdb_data['id'] == test]
test2

Unnamed: 0,id,recommendation1,recommendation2,recommendation3,recommendation4,recommendation5,recommendation6,recommendation7,recommendation8,recommendation9,recommendation10,recommendation11,recommendation12,imdb_recs
21,tt0374180,tt0108517,tt0100369,tt0087075,tt0091142,tt0381966,tt0115610,tt0303021,tt0450405,tt0470132,tt0257516,tt0336940,tt0116365,"[tt0108517, tt0100369, tt0087075, tt0091142, t..."


The predictions from our model are not listed in the IMDB recommendations at all. This happened for several movies that I checked.

### Run predictions on full dataset (3k movie ids) and evaluate

In [60]:
# run function to get predictions for all movies - store as dictionary
recs = {}
kmodes_ids = kmodes_data['id'].tolist()
for i in kmodes_ids:
    _, recs_5 = get_5recommendation_noKmeans(kmodes_data, i)
    recs[i] = recs_5

In [None]:
# make dataframe with preds as a list
recs_data = pd.DataFrame(list(recs.items()),columns = ['id','recommendations'])

In [86]:
# convert all 12 recommendations to list to match preds df
cols = list(imdb_data.filter(regex='recommendation').columns)
imdb_data['imdb_recs'] = imdb_data[cols].values.tolist()

In [88]:
# merge imdb preds and model preds
compare_df = recs_data.merge(imdb_data[['id','imdb_recs']], how='inner',on='id')

In [97]:
# for each movie id compare the two lists to see how many are accurate
for i in range(len(compare_df)):
    recs_sum = sum(a == b for a, b in zip(compare_df['recommendations'][i], compare_df['imdb_recs'][i]))
    compare_df['acc'] = recs_sum / 5

Accuracy is 0.0 for all movies ids.

## MODEL 2: KMODES + KMEANS

### Evaluating for one movie id as test

In [191]:
# test movie id
test = 'tt0077631'

In [192]:
# run model on test movie id
_, test1 = get_5recommendation_yesKmeans(kmodesKmeans_data, test)
test1

['tt0076416', 'tt0049367', 'tt0021025', 'tt0115610', 'tt0211718']

In [161]:
# filter imdb dataset for test movie id
test2 = imdb_data.loc[imdb_data['id'] == test]
test2

Unnamed: 0,id,recommendation1,recommendation2,recommendation3,recommendation4,recommendation5,recommendation6,recommendation7,recommendation8,recommendation9,recommendation10,recommendation11,recommendation12,imdb_recs
2979,tt0053213,tt0054381,tt0030845,tt0059290,tt0053640,tt0049013,tt0051947,tt0052846,tt0059161,tt0052884,tt0051058,tt0061827,tt2175675,"[tt0054381, tt0030845, tt0059290, tt0053640, t..."


Same as above. No predictions found in IMDB recommendations.

### Run model on full dataset and evaluate

In [124]:
# run function to get predictions for all movies - store as dictionary
recs_kmeans = {}
kmodesKmeans_ids = kmodesKmeans_data['id'].tolist()
for i in kmodesKmeans_ids:
    _, recsKmeans_5 = get_5recommendation_yesKmeans(kmodesKmeans_data, i)
    recs_kmeans[i] = recsKmeans_5

In [125]:
# make dataframe with preds as a list
recsKmeans_data = pd.DataFrame(list(recs.items()),columns = ['id','recommendations'])

In [126]:
# merge imdb preds and model preds
compareKmeans_df = recsKmeans_data.merge(imdb_data[['id','imdb_recs']], how='inner',on='id')

In [127]:
# for each movie id compare the two lists to see how many are accurate
for i in range(len(compareKmeans_df)):
    recsKmeans_sum = sum(a == b for a, b in zip(compareKmeans_df['recommendations'][i], compareKmeans_df['imdb_recs'][i]))
    compareKmeans_df['acc'] = recsKmeans_sum / 5

Accuracy is also 0.0 for all movie ids.