# Movie Recommendation System

### Part 3: Evaluation

For the evaluation, I used stratified cross-validation and Top-N accuracy. For each group consisting of one reviewed item from the test data and 100 randomly sampled non-reviewed items, the model produced a ranked list of recommended items. The model evaluator then checked whether the reviewed item was among the top N items (a hit) in that ranked list of 101 candidates for the user. It computes the Top-N accuracy and recall for each user, and the per-user results are aggregated to evaluate the model.

In [55]:
# Number of non-reviewed items drawn at random and mixed with each held-out
# test item; passed as `sample_size` by ModelEvaluator.evaluate_model_for_user.
EVAL_RANDOM_SAMPLE_NON_REVIEWED_ITEMS = 100

class ModelEvaluator:
    """Evaluate a recommender with Top-N recall on held-out reviews.

    Each movie a user reviewed in the test set is mixed with a random sample
    of movies the user has not reviewed. The evaluator then checks whether the
    model places the held-out movie among the first N entries of that
    candidate list (a "hit") and reports recall@5 and recall@10.

    NOTE(review): per-user test rows and the list of users are read from the
    module-level ``rating_indexed_test`` frame, not from
    ``self.rating_test_df``. Presumably it is the same test data indexed by
    user id -- confirm before evaluating on a different split.
    """

    def __init__(self, rating_test_df, num_items=10000):
        """Store the test ratings and the size of the item catalogue.

        Args:
            rating_test_df: Test ratings, forwarded to ``get_items_reviewed``.
            num_items: Item ids are taken to be ``0 .. num_items - 1`` when
                drawing non-reviewed items. Defaults to 10000, the value that
                was previously hard-coded.
        """
        self.rating_test_df = rating_test_df
        self.num_items = num_items

    def get_not_reviewed_sample(self, person_id, sample_size, seed=42):
        """Return a reproducible random set of items the user has not reviewed.

        Args:
            person_id: User whose reviewed items are excluded.
            sample_size: Number of item ids to draw.
            seed: Seed for the private random generator; the same seed yields
                the same sample for the same user.

        Returns:
            A set of ``sample_size`` item ids.

        Raises:
            ValueError: If fewer than ``sample_size`` non-reviewed items exist.
        """
        # NOTE(review): only reviews found in self.rating_test_df are excluded;
        # whether that frame also covers training reviews cannot be told here.
        reviewed_items = get_items_reviewed(person_id, self.rating_test_df)
        non_reviewed_items = set(range(self.num_items)) - set(reviewed_items)

        # random.sample() needs a sequence (sets raise TypeError on 3.11+);
        # sorting also makes the draw independent of set iteration order.
        candidates = sorted(non_reviewed_items)

        # A private generator keeps the draw reproducible without resetting
        # the global random state for the rest of the program.
        rng = random.Random(seed)
        return set(rng.sample(candidates, sample_size))

    def _verify_hit_top_n(self, movie_id, recommended_items, topn):
        """Locate ``movie_id`` in the ranked list and flag a top-N hit.

        Args:
            movie_id: Item to look for.
            recommended_items: Ranked sequence of item ids.
            topn: Size of the cut-off window.

        Returns:
            ``(hit, index)`` where ``index`` is the first position of
            ``movie_id`` (``-1`` when absent) and ``hit`` is 1 if
            ``0 <= index < topn``, else 0.
        """
        index = next(
            (pos for pos, rec in enumerate(recommended_items) if rec == movie_id),
            -1,
        )
        hit = int(0 <= index < topn)
        return hit, index

    def evaluate_model_for_user(self, model, person_id):
        """Compute hit counts and recall@5 / recall@10 for one user.

        Args:
            model: Recommender exposing ``recommend_items(person_id, topn=...)``
                that returns a frame with a ``movie_id`` column (assumed to be
                ordered best-first).
            person_id: User to evaluate; must be present in the index of
                ``rating_indexed_test``.

        Returns:
            Dict with keys ``hits@5_count``, ``hits@10_count``,
            ``reviewed_count``, ``recall@5`` and ``recall@10``.
        """
        # Items the user reviewed in the test set. The list-style lookup makes
        # .loc return a DataFrame even when the user has a single test row
        # (a scalar label would yield a Series and break the column access).
        person_testset = rating_indexed_test.loc[[person_id]]
        movies_test = person_testset['movie_id'].values
        movies_cnt_test = len(movies_test)

        # Full ranked recommendation list from the model for this user.
        person_recs_df = model.recommend_items(person_id, topn=1000000)

        # Items assumed to be irrelevant to the user. The sample depends only
        # on the user and the fixed default seed, so it is identical for every
        # test item and is drawn once instead of once per item.
        non_reviewed_sample = self.get_not_reviewed_sample(
            person_id, sample_size=EVAL_RANDOM_SAMPLE_NON_REVIEWED_ITEMS)

        hits_at_5_count = 0
        hits_at_10_count = 0

        for item_id in movies_test:
            # Candidate pool: the held-out item plus the negative sample.
            items_to_filter_recs = non_reviewed_sample | {item_id}

            # Keep only recommendations from the pool, preserving rank order.
            in_pool = person_recs_df['movie_id'].isin(items_to_filter_recs)
            valid_recs = person_recs_df.loc[in_pool, 'movie_id'].values

            # Is the held-out item among the Top-N of the filtered ranking?
            hit_at_5, _ = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, _ = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the share of held-out items ranked among the Top-N when
        # mixed with the non-relevant sample.
        recall_at_5 = hits_at_5_count / float(movies_cnt_test)
        recall_at_10 = hits_at_10_count / float(movies_cnt_test)

        person_metrics = {'hits@5_count': hits_at_5_count,
                          'hits@10_count': hits_at_10_count,
                          'reviewed_count': movies_cnt_test,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        """Evaluate ``model`` for every user in ``rating_indexed_test``.

        Args:
            model: Recommender exposing ``recommend_items`` and
                ``get_model_name``.

        Returns:
            ``(global_metrics, detailed_results_df)``: a dict with
            ``modelName``, ``recall@5`` and ``recall@10`` computed over all
            users' hits and reviewed counts, and a per-user DataFrame sorted
            by ``reviewed_count`` in descending order.
        """
        people_metrics = []
        for person_id in rating_indexed_test.index.unique().values:
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('reviewed_count', ascending=False)

        # Global recall pools hits over all users (micro-average), so users
        # with more test reviews weigh more.
        total_reviewed = float(detailed_results_df['reviewed_count'].sum())
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / total_reviewed
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / total_reviewed

        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}

        return global_metrics, detailed_results_df

<br>
<br>
Using the model evaluator, I computed recall scores for each model and saved them.

In [45]:
# Build the evaluator on the test ratings and score the collaborative-filtering
# recommender; evaluate_model returns (global metrics dict, per-user DataFrame).
model_eval = ModelEvaluator(rating_test_df)
cf_result, cf_result_df = model_eval.evaluate_model(cf_rec)
# Bare expression: shown as the notebook cell output.
cf_result

{'modelName': 'Collaborative Filtering',
 'recall@5': 0.35171620558145167,
 'recall@10': 0.5193262304013561}

In [53]:
# Score the content-based recommender with the same evaluator instance.
cbf_result, cbf_result_df = model_eval.evaluate_model(cbf_rec)
# Bare expression: shown as the notebook cell output.
cbf_result

{'modelName': 'Content-Based Filtering',
 'recall@5': 0.05691930504267813,
 'recall@10': 0.1111447424178219}

In [66]:
# Score the hybrid recommender with the same evaluator instance.
# NOTE(review): the recorded output for this cell reports identical recall@5
# and recall@10 values, which looks suspicious -- check whether hb_rec returns
# enough ranked items before trusting this result.
hb_rec_result, hb_rec_result_df = model_eval.evaluate_model(hb_rec)
# Bare expression: shown as the notebook cell output.
hb_rec_result

{'modelName': 'Hybrid Model',
 'recall@5': 0.018130637447787395,
 'recall@10': 0.018130637447787395}

In [68]:
# Persist every model's [global metrics dict, per-user results DataFrame] pair
# in a single pickle file, keyed by a short model tag.
with open('modelresult.pkl', 'wb') as fout:
    pickle.dump({
        'cf': [cf_result, cf_result_df],
        'cbf': [cbf_result, cbf_result_df],
        'hyb': [hb_rec_result, hb_rec_result_df]
    }, fout)