In [1]:
import os
import sys

sys.path.insert(0, f'{os.environ.get("HOME")}/workspace/recommendation-study')

In [3]:
from util.data import DataLoader

dataset = DataLoader().load()

In [9]:
user_movie_matrix = dataset.train.pivot(index='user_id', columns='movie_id', values='rating')
user_movie_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,62000,62113,62293,62344,62394,62801,62803,63113,63992,64716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,3.0,,,,...,,,,,,,,,,


In [10]:
user_movie_matrix[user_movie_matrix < 4] = 0
user_movie_matrix[user_movie_matrix.isnull()] = 0
user_movie_matrix[user_movie_matrix >= 4] = 1
user_movie_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,62000,62113,62293,62344,62394,62801,62803,63113,63992,64716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
from mlxtend.frequent_patterns import apriori

freq_movies = apriori(user_movie_matrix, min_support=0.1, use_colnames=True)
freq_movies.head()

Unnamed: 0,support,itemsets
0,0.263,(1)
1,0.124,(6)
2,0.107,(10)
3,0.132,(17)
4,0.112,(21)


In [12]:
freq_movies = freq_movies.sort_values(by='support', ascending=False)
freq_movies.head()

Unnamed: 0,support,itemsets
42,0.415,(593)
23,0.379,(318)
21,0.369,(296)
19,0.361,(260)
25,0.319,(356)


In [19]:
from mlxtend.frequent_patterns import association_rules

rules = association_rules(freq_movies, metric='lift', min_threshold=1)
rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(296),(593),0.369,0.415,0.249,0.674797,1.626016,0.095865,1.798875,0.610143
1,(593),(296),0.415,0.369,0.249,0.6,1.626016,0.095865,1.5775,0.65812
2,(593),(318),0.415,0.379,0.247,0.595181,1.570398,0.089715,1.534018,0.620887
3,(318),(593),0.379,0.415,0.247,0.651715,1.570398,0.089715,1.679659,0.584893
4,(296),(318),0.369,0.379,0.226,0.612466,1.616006,0.086149,1.602441,0.604105


In [20]:
rules = rules.sort_values(by='lift', ascending=False)
rules = rules[['antecedents', 'consequents', 'lift']]
rules.head()

Unnamed: 0,antecedents,consequents,lift
635,(4993),(5952),5.45977
634,(5952),(4993),5.45977
1413,"(1291, 260)","(1196, 1198)",4.669188
1412,"(1196, 1198)","(1291, 260)",4.669188
1415,"(260, 1198)","(1291, 1196)",4.171359


In [34]:
from util.models import Dataset, RecommendResult
from collections import Counter
from collections import defaultdict
import numpy as np
from recommend.base import BaseRecommender

np.random.seed(0)

'''
user가 최근에 4점 이상을 준 영화 5편을 apriori 알고리즘의 input으로 사용
'''
class AprioriRecommender(BaseRecommender):
    def recommend(self, dataset: Dataset, k: int, **kwargs) -> RecommendResult:
        min_support = kwargs.get('min_support', 0.1)
        min_lift = kwargs.get('min_lift', 1.0)

        user_movie_matrix = dataset.train.pivot(index='user_id', columns='movie_id', values='rating')
        user_movie_matrix[user_movie_matrix < 4] = 0
        user_movie_matrix[user_movie_matrix.isnull()] = 0
        user_movie_matrix[user_movie_matrix >= 4] = 1
        
        freq_movies = apriori(user_movie_matrix, min_support=min_support, use_colnames=True)

        rules = association_rules(freq_movies, metric='lift', min_threshold=min_lift)

        pred_user2items = defaultdict(list)
        user_evaluated_movies = dataset.train.groupby('user_id').agg({'movie_id': list})['movie_id'].to_dict()

        high_rating = dataset.train[dataset.train.rating >= 4]

        for user_id, data in high_rating.groupby('user_id'):
            input_movie_ids = data.sort_values(by='timestamp')['movie_id'].to_list()[-5:]
            matched = rules.antecedents.apply(lambda x: bool(set(x).intersection(input_movie_ids)))

            consequent_movies = []

            for i, row in rules[matched].sort_values(by='lift', ascending=False).iterrows():
                consequent_movies.extend(row['consequents'])
            
            counter = Counter(consequent_movies)

            for movie_id, count in counter.most_common():
                if movie_id not in user_evaluated_movies[user_id]:
                    pred_user2items[user_id].append(movie_id)
                if len(pred_user2items[user_id]) >= 10:
                    break
        
        # 전체 예측값을 test ratings를 그대로 주었으므로 rmse는 0임
        return RecommendResult(dataset.test.rating, pred_user2items)

In [35]:
params = [(0.1, 1.0), (0.1, 1.2), (0.08, 1.0), (0.08, 1.2)]
for min_support, min_lift in params:
    metrics = AprioriRecommender().run_sample(min_support=min_support, min_lift=min_lift)
    print(f'min_support={min_support}, min_lift={min_lift} => {metrics}')

rmse: 0.000, precision@K: 0.011, recall@K: 0.034
min_support=0.1, min_lift=1.0 => None
rmse: 0.000, precision@K: 0.011, recall@K: 0.034
min_support=0.1, min_lift=1.2 => None


KeyboardInterrupt: 