In [15]:
from collections import defaultdict
import itertools
import numpy as np
from platformdirs import user_state_dir
from sklearn.ensemble import  RandomForestRegressor as RFR


from base_recommender import BaseRecommender
from sub import user_movie

from collaborative_filter import user_movie_matrix
from util.models import RecommendResult

class RFRRecommender(BaseRecommender):
    def recommend(self, dataset, **kwargs):
        train_set = dataset.train[["user_id","movie_id"]]
        test_set = dataset.test[["user_id",'movie_id']]
        train_y = dataset.train.rating
        user_movie_matrix = dataset.train.pivot(index="user_id", columns= "movie_id", values="rating")
        all_set = user_movie_matrix.stack(dropna=False).reset_index()[["user_id",'movie_id']]

        train_x = train_set.copy()
        test_x = test_set.copy()
        all_x = all_set.copy()

        #이 3개의 x들에 3개의 열 train에 대한 평균, 최대, 최소를 추가하고자함

        user_stats = user_movie_matrix.agg(['mean','min','max'], axis=1).reset_index()
        user_stats.columns=["user_id","user_mean",'user_min','user_max']
        movie_stats = user_movie_matrix.agg(['mean','min','max']).T.reset_index()
        movie_stats.columns = ['movie_id','movie_mean','movie_min', 'movie_max']


        train_x.merge(user_stats, on="user_id",how="left")
        train_x.merge(movie_stats, on="movie_id", how="left")

        test_x.merge(user_stats, on="user_id",how="left")
        test_x.merge(movie_stats, on="movie_id", how="left")

        all_x.merge(user_stats, on="user_id",how="left")
        all_x.merge(movie_stats, on="movie_id", how="left")

        # train_avg_rate = train_y.mean()
        # test_x.fillna(train_avg_rate, inplace=True)

        movie_genres = dataset.item_content[["movie_id",'genre']]
        genres = set(itertools.chain(*movie_genres['genre']))
        for genre in genres:
            movie_genres[f"is_{genre}"] = movie_genres['genre'].apply(lambda x:genre in x)
        movie_genres.drop(columns="genre",inplace=True)

        train_x.merge(movie_genres,on="movie_id")
        test_x.merge(movie_genres,on="movie_id")
        all_x.merge(movie_genres,on="movie_id")

        train_x.drop(columns=["movie_id","user_id"])
        test_x.drop(columns=["movie_id","user_id"])
        all_x.drop(columns=["movie_id","user_id"])

        reg = RFR(n_jobs=-1, random_state=0)
        reg.fit(train_x, train_y)

        test_pred = reg.predict(test_x)
        test_x['rating_pred']=test_pred

        all_pred = reg.predict(all_x)
        all_x['rating_pred'] = all_pred

        pred_matrix = all_x.pivot(index="user_id", columns="movie_id",values="rating_pred")

        pred_love_items = defaultdict(list)
        user_eval_movies = dataset.train.groupby("user_id").agg({"movie_id":list})['movie_id'].to_dict()

        for user_id in dataset.train.user_id.unique():
            movie_indexes = np.argsort(-pred_matrix.loc[user_id,:]).values
            for movie_idx in movie_indexes:
                movie_id = user_movie_matrix.columns[movie_idx]
                if movie_id not in user_eval_movies[user_id]:
                    pred_love_items[user_id].append(movie_id)
                if len(pred_love_items[user_id])==10:
                    break

        return RecommendResult(
            rating=test_pred,
            user_love_items=pred_love_items
        )




In [16]:
RFRRecommender().eval()

  movies=pd.read_csv(os.path.join(self.data_path,"movies.dat"), encoding="latin-1", names=cols,
  tags = pd.read_csv(os.path.join(self.data_path, "tags.dat"), encoding='latin-1',
  ratings = pd.read_csv(os.path.join(self.data_path, "ratings.dat"), encoding='latin-1',
  all_set = user_movie_matrix.stack(dropna=False).reset_index()[["user_id",'movie_id']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_genres[f"is_{genre}"] = movie_genres['genre'].apply(lambda x:genre in x)


rmse: 1.116, recall: 0.018, precision:0.006


In [17]:
user_num = len(user_movie_matrix.index)
item_num = len(user_movie_matrix.columns)
non_null_items = user_num*item_num-user_movie_matrix.isnull().sum().sum()
non_null_ratio = non_null_items/(user_num*item_num)

print(f'사요자수 ={user_num}, 아이템수={item_num}, 밀도={non_null_ratio:.2f}')


사요자수 =1000, 아이템수=6669, 밀도=0.02
