In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

import scipy.sparse as sparse
import random
import implicit
from implicit.als import AlternatingLeastSquares as ALS

%cd /home/user_3/medistream-recsys/Script
from preprocessing import drop_columns,dict_to_column,dict_to_set,set_to_column,key_to_element

# Show up to 300 rows and 100 columns when a DataFrame is displayed.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

/home/user_3/medistream-recsys/Script


# 1. Load Data

In [2]:
# Product catalogue, loaded so that product names can be looked up.
products_df = key_to_element(
    ['_id'],
    pd.read_json("/fastcampus-data/products/products.json"),
)
# Source frame that the filtering steps below start from.
df = pd.read_json('/fastcampus-data/select_column_version_3.json')

100%|██████████| 5141/5141 [00:00<00:00, 760596.71it/s]


In [3]:
# Lower bound on the payment date: only orders paid after this are kept.
date_state = "2022-05-12"

# Parse the payment timestamp and keep only rows that have one (paid orders).
df['date_paid'] = pd.to_datetime(df['date_paid'])
df_only_paid = df.loc[df['date_paid'].notna()]
# Restrict to the recent window (the original note describes it as three months of data).
df_date = df_only_paid.loc[df_only_paid['date_paid'] > date_state]
# Keep orders that are flagged as paid and not cancelled.
is_paid = df_date['paid'] == True
not_cancelled = df_date['cancelled'] == False
complete_df = df_date.loc[is_paid & not_cancelled]
# Keep only the book category ('도서').
only_book = complete_df.loc[complete_df['name'] == '도서']

# One row per (customer, product): drop repeat purchases of the same item,
# then order chronologically.
df_duplicated_book = only_book.drop_duplicates(subset=['customer_id', 'product_ids'])
df_book = df_duplicated_book.sort_values(by='date_paid').reset_index(drop=True)

# Alternative kept from the original: use both the book and the consumables categories.
# df_book = complete_df[complete_df['name'].isin(['도서','소모품'])].sort_values(by='date_paid')

# 2. Train / Test Split

In [13]:
from datetime import datetime, timedelta

# Time-based split: the final three weeks of paid orders form the test period,
# everything before the cutoff is the training period.
date = df_book['date_paid'].max() - timedelta(weeks=3)
train_before_preprocess = df_book[df_book['date_paid'] < date]
test_before_preprocess = df_book[df_book['date_paid'] >= date]

# Keep only customers with MORE THAN ONE purchase row in the training period.
# The counts are computed once and reused (the original computed them twice).
# Despite its name, `train_drop_row_index` holds the customer ids to KEEP.
purchase_counts = train_before_preprocess['customer_id'].value_counts()
train_drop_row_index = purchase_counts[purchase_counts > 1].index
# NOTE(review): a later cell reassigns `train = train_before_preprocess`, which
# discards this filter — confirm which version of `train` is intended.
train = train_before_preprocess[train_before_preprocess['customer_id'].isin(train_drop_row_index)]

In [14]:
# Duplicate-item check: the number of distinct product_ids should equal the
# number of distinct name_x values.
df_book['product_ids'].nunique(dropna=False), df_book['name_x'].nunique(dropna=False)

(252, 252)

In [15]:
# Distinct products seen in the training period vs. the test period.
(
    train_before_preprocess['product_ids'].nunique(dropna=False),
    test_before_preprocess['product_ids'].nunique(dropna=False),
)

(243, 130)

In [16]:
# Count the products that occur in the test period but never in the training period.
test_product_set = set(test_before_preprocess['product_ids'].unique())
len(test_product_set.difference(train_before_preprocess['product_ids'].unique()))

9

In [17]:
# Remove test rows whose product never appears in the training period.
only_test_items = set(test_before_preprocess['product_ids'].unique()).difference(
    train_before_preprocess['product_ids'].unique()
)
keep_mask = ~test_before_preprocess['product_ids'].isin(only_test_items)
test = test_before_preprocess.loc[keep_mask]
# NOTE(review): this rebinds `train` to the unfiltered training frame, replacing the
# multi-purchase customer filter applied to `train` in the split cell — confirm intended.
train = train_before_preprocess

In [18]:
# Distinct customers in the filtered test set vs. the training period.
test['customer_id'].nunique(dropna=False), train_before_preprocess['customer_id'].nunique(dropna=False)

(491, 2323)

In [19]:
##################################################################################################################

# 3. 도서 Sparse matrix 만들기

In [20]:
# Aggregate purchases per (customer, product) pair.
grouped_purchased = train.groupby(['customer_id','product_ids']).sum().reset_index()

# Derive the label lists and the matrix coordinates from the SAME categoricals so
# that customers[i] names row i and products[j] names column j.
# The original built `products` from unique() (first-appearance order) while the
# column codes follow the categorical's own category order, so products[col]
# could name the wrong product.
customer_cat = grouped_purchased['customer_id'].astype('category')
product_cat = grouped_purchased['product_ids'].astype('category')
customers = list(customer_cat.cat.categories)
products = list(product_cat.cat.categories)
quantity = list(grouped_purchased['paid'])

rows = customer_cat.cat.codes
cols = product_cat.cat.codes
print(len(customers))
print(len(products))

matrix_shape = (len(customers), len(products))
# Most-popular matrix: cell value is the summed 'paid' column for that pair.
most_popular_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=matrix_shape)
# ALS matrix: binary interaction, 1 wherever the pair occurs.
purchase_sparse = sparse.csr_matrix(
    (np.ones(len(quantity), dtype=np.int64), (rows, cols)),
    shape=matrix_shape,
)
purchase_sparse

2323
243


<2323x243 sparse matrix of type '<class 'numpy.int64'>'
	with 5024 stored elements in Compressed Sparse Row format>

In [21]:
# Sparsity: percentage of (customer, product) cells that hold no purchase.
n_rows, n_cols = purchase_sparse.shape
matrix_size = n_rows * n_cols
num_purchases = purchase_sparse.nonzero()[0].size
sparsity = 100 * (1 - (num_purchases / matrix_size))
sparsity

99.10999151444935

# 4. MP (Most Popular) 추천

In [28]:
# Column indices of the 100 products with the largest total quantity, best first.
# Column sums are taken on the sparse matrix directly, so the full
# customers x products matrix is never materialised as a dense array.
item_totals = np.asarray(most_popular_sparse.sum(axis=0)).ravel()
most_popular_index = pd.Series(item_totals).sort_values(ascending=False).head(100).index

In [30]:
# Distinct product ids across the book data, and the distinct customers in the test set.
item_ids = df_book.product_ids.unique()
user_ids = test.customer_id.unique()

In [31]:
# Ground truth for the test set: for each test customer, the products they bought.
ground_trues = []
for user_id in user_ids:
    purchased = test.loc[test['customer_id'] == user_id, 'product_ids']
    ground_trues.append({'id': user_id, 'items': purchased.tolist()})

In [32]:
# Test-set predictions: every customer receives the same most-popular list
# (the products selected by most_popular_index), not a random sample.
# The list is resolved once outside the loop; each customer gets its own copy so
# that later mutation of one entry cannot affect the others.
most_popular_items = [products[num] for num in most_popular_index]
predict_list = []
for user_id in user_ids:
    predict_list.append({'id': user_id, 'items': list(most_popular_items)})