https://assaeunji.github.io/machine%20learning/2020-11-29-implicitfeedback/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import scipy.sparse as sparse
import random
import implicit
from implicit.als import AlternatingLeastSquares as ALS

In [2]:
########### 원본
# drop columns 전처리
def drop_columns(delete_columns:list,df:pd.DataFrame) -> pd.DataFrame:
    # 회의 때 얘기한 컬럼 전처리
    # None or 필요 없는 컬럼
    df = df.drop(columns=delete_columns)
    return df

def dict_to_column(columns:list,df:pd.DataFrame) -> pd.DataFrame:
    for col in columns:
        key_set = dict_to_set(col,df)
        df = set_to_column(col,key_set,df)
    return df

def dict_to_set(column:str,df:pd.DataFrame) -> set:
    key_set = set()
    for i in tqdm(df[column]):
        
        # column 내용이 dict일 경우
        if isinstance(i, dict):
            key_set |= set(i.keys())
        
        # column 내용이 None type 일 경우
        elif i == None:
            continue
        
        # column 내용이 [dict]로 감싸져있는 경우
        elif isinstance(i, list)&len(i)>0:
            if isinstance(i[0], str):
                continue
            key_set |= set(i[0].keys())
        
    return key_set

def set_to_column(column:str,key_set:set,df:pd.DataFrame) -> pd.DataFrame:
    for key in key_set:
        
        #중복인 경우 컬럼_중복컬럼으로 추가
        if key in df.columns:
            df[column+'_'+key] = df[column].apply(lambda x: x.get(key, None) if isinstance(x, dict) else None if x==None
                                       else ( None \
                                       if len(x)==0 else ( x[0].get(key, None) \
                                       if isinstance(x[0],dict)  else \
                                           None)) \
                                      )
        #중복이 아닌 경우
        else:
            df[key] = df[column].apply(lambda x: x.get(key, None) if isinstance(x, dict) else None if x==None
                                       else ( None \
                                       if len(x)==0 else ( x[0].get(key, None) \
                                       if isinstance(x[0],dict)  else \
                                           None)) \
                                       
                                      )

    df = df.drop(columns=[column])
    return df

# 컬럼안 key 값이 한 개일 경우 딕셔너리만 풉니다!
def key_to_element(element_columns:list,df:pd.DataFrame)->pd.DataFrame:
    for col in element_columns:
        key_set = dict_to_set(col, df)
        assert len(key_set)==1, f'{col}: key가 2개 이상이므로 dict_to_column 함수 이용하세요'
        for key in key_set:
            df[col] = df[col].apply(lambda x: x.get(key,None) if isinstance(x,dict) else None)
    return df

In [3]:
df = pd.read_csv('no_test_df.csv').iloc[:,1:]

In [4]:
df_book = df[df['name'] == '도서']

## 도서 필터링

In [5]:
# 고객과 아이템 얼마나 구매했는가
grouped_purchased = df_book.groupby(['customer_id','product_ids']).sum().reset_index()

In [6]:
customers = list(np.sort(grouped_purchased['customer_id'].unique()))
products = list (grouped_purchased['product_ids'].unique())
quantity = list(grouped_purchased['paid'])

rows = grouped_purchased['customer_id'].astype('category').cat.codes
cols = grouped_purchased['product_ids'].astype('category').cat.codes
print(len(customers)) # 2663
print(len(products))  # 223
purchase_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape = (len(customers),len(products)))
purchase_sparse #2663 * 223 행렬

2663
223


<2663x223 sparse matrix of type '<class 'numpy.int64'>'
	with 4512 stored elements in Compressed Sparse Row format>

In [7]:
# Sparsity: 얼마나 비어있나?
matrix_size = purchase_sparse.shape[0]* purchase_sparse.shape[1]
num_purchases = len(purchase_sparse.nonzero()[0])
sparsity = 100 * (1 - (num_purchases / matrix_size))
sparsity

99.2402108953623

In [8]:
als_model = ALS(factors=20, regularization=0.01, iterations = 100)
als_model.fit(purchase_sparse)

  0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
result = als_model.recommend(2, purchase_sparse[2], N=10)

In [10]:
result

(array([195, 206, 193, 201, 128, 188,  59, 172, 170, 134], dtype=int32),
 array([0.05436036, 0.04319233, 0.03490718, 0.0286657 , 0.02736679,
        0.02705139, 0.01702726, 0.01642221, 0.01533489, 0.01462266],
       dtype=float32))

In [11]:
products_json = pd.read_json("/fastcampus-data/products/products.json")
products_df = pd.DataFrame(products_json)

In [12]:
products_df = key_to_element(['_id'],products_df)

100%|██████████| 5141/5141 [00:00<00:00, 778922.69it/s]


In [13]:
def recommended_product(products, result):
    products_id = products[result[0][0]]
    return products_df[products_df['_id'] == products_id].loc[:,'meta_title']

In [14]:
recommended_product(products,result)

236    정골의학 핵심정리
Name: meta_title, dtype: object