In [26]:
import gensim.corpora as corpora
import gensim

In [27]:
def preprocess(data):
    # 사용자 아이디가 없는 데이터 제외
    # quantity 음수인 데이터 제외
    df = data[~(data['CustomerID'].isnull())&(data['Quantity']>0)]
    df['CustomerID'] = df.CustomerID.astype(int)
    df['StockCode'] = df['StockCode'].astype(str)
    df = df[['InvoiceNo', 'StockCode', 'Quantity', 'CustomerID', 'InvoiceDate']]
    df['ym'] = df['InvoiceDate'].apply(lambda x: str(x)[:7])
    return df

def split_train_test(data):
    train_data = data[(data["ym"]>='2011-09')&(data["ym"]<='2011-11')]
    test_data = data[(data['ym']=='2011-12')]
    return train_data, test_data

def train_groupby(train_data, test_data):
    train_groupby = train_data.groupby(['CustomerID'])
    return train_groupby


def make_train_data(data):
    doc_list = []
    for user_id, user_df in data:
        stockcodes = user_df['StockCode'].values.tolist()
        doc_list.append(stockcodes)
    id2word = corpora.Dictionary(doc_list)
    corpus = [id2word.doc2bow(doc) for doc in doc_list]
    return id2word, corpus

In [28]:
def get_precision(X:list, Y:list):
    _intersection = set(X).intersection(Y)
    return len(_intersection) / len(Y)

def get_recall(X:list, Y:list):
    _intersection = set(X).intersection(Y)
    return len(_intersection) / len(X)

In [29]:
from pathlib import Path
import pandas as pd
import os
path = os.path.join(Path(os.getcwd()).parent, "data")
online_retail = os.path.join(path, "online_retail", "online_retail.csv")

In [30]:
from datetime import datetime
parse_date = lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M")
retail_df = pd.read_csv(online_retail, encoding="utf-8", parse_dates=["InvoiceDate"],
                        date_parser=parse_date)

In [31]:
train_data, test_data = split_train_test(preprocess(retail_df))
train_df = train_groupby(train_data, test_data)
id2word, corpus = make_train_data(train_df)
ldamodel = gensim.models.ldamodel.LdaModel(id2word=id2word, corpus=corpus, num_topics=20, passes=10,
                                              per_word_topics=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CustomerID'] = df.CustomerID.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['StockCode'] = df['StockCode'].astype(str)


### topic별 top 10 아이템 분포

In [32]:
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.012*"21615" + 0.012*"21616" + 0.012*"21619" + 0.011*"21620" + 0.011*"21108"')
(1, '0.040*"22423" + 0.039*"22699" + 0.037*"22697" + 0.033*"22698" + 0.025*"23245"')
(2, '0.057*"22865" + 0.056*"22866" + 0.047*"22867" + 0.046*"22633" + 0.042*"23439"')
(3, '0.026*"21181" + 0.026*"21175" + 0.025*"21166" + 0.024*"85152" + 0.021*"82494L"')
(4, '0.008*"23355" + 0.007*"20727" + 0.006*"23300" + 0.006*"23321" + 0.006*"23356"')
(5, '0.022*"POST" + 0.021*"21212" + 0.015*"21080" + 0.014*"21213" + 0.013*"23293"')
(6, '0.015*"22750" + 0.015*"22749" + 0.015*"22144" + 0.014*"22568" + 0.014*"20971"')
(7, '0.015*"21232" + 0.013*"21231" + 0.012*"79321" + 0.012*"22993" + 0.011*"22720"')
(8, '0.016*"23351" + 0.016*"23354" + 0.016*"23349" + 0.016*"23353" + 0.013*"23350"')
(9, '0.012*"23321" + 0.011*"23322" + 0.011*"85123A" + 0.010*"22469" + 0.010*"22577"')
(10, '0.039*"21669" + 0.032*"21670" + 0.032*"21668" + 0.031*"23032" + 0.027*"21672"')
(11, '0.027*"22726" + 0.022*"22727" + 0.012*"22728" + 0.012*"22

### 20개 토픽의 Top 5 아이템 분포

In [33]:
stock_to_description = {row["StockCode"]: row["Description"] for _, row in retail_df.iterrows()}
for i in range(20):
    recommend = ldamodel.show_topic(topicid=i, topn=5)
    print(i, [stock_to_description[item] for item, score in recommend])

0 ['4 LAVENDER BOTANICAL DINNER CANDLES', '4 PEAR BOTANICAL DINNER CANDLES', '4 VANILLA BOTANICAL CANDLES', 'SET OF 4 ROSE BOTANICAL CANDLES', 'FAIRY CAKE FLANNEL ASSORTED COLOUR']
1 ['REGENCY CAKESTAND 3 TIER', 'ROSES REGENCY TEACUP AND SAUCER ', 'GREEN REGENCY TEACUP AND SAUCER', 'PINK REGENCY TEACUP AND SAUCER', 'SET OF 3 REGENCY CAKE TINS']
2 ['HAND WARMER OWL DESIGN', 'HAND WARMER SCOTTY DOG DESIGN', 'HAND WARMER BIRD DESIGN', 'HAND WARMER UNION JACK', 'HAND WARMER RED LOVE HEART']
3 ['PLEASE ONE PERSON METAL SIGN', 'GIN AND TONIC DIET METAL SIGN', 'COOK WITH WINE METAL SIGN ', 'HAND OVER THE CHOCOLATE   SIGN ', 'WOODEN FRAME ANTIQUE WHITE ']
4 ['HOT WATER BOTTLE KEEP CALM', 'LUNCH BAG  BLACK SKULL.', 'GARDENERS KNEELING PAD CUP OF TEA ', 'SMALL WHITE HEART OF WICKER', 'LOVE HOT WATER BOTTLE']
5 ['POSTAGE', 'PACK OF 72 RETROSPOT CAKE CASES', 'SET/20 RED RETROSPOT PAPER NAPKINS ', 'PACK OF 72 SKULL CAKE CASES', 'SET OF 12 FAIRY CAKE BAKING CASES']
6 ['FELTCRAFT PRINCESS LOLA DOLL',

### 전체 유저의 토픽 분포

In [35]:
user_topic_dist = {}
for user_id, user_df in train_df:
    document = user_df['StockCode'].values.tolist()
    user_topic_dist[user_id] = ldamodel.get_document_topics(id2word.doc2bow(document), minimum_probability=0)
print("12682번 user 토픽 분포\n", user_topic_dist[12682])

12682번 user 토픽 분포
 [(0, 0.00035469927), (1, 0.00035469927), (2, 0.00035469927), (3, 0.00035469927), (4, 0.00035469927), (5, 0.19515014), (6, 0.03278907), (7, 0.00035469927), (8, 0.00035469927), (9, 0.00035469927), (10, 0.00035469927), (11, 0.00035469927), (12, 0.00035469927), (13, 0.0003546993), (14, 0.00035469927), (15, 0.23734611), (16, 0.50640327), (17, 0.00035469927), (18, 0.022990933), (19, 0.00035469927)]


### 12682번 유저 토픽 분포에서 가장 확률이 높은 토픽 선택 → Top 20 아이템 추천

In [36]:
user_topics = user_topic_dist[12682]
user_topics = sorted(user_topics, key=lambda x: (x[1]), reverse=True)
user_topic = user_topics[0][0]
print('가장 확률이 높은 토픽 : ', user_topic)
recommend = ldamodel.show_topic(topicid=user_topic, topn=20)
print('Top 20 아이템 추천 : ', [item for item, _ in recommend])
relevant = test_data[test_data["CustomerID"]==12682]['StockCode'].unique()
print("실제 12682번 유저가 데이터에서 선호한 아이템 : ", relevant)


가장 확률이 높은 토픽 :  16
Top 20 아이템 추천 :  ['85099B', '23203', '23209', '20725', '22382', '20727', '23344', '23202', '22383', '22386', '23206', '20728', '23199', '22384', '23581', '20724', '23583', '23201', '23343', '20726']
실제 12682번 유저가 데이터에서 선호한 아이템 :  ['20750' '21931' '85099B' '22423' '21242' '21243' '21239' '21240' '23040'
 '22596' '22456' '48185' '21770' '21977' '21212' '84375' '23163' '84378'
 '23020' '22966' '23084' '22556' '22551' '22555' '47566' '23192' '22139'
 '22138' '22467' 'POST']


### 유저별 LDAmodel 추천(Top 20) 성능

In [38]:
import numpy as np

train_user_ids = train_data['CustomerID'].unique()
test_user_ids = test_data['CustomerID'].unique()

topn = 20
default_recommend = list(train_data.groupby('StockCode')['Quantity'].count().sort_values(ascending=False)[:topn].index)
precisions, recalls = [], []
for user_id in test_user_ids:
    if user_id in train_user_ids:
        user_topics = user_topic_dist[user_id]
        user_topics = sorted(user_topics, key = lambda x: (x[1]), reverse=True)
        user_topic = user_topics[0][0]
        recommend = [item for item, _ in ldamodel.show_topic(topicid=user_topic, topn=topn)]
    else:
        recommend =default_recommend

    relevant = test_data[test_data['CustomerID'] == user_id]['StockCode'].unique()
    precisions.append(get_precision(relevant, recommend))
    recalls.append(get_recall(relevant, recommend))
print("Precision@K : ", np.mean(precisions))
print("Recall@K : ", np.mean(recalls))

Precision@K :  0.10089430894308944
Recall@K :  0.11428890195384434
