# aspectの特徴量を作る

In [25]:
import codecs
import glob
import os
import pickle

import numpy as np
import scipy.stats

### 各aspect = topicの単語読み込み

In [3]:
# Load the word list of every aspect (= topic); one topic per line of the file.
topic_word = []
with codecs.open('hotel_bootstrapping_lda.dat', 'r', 'utf-8') as f:
    for line in f:
        # The first space-separated field is skipped (presumably a topic label -- verify
        # against the file format). Strip both '\r' and '\n' so files with CRLF line
        # endings do not leave a stray '\r' glued to the last word.
        tokens = line.rstrip('\r\n').split(' ')[1:]
        # Drop empty tokens produced by trailing/double spaces: an empty string is a
        # substring of every review and would otherwise be counted as a hit everywhere.
        topic_word.append([w for w in tokens if w])
        

### 各アイテムの各aspectに対するsentiment factorを読み込み

In [35]:
# Read one [hotel id, aspect sentiment factors] pair per line of the
# tab-separated prediction file.
hotel_aspect = []
with open('prediction.dat', 'r') as f:
    for line in f:
        fields = line.split('\t')
        # Column 0 is used as the hotel id; columns 9..14 are parsed as the
        # six per-aspect values.
        scores = [float(value) for value in fields[9:15]]
        hotel_aspect.append([fields[0], scores])

print(len(hotel_aspect))

1759


各アイテムのsentiment factor $s_i^A$ を、aspectごとに平均0・分散1になるようにスケーリングする。

In [36]:
# Split the [id, scores] pairs into two parallel sequences, then standardise the
# score matrix with scipy's z-score (default axis=0, i.e. column by column).
# NOTE: this rebinds `hotel_aspect` from a list of pairs to a 2-D array, so the
# cell cannot be re-run without first re-running the loading cell.
hotel_id = [pair[0] for pair in hotel_aspect]
s_mat = [pair[1] for pair in hotel_aspect]

hotel_aspect = scipy.stats.zscore(np.array(s_mat))

`hotel_id`にはホテルのIDのリストが入り、`hotel_aspect`の各行には同じ位置のホテルのaspectごとのスケーリング済みsentiment factorが入っている。

### userリストとネットワーク読み込み

In [5]:
# Load the pickled knowledge graph.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file; only
# load graphs produced by a trusted pipeline.
# The context manager closes the file handle, which the bare open() call leaked.
with open('../../knowledge_graph/trip_ad/trip_ad_graph.pickle', 'rb') as graph_file:
    KG = pickle.load(graph_file)
print(len(KG.edges()))
print(len(KG.nodes()))


edges = list(KG.edges())
# Collect the implicit-feedback edges: every edge where at least one endpoint id
# starts with the 'u_' prefix (presumably user nodes -- confirm against the graph builder).
implicit_feedback = [
    e for e in edges
    if e[0][0:2] == 'u_' or e[1][0:2] == 'u_'
]

30191
8074


In [6]:
# Assign a dense, first-seen index to every user and item appearing in the
# implicit-feedback edges. Keys are the node ids with their 2-character prefix removed.
user2idx = {}
item2idx = {}

for edge in implicit_feedback:
    # Edges are unordered: whichever endpoint carries the 'u_' prefix is the user.
    if edge[0][0:2] == 'u_':
        user, item = edge[0], edge[1]
    else:
        user, item = edge[1], edge[0]

    user_key = user[2:]
    item_key = item[2:]

    # The membership test must use the same (prefix-stripped) key that is stored.
    # Testing the prefixed id never matched, so every repeat occurrence overwrote
    # the index with the current dict size, producing duplicate/non-contiguous ids.
    if user_key not in user2idx:
        user2idx[user_key] = len(user2idx)
    if item_key not in item2idx:
        item2idx[item_key] = len(item2idx)


# Dicts preserve insertion order, so these lists are in first-seen order and
# position k in each list corresponds to index k in the mapping.
user_list = list(user2idx)
item_list = list(item2idx)

print('user :{}'.format(len(user_list)))
print('item :{}'.format(len(item_list)))

user :6134
item :1763


### user/itemのレビュー読み込み
- user-reviewの辞書を作る
- item-reviewの辞書を作る

In [21]:
# Build the user -> review text and item -> review text dictionaries from the
# per-hotel review files.

user2review = {}
item2review = {}
hotel_list = glob.glob('../../dataset/TripAdvisor/hotel_review/hotel*')

for hotel in hotel_list:
    # Derive the item id from the file name only (assumes names of the form
    # 'hotel_<id>.<ext>' -- confirm against the dataset). Parsing the base name
    # keeps this independent of how many underscores the directory path contains.
    item = os.path.basename(hotel).split('_')[1].split('.')[0]
    item_chunks = []
    user = ''

    with codecs.open(hotel, 'r', 'utf-8') as f:
        for line in f:
            if line[0:3] == '<Au':
                # Text after the 8-character tag is the author name; strip line endings.
                user = line[8:].replace('\n', '').replace('\r', '')

            if line[0:3] == '<Co':
                # Text after the 9-character tag is the review body.
                content = line[9:]
                item_chunks.append(content)
                # A review without a preceding author line is accumulated under
                # the empty-string user key.
                user2review[user] = user2review.get(user, '') + content
                # Reset so the next review does not inherit this author.
                user = ''

    item2review[item] = ''.join(item_chunks)
        

## aspectの特徴量を作る
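user-aspect_featureの辞書とitem-aspect_featureの辞書を作る。  
特徴量の作り方はZhang2014とHou2018のどちらかをとる。今回は以下の式を用いる。ここで $c^A$ はaspect $A$ のtopic単語のうちレビュー中に出現した単語の数、$s_i^A$ はスケーリング済みのsentiment factorである。  
$p_u = \frac{1}{1 + \exp(-c_u^A)}$  
$p_i = \frac{1}{1 + \exp(-s_i^Ac_i^A)}$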

In [49]:
def get_feat_user(revi):
    """Return one sigmoid-squashed aspect feature per topic for a user's review text.

    For every word list in the module-level ``topic_word``, count how many of its
    words occur in ``revi`` (``in`` test, so substring matches count when ``revi``
    is a string; each word contributes at most once) and map the count ``c`` to
    ``1 / (1 + exp(-c))``.

    Args:
        revi: The review text to search.

    Returns:
        A list with ``len(topic_word)`` feature values.
    """
    return [
        1 / (1 + np.exp(-sum(1 for w in words if w in revi)))
        for words in topic_word
    ]

def get_feat_item(revi, aspect):
    """Return one sentiment-weighted aspect feature per topic for an item's review text.

    For topic ``i`` of the module-level ``topic_word``, count how many of its words
    occur in ``revi`` (``in`` test; each word contributes at most once), multiply the
    count ``c`` by ``aspect[i]`` and map the product to ``1 / (1 + exp(-c * aspect[i]))``.

    Args:
        revi: The review text to search.
        aspect: Indexable sequence with one value per topic.

    Returns:
        A list with ``len(topic_word)`` feature values.
    """
    feat = []
    for i, words in enumerate(topic_word):
        hits = sum(1 for w in words if w in revi)
        feat.append(1 / (1 + np.exp(-hits * aspect[i])))
    return feat

In [50]:
# Build the user -> aspect feature vector dictionary.
user_aspect_dict = {}
for user in user_list:
    try:
        review_text = user2review[user]
    except KeyError:
        # Users in the graph without any parsed review: report them and fall back
        # to an all-zero vector with one entry per topic. Catching only KeyError
        # keeps genuine errors inside get_feat_user visible instead of silently
        # turning them into zero vectors.
        print(user)
        feat = [0] * len(topic_word)
    else:
        feat = get_feat_user(review_text)
    user_aspect_dict[user] = feat

Lemora
marcus_p
finnsino


In [51]:
# Build the item -> aspect feature vector dictionary.

# Map each hotel id to the row of its first occurrence in hotel_id (the same row
# list.index() would return), so the lookup is O(1) per item instead of O(n).
hotel_row = {}
for row, hid in enumerate(hotel_id):
    hotel_row.setdefault(hid, row)

item_aspect_dict = {}
for item in item_list:
    idx = hotel_row.get(item)
    if idx is None:
        # Items without a row in the prediction data: report them and fall back
        # to an all-zero vector with one entry per topic.
        print(item)
        item_aspect_dict[item] = [0] * len(topic_word)
        continue
    item_aspect_dict[item] = get_feat_item(item2review[item], hotel_aspect[idx])
    

111969
111974
301028
302139
300154
284441
223408
223129
223120
309180
249682
256733
111978
301770
256734
112007
223133
325490
302012
120831
120852
111983
125161
111993
223128
302144
123524
305304
302020
302835
305313
300859
217522
305365
302853
305294
214170
325491
320080
223130
258882
223116
268257
302934
120822
301762
123519
305293
302137
239900
112000
305290
1159023
301590
223122
235216
223121
302918
120861
305372
305349
1127236
308433
299851
115251
302931
305370
281614
320079
115244
296344
302919
116270
299735
325519
1159881
120848
305303
218544
308420
325502
241044
1166502


In [52]:
# Spot-check: display the aspect feature vector of the user at position 670 of user_list.
user_aspect_dict[user_list[670]]

[0.999983298578152,
 0.9999999999999982,
 0.9990889488055994,
 0.999983298578152,
 0.999983298578152,
 0.9999999847700205]

## user/item_aspect_featureを保存

In [54]:
# Persist both feature dictionaries as pickle files in the working directory.
for out_path, features in (
    ('user_aspect_feat.pickle', user_aspect_dict),
    ('item_aspect_feat.pickle', item_aspect_dict),
):
    with open(out_path, 'wb') as out_file:
        pickle.dump(features, out_file)

In [126]:
# Read the saved user feature dictionary back into `dic`
# (presumably a round-trip check of the file written above).
with open('user_aspect_feat.pickle', 'rb') as f:
    dic = pickle.load(f)