* pyhton 3.6.0 

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width: 85% !important; }</style>"))
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
import pandas as pd
import numpy as np
import random
from scipy.sparse import csr_matrix
import scipy.sparse as sp

In [3]:
def map_data(data):
    """
    Map data to proper indices in case they are not in a continues [0, N) range

    Parameters
    ----------
    data : np.int32 arrays

    Returns
    -------
    mapped_data : np.int32 arrays
    n : length of mapped_data

    """
    uniq = list(set(data))

    id_dict = {old: new for new, old in enumerate(sorted(uniq))}
    data = np.array(list(map(lambda x: id_dict[x], data)))
    n = len(uniq)

    return data, id_dict, n

def preprocess_user_item_features(u_features, v_features):
    """
    Creates one big feature matrix out of user features and item features.
    Stacks item features under the user features.
    """

    zero_csr_u = sp.csr_matrix((u_features.shape[0], v_features.shape[1]), dtype=u_features.dtype) #943, 1682
    zero_csr_v = sp.csr_matrix((v_features.shape[0], u_features.shape[1]), dtype=v_features.dtype) #1682, 943

    u_features = sp.hstack([u_features, zero_csr_u], format='csr') #(943, 943+1682)
    v_features = sp.hstack([zero_csr_v, v_features], format='csr') #(1682, 1682+943)

    return u_features, v_features

def globally_normalize_bipartite_adjacency(adjacencies, verbose=False, symmetric=True):
    """ 
    Globally Normalizes set of bipartite adjacency matrices (calculate c_ij) 
    """
    
    if verbose:
        print('Symmetrically normalizing bipartite adj')
    # degree_u and degree_v are row and column sums of adj+I

    adj_tot = np.sum(adj for adj in adjacencies)
    degree_u = np.asarray(adj_tot.sum(1)).flatten() # sum by col (943,)
    degree_v = np.asarray(adj_tot.sum(0)).flatten() # sum by row (1682,)

    # set zeros to inf to avoid dividing by zero
    degree_u[degree_u == 0.] = np.inf 
    degree_v[degree_v == 0.] = np.inf 
    
    degree_u_inv_sqrt = 1. / np.sqrt(degree_u) # (943,)
    degree_v_inv_sqrt = 1. / np.sqrt(degree_v) # (1682,)
     
    degree_u_inv_sqrt_mat = sp.diags([degree_u_inv_sqrt], [0]) #[0]: 대각성분 시작 인덱스 
    degree_v_inv_sqrt_mat = sp.diags([degree_v_inv_sqrt], [0])

    degree_u_inv = degree_u_inv_sqrt_mat.dot(degree_u_inv_sqrt_mat)

    if symmetric:
        # symmetric normalization
        adj_norm = [degree_u_inv_sqrt_mat.dot(adj).dot(degree_v_inv_sqrt_mat) for adj in adjacencies]

    else:
        # left normalization 
        adj_norm = [degree_u_inv.dot(adj) for adj in adjacencies] 

    return adj_norm


def normalize_features(feat):

    degree = np.asarray(feat.sum(1)).flatten()

    # set zeros to inf to avoid dividing by zero
    degree[degree == 0.] = np.inf

    degree_inv = 1. / degree
    degree_inv_mat = sp.diags([degree_inv], [0])
    feat_norm = degree_inv_mat.dot(feat)

    if feat_norm.nnz == 0:
        print('ERROR: normalized adjacency matrix has only zero entries!!!!!')
        exit

    return feat_norm

# 데이터 준비 
* http://files.grouplens.org/datasets/movielens/  
* ml-100k.zip  

### 1-1. 데이터 로드 
* figue 1 참고

In [5]:
path = './ml-100k/'
dtypes = {'u_nodes': np.int32, 'v_nodes': np.int32, 'ratings': np.float32, 'timestamp': np.float64}

data_train = pd.read_csv(path+'u1.base', sep='\t', header=None,
                         names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes)

data_test = pd.read_csv(path+'u1.test', sep='\t', header=None,
                        names=['u_nodes', 'v_nodes', 'ratings', 'timestamp'], dtype=dtypes)

print('train shape:', data_train.shape)
print('test shape:', data_test.shape)

train shape: (80000, 4)
test shape: (20000, 4)


In [6]:
data_train.head()

Unnamed: 0,u_nodes,v_nodes,ratings,timestamp
0,1,1,5.0,874965758.0
1,1,2,3.0,876893171.0
2,1,3,4.0,878542960.0
3,1,4,3.0,876893119.0
4,1,5,3.0,889751712.0


In [7]:
data_test.head()

Unnamed: 0,u_nodes,v_nodes,ratings,timestamp
0,1,6,5.0,887431973.0
1,1,10,3.0,875693118.0
2,1,12,5.0,878542960.0
3,1,14,5.0,874965706.0
4,1,17,3.0,875073198.0


In [8]:
# df -> array로 변경 

# train data
data_array_train = data_train.values.tolist()
data_array_train = np.array(data_array_train) # shape:(80000, 4)

# test data
data_array_test = data_test.values.tolist() 
data_array_test = np.array(data_array_test)   # shape:(20000, 4)

# train + test data 
data_array = np.concatenate([data_array_train, data_array_test], axis=0) # shape:(100000, 4) 

In [9]:
data_train.head()

Unnamed: 0,u_nodes,v_nodes,ratings,timestamp
0,1,1,5.0,874965758.0
1,1,2,3.0,876893171.0
2,1,3,4.0,878542960.0
3,1,4,3.0,876893119.0
4,1,5,3.0,889751712.0


In [10]:
data_array_train

array([[1.00000000e+00, 1.00000000e+00, 5.00000000e+00, 8.74965758e+08],
       [1.00000000e+00, 2.00000000e+00, 3.00000000e+00, 8.76893171e+08],
       [1.00000000e+00, 3.00000000e+00, 4.00000000e+00, 8.78542960e+08],
       ...,
       [9.43000000e+02, 1.18800000e+03, 3.00000000e+00, 8.88640250e+08],
       [9.43000000e+02, 1.22800000e+03, 3.00000000e+00, 8.88640275e+08],
       [9.43000000e+02, 1.33000000e+03, 3.00000000e+00, 8.88692465e+08]])

### 1-2. 데이터 생성  
#### user, item, rating 

In [11]:
# user node
u_nodes = data_array[:, 0].astype(dtypes['u_nodes'])
# item node
v_nodes = data_array[:, 1].astype(dtypes['v_nodes']) 
# rating node 
ratings = data_array[:, 2].astype(dtypes['ratings'])         
# ranting class 
class_values = np.sort(np.unique(ratings))

print('num users:', len(np.unique(u_nodes)))
print('num items:', len(np.unique(v_nodes)))
print('num ratings:', len(np.unique(ratings)))


num users: 943
num items: 1682
num ratings: 5


In [12]:
# create user, item id mapping dictionary 
u_nodes, u_dict, num_users = map_data(u_nodes) 
v_nodes, v_dict, num_items = map_data(v_nodes) 
rating_dict = {r: i for i, r in enumerate(np.sort(np.unique(ratings)).tolist())}

# change data type 
u_nodes, v_nodes = u_nodes.astype(np.int64), v_nodes.astype(np.int32)

# set unobserved value
neutral_rating = -1

#### user-item matrix 

In [13]:
labels = np.full((num_users, num_items), neutral_rating, dtype=np.int32) # shape: (943, 1682)
labels

array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1]], dtype=int32)

In [14]:
labels[u_nodes, v_nodes] = np.array([rating_dict[r] for r in ratings]) # 값 부여 
labels

array([[ 4,  2,  3, ..., -1, -1, -1],
       [ 3, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [ 4, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1,  4, -1, ..., -1, -1, -1]], dtype=int32)

In [15]:
labels = labels.reshape([-1]) # (1586126,) = (num_users*num_items, )
labels

array([ 4,  2,  3, ..., -1, -1, -1], dtype=int32)

#### train, test, validation data 

In [16]:
num_train = data_array_train.shape[0]   # 80000

num_test = data_array_test.shape[0]     # 20000
num_val = int(np.ceil(num_train * 0.2)) # 16000
num_train = num_train - num_val         # 64000

In [17]:
# user, item node pairs 
pairs_nonzero = np.array([[u, v] for u, v in zip(u_nodes, v_nodes)])  # shape: (100000, 2)
pairs_nonzero_train = pairs_nonzero[0:num_train+num_val]              # shape: (80000, 2)
pairs_nonzero_test = pairs_nonzero[num_train+num_val:]                # shape: (20000, 2)

In [18]:
pairs_nonzero

array([[  0,   0],
       [  0,   1],
       [  0,   2],
       ...,
       [458, 933],
       [459,   9],
       [461, 681]])

In [23]:
pairs_nonzero.shape

(100000, 2)

In [19]:
# index which has nonzero value
idx_nonzero = np.array([u * num_items + v for u, v in pairs_nonzero]) # shape: (100000,) ; labels에서 -1이 아닌 index
idx_nonzero_train = idx_nonzero[0:num_train+num_val]                  # shape: (80000,)
idx_nonzero_test = idx_nonzero[num_train+num_val:]                    # shape: (20000,)

In [22]:
idx_nonzero

array([     0,      1,      2, ..., 771289, 772047, 776083])

In [24]:
idx_nonzero.shape

(100000,)

In [25]:
# draw random index to shuffle train
np.random.seed(42)
rand_idx = [i for i in range(len(idx_nonzero_train))]
np.random.shuffle(rand_idx)

pairs_nonzero_train = pairs_nonzero_train[rand_idx]
idx_nonzero_train = idx_nonzero_train[rand_idx]

pairs_nonzero = np.concatenate([pairs_nonzero_train, pairs_nonzero_test], axis=0)
idx_nonzero = np.concatenate([idx_nonzero_train, idx_nonzero_test], axis=0) 

In [26]:
# train, test, validation for nonzero index 
val_idx = idx_nonzero[0:num_val]
train_idx = idx_nonzero[num_val: num_val + num_train]
test_idx = idx_nonzero[num_val + num_train:]

# train, test, validation for pairs
val_pairs_idx = pairs_nonzero[0:num_val]                      # (16000, 2)
train_pairs_idx = pairs_nonzero[num_val: num_val + num_train] # (64000, 2)
test_pairs_idx = pairs_nonzero[num_val + num_train:]          # (20000, 2)

val_u_indices, val_v_indices = val_pairs_idx.transpose()        # (16000,), (16000,)
train_u_indices, train_v_indices = train_pairs_idx.transpose()  # (64000,), (64000,)
test_u_indices, test_v_indices = test_pairs_idx.transpose()     # (20000,), (20000,)

# train, test, validation for labels
val_labels = labels[val_idx]
train_labels = labels[train_idx]
test_labels = labels[test_idx]


#### adjacency matrix
* page2. 오른쪽 상단의 M_r 메트릭스 정의

In [27]:
# train index에만 rating 부여 (M_r matrix)
rating_mx_train = np.zeros(num_users * num_items, dtype=np.float32)         # (1586126,)
rating_mx_train[train_idx] = labels[train_idx].astype(np.float32) + 1.      
rating_mx_train = csr_matrix(rating_mx_train.reshape(num_users, num_items)) # (943, 1682) 

rating_mx_train.toarray()

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]], dtype=float32)

### 1-3. 사이드 정보 데이터 

#### item(movie) side feature: genre 

In [28]:
# item(movie) features(genres)
names = ['movie id', 'movie title', 'release date', 'video release date',
         'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
         'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
         'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
         'Thriller', 'War', 'Western']

movie_df = pd.read_csv(path+'u.item', sep=r'|', header=None, names=names, engine='python')

In [29]:
movie_df.head(3)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [30]:
genre_headers = movie_df.columns.values[6:]
num_genres = genre_headers.shape[0] # 18개  
genre_headers

array(['Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
       'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
       'Western'], dtype=object)

In [31]:
v_side_features_ = np.zeros((num_items, num_genres), dtype=np.float32) #(1682, 18) 
v_side_features_ 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [32]:
for movie_id, genre_one_hot_vec in zip(movie_df['movie id'].values.tolist(), movie_df[genre_headers].values.tolist()):
    if movie_id in v_dict.keys():
        v_side_features_[v_dict[movie_id],:] = genre_one_hot_vec 
v_side_features_

array([[0., 0., 1., ..., 0., 0., 0.],
       [1., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [33]:
movie_df.loc[0]

movie id                                                              1
movie title                                            Toy Story (1995)
release date                                                01-Jan-1995
video release date                                                  NaN
IMDb URL              http://us.imdb.com/M/title-exact?Toy%20Story%2...
unknown                                                               0
Action                                                                0
Adventure                                                             0
Animation                                                             1
Childrens                                                             1
Comedy                                                                1
Crime                                                                 0
Documentary                                                           0
Drama                                                           

In [34]:
v_side_features_[0]

array([0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.], dtype=float32)

#### user side features: age, gender, occupation

In [35]:
names = ['user id', 'age', 'gender', 'occupation', 'zip code']
users_df = pd.read_csv(path+'u.user', sep= r'|', header=None, names=names, engine='python')

In [36]:
users_df.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [37]:
# age 
age = users_df['age'].values
age_max = age.max()

# gender 
gender_dict = {'M': 0., 'F': 1.}

# occupation 
occupation = set(users_df['occupation'].values.tolist())
occupation_dict = {f: i for i, f in enumerate(occupation, start=2)} #21개 

num_feats = 2 + len(occupation_dict) #23

In [38]:
u_side_features_ = np.zeros((num_users, num_feats), dtype=np.float32) #(943, 23) u_side_features로 변경 
for _, row in users_df.iterrows():
    u_id = row['user id']
    if u_id in u_dict.keys():
        # age
        u_side_features_[u_dict[u_id], 0] = row['age'] / np.float(age_max)
        # gender
        u_side_features_[u_dict[u_id], 1] = gender_dict[row['gender']]
        # occupation
        u_side_features_[u_dict[u_id], occupation_dict[row['occupation']]] = 1.
        

In [39]:
list(users_df.iterrows())[14]

(14, user id             15
 age                 49
 gender               F
 occupation    educator
 zip code         97301
 Name: 14, dtype: object)

In [40]:
u_side_features_[14]

array([0.6712329, 1.       , 0.       , 0.       , 0.       , 1.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
       0.       , 0.       , 0.       , 0.       , 0.       ],
      dtype=float32)

In [41]:
# to csr matrix 
u_side_features_ = csr_matrix(u_side_features_)
v_side_features_ = csr_matrix(v_side_features_)

print("user side features shape: "+str(u_side_features_.shape))
print("item side features shape: "+str(v_side_features_.shape))

user side features shape: (943, 23)
item side features shape: (1682, 18)


In [42]:
# normalize side features  
u_features_side_ = normalize_features(u_side_features_) # 각 행별로 더해서 나눔 
v_features_side_ = normalize_features(v_side_features_)

In [43]:
u_features_side_.toarray()

array([[0.2474227 , 0.        , 0.        , ..., 0.        , 0.75257736,
        0.        ],
       [0.26633164, 0.36683416, 0.        , ..., 0.        , 0.        ,
        0.36683416],
       [0.23958333, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.21505375, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.2474227 , 0.37628868, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.23157896, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [44]:
u_features_side_.toarray()[0] # (23,)

array([0.2474227 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.75257736, 0.        ], dtype=float32)

In [45]:
# preprocess_user_item_features() 동작 방식 
u_features_side, v_features_side = preprocess_user_item_features(u_features_side_, v_features_side_)

print("user side features shape: "+str(u_features_side.shape))
print("item side features shape: "+str(v_features_side.shape))

user side features shape: (943, 41)
item side features shape: (1682, 41)


In [46]:
u_features_side.toarray()[0] #(23+18, )

array([0.2474227 , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.75257736, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ], dtype=float32)

In [47]:
v_features_side.toarray()[0] #(23+18, )

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.33333334, 0.33333334, 0.33333334, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ], dtype=float32)

In [48]:
u_features_side = np.array(u_features_side.todense(), dtype=np.float32)
v_features_side = np.array(v_features_side.todense(), dtype=np.float32)
num_side_features = u_features_side.shape[1] #41 = 18+23

### 1-4. 노드 정보 데이터 

In [49]:
id_csr_u = sp.identity(num_users, format='csr') #diagnal matrix 
id_csr_v = sp.identity(num_items, format='csr')

u_features, v_features = preprocess_user_item_features(id_csr_u, id_csr_v)

In [50]:
print('shape of id_csr_u:', id_csr_u.shape)
print('shape of id_csr_v:', id_csr_v.shape)

print('shape of u_features:', u_features.shape)
print('shape of v_features:', v_features.shape)

shape of id_csr_u: (943, 943)
shape of id_csr_v: (1682, 1682)
shape of u_features: (943, 2625)
shape of v_features: (1682, 2625)


In [51]:
id_csr_u.toarray() # (943, 943)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [52]:
# preprocess_user_item_features() 동작 방식 
u_features.toarray() # (943, 2625)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [53]:
#예시 
data_train.head(1)

Unnamed: 0,u_nodes,v_nodes,ratings,timestamp
0,1,1,5.0,874965758.0


In [54]:
u_features.toarray()[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [55]:
v_features.toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [56]:
v_features.toarray()[0][943]

1.0

### 1-5. global normalization
* page3. 수식(1)의 c_ij 정의

In [57]:
## 2-4. global normalization
numclass = 5
support = []
support_t = []

# create M_r matrix for each rating 
for i in range(numclass):    
    support_unnormalized = sp.csr_matrix(rating_mx_train == i+1, dtype=np.float32) # M_1, M_2, ...M_R (943, 1682)
    support_unnormalized_transpose = support_unnormalized.T # (1682, 943)
    
    support.append(support_unnormalized)
    support_t.append(support_unnormalized_transpose)
    

In [58]:
rating_mx_train.toarray()

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]], dtype=float32)

In [59]:
support[-1].toarray() # M_5와 동일 

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

#### left normalization 

In [60]:
# 계산 과정 그림 첨부 
support = globally_normalize_bipartite_adjacency(support, symmetric=False)      # [[], [], ...]
support_t = globally_normalize_bipartite_adjacency(support_t, symmetric=False)

In [61]:
support[-1].toarray() # for rating 5 (943, 1682)

array([[0.00819672, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.05555556, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00763359, 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [62]:
num_support = len(support) 
support = sp.hstack(support, format='csr')      #(943,  1682*num_support) = (943, 8410)
support_t = sp.hstack(support_t, format='csr')  #(1682, 943*num_support)  = (1682, 4715)

### 1-6. 인덱스 맞추기 

In [63]:
# Collect all user and item nodes for train set
train_u = list(set(train_u_indices)) # 943개 
train_v = list(set(train_v_indices)) # 1614개 
train_u_dict = {u_idx: i for i, u_idx in enumerate(train_u)}
train_v_dict = {v_idx: i for i, v_idx in enumerate(train_v)}

train_u_indices = np.array([train_u_dict[u_idx] for u_idx in train_u_indices]) #64000개 
train_v_indices = np.array([train_v_dict[v_idx] for v_idx in train_v_indices]) #64000개 

# for side features 
train_u_features_side = u_features_side[np.array(train_u)] #(943, 41)
train_v_features_side = v_features_side[np.array(train_v)] #(1614, 41)

# for adj matrix 
train_support = support[np.array(train_u)]     #(943, 8410)
train_support_t = support_t[np.array(train_v)] #(1614, 4715)


In [64]:
# Collect all user and item nodes for test set
test_u = list(set(test_u_indices)) # 459개
test_v = list(set(test_v_indices)) # 1410개 
test_u_dict = {n: i for i, n in enumerate(test_u)}
test_v_dict = {n: i for i, n in enumerate(test_v)}

test_u_indices = np.array([test_u_dict[u_idx] for u_idx in test_u_indices]) #20000개 
test_v_indices = np.array([test_v_dict[v_idx] for v_idx in test_v_indices]) #20000개

# for side features 
test_u_features_side = u_features_side[np.array(test_u)] #(459, 41)
test_v_features_side = v_features_side[np.array(test_v)] #(1410, 41)

# for adj matrix 
test_support = support[np.array(test_u)]      # (459, 1682*num_support) = (459, 8410)
test_support_t = support_t[np.array(test_v)]  # (1410, 943*num_support) = (1410, 4715)


In [65]:
# Collect all user and item nodes for validation set
val_u = list(set(val_u_indices)) #933개 
val_v = list(set(val_v_indices)) #1351개 
val_u_dict = {n: i for i, n in enumerate(val_u)}
val_v_dict = {n: i for i, n in enumerate(val_v)}

val_u_indices = np.array([val_u_dict[u_idx] for u_idx in val_u_indices]) #16000개
val_v_indices = np.array([val_v_dict[v_idx] for v_idx in val_v_indices]) #16000개 

# for side features 
val_u_features_side = u_features_side[np.array(val_u)] #(933, 41)
val_v_features_side = v_features_side[np.array(val_v)] #(1351, 41)
 
# for adj matrix 
val_support = support[np.array(val_u)]     #(933, 8410)
val_support_t = support_t[np.array(val_v)] #(1351, 4715)


# Model
* pseudo code 
* RecommenderSideInfoGAE() 

In [None]:
import tensorflow as tf  # 1.4.0

In [66]:
placeholders = {
    
    # user, item index
    'user_indices': tf.placeholder(tf.int32, shape=(None,)),
    'item_indices': tf.placeholder(tf.int32, shape=(None,)),
    
    # user, item features 
    'u_features': tf.sparse_placeholder(tf.float32, shape=np.array(u_features.shape, dtype=np.int64)),
    'v_features': tf.sparse_placeholder(tf.float32, shape=np.array(v_features.shape, dtype=np.int64)),
    
    # user, item side features 
    'u_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)),
    'v_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)),
     
    # rating 
    'labels': tf.placeholder(tf.int32, shape=(None,)),
    'class_values': tf.placeholder(tf.float32, shape=class_values.shape),
    
    # adj matrix 
    'support': tf.sparse_placeholder(tf.float32, shape=(None, None)),
    'support_t': tf.sparse_placeholder(tf.float32, shape=(None, None)),
   
    # dropout 
    'dropout': tf.placeholder_with_default(0., shape=()),
    'u_features_nonzero': tf.placeholder(tf.int32, shape=()), 
    'v_features_nonzero': tf.placeholder(tf.int32, shape=()),

    'weight_decay': tf.placeholder_with_default(0., shape=()),

}

In [62]:
 def build():
            
        # ---------- layers for user ----------
        
        # gcn layer(encoder)
        # ref   : page2 - (2)
        # input : (943, 2625)
        # output: (943, 500)
        gcn_u_output = gcn_layer(u_features) 
        
        
        # dense1 layer(for side features)
        # ref   : page4 - (10)
        # input : (943, 41)
        # output: (943, 10)
        u_side_output = dense1(u_features_side) 
        
        
        # dense2 layer(for concat) 
        # ref   : page2 - (3)
        # input : (943, 510)
        # output: (943, 75)
        input_u = concat([gcn_u_output, u_side_output])
        u_latent_output = dense2(input_u)
        
      
        # ---------- layers for item ----------
    
        # gcn layer(encoder)
        # input : (1682, 2625)
        # output: (1682, 500)
        gcn_v_output = gcn_layer(v_features)
        
        
        # dense1 layer(for side features)
        # input : (1682, 41)
        # output: (1682, 10)
        v_side_output = dense1(v_features_side) 
        
        
        # dense2 layer(for concat)
        # input : (1682, 510)
        # output: (1682, 75)
        input_v = concat([gcn_v_output, v_side_output])
        v_latent_output = dense2(input_v)
        
        
        # ----------- output layer ------------
        
        # decoder
        # ref   : page 3 - (4)
        # input : (943, 75), (1682, 75)
        # output: (80000, 5)
        output = decoder_layer([u_latent_output, v_latent_output])
              
        # ------------- learning --------------
        
        loss = cross_entropy(output, labels) 
        rmse = metric(output, labels)
        AdamOptimizer.minimize(loss)



In [64]:
def build_ours():

    # gcn layer(encoder)
    gcn_u_output, gcn_v_output = gcn_layer([u_features, v_features])

    # dense1 layer(dense layer for side features)
    u_side_output, v_side_output = dense1([u_features_side, v_features_side])

    # dense2 layer(dense layer for concat) 
    input_u = concat([gcn_u_output, u_side_output])
    input_v = concat([gcn_v_output, v_side_output])

    u_latent_output, v_latent_output = dense2([input_u, input_v])

    # output layer(decoder)
    output = decoder_layer([u_latent_output, v_latent_output]) #(80000, num_class)

    # learning 
    loss = cross_entropy(output, labels)
    rmse = metric(output, labels)
    AdamOptimizer.minimize(loss)
        