In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import sys

In [2]:
# 數據位置
ROOT_DIR = 'C:\\Users\\kccho\\Desktop'
DATA_DIR = os.path.join(ROOT_DIR, 'ml-100k/')

# 讀檔
users = pd.read_csv(os.path.join(DATA_DIR, 'u.user'), sep='|', header=None, engine='python', encoding='latin-1')

#描述用戶特徵的列
users.columns = ['Index', 'Age', 'Gender', 'Occupation', 'Zip code']


#users.head()

In [3]:
#用戶數
nb_users = len(users)

# Gender: 將“M”和“F”轉換為 0 和 1
gender = np.where(np.matrix(users['Gender']) == 'M', 0, 1)[0]

print('Shape of gender features:', gender.shape)

# 職業
occupation_name = np.array(pd.read_csv(os.path.join(DATA_DIR, 'u.occupation'), 
                                            sep='|', header=None, engine='python', encoding='latin-1').loc[:, 0])

# 用戶職業的布爾轉換
occupation_matrix = np.zeros((nb_users, len(occupation_name)))

for k in np.arange(nb_users):
    occupation_matrix[k, occupation_name.tolist().index(users['Occupation'][k])] = 1

print('Shape of user occupation matrix (num of users x num of occupations):', occupation_matrix.shape)

# Concatenation of the sociodemographic variables 
user_attributes = np.concatenate((np.matrix(users['Age']), np.matrix(gender), occupation_matrix.T)).T.tolist()

print('Shape of final user attribute matrix: (list of users with 23 features):', len(user_attributes), len(user_attributes[0]))

Shape of gender features: (943,)
Shape of user occupation matrix (num of users x num of occupations): (943, 21)
Shape of final user attribute matrix: (list of users with 23 features): 943 23


In [5]:
#排名
def rank_top_k(names, ratings):

    ranked_ids = np.argsort(ratings)
    return names[ranked_ids], ratings[ranked_ids]

#合成資料
def convert(data, nb_users, nb_movies):
    new_data = []
    for id_users in range(1, nb_users + 1):
        id_movies = data[:,1][data[:,0] == id_users]
        id_ratings = data[:,2][data[:,0] == id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies - 1] = id_ratings
        new_data.append(list(ratings))
    return new_data

#男女性別更改為0/1
def transform(user_sex, user_age, user_occupation):

    argument = users_occupation_name.tolist().index(user_occupation)
    empty = np.zeros(len(users_occupation_name))
    empty[argument] = 1

    if user_sex == 'Male':
        user_sex = 0
    else:
        user_sex = 1

    return np.concatenate(([user_sex], [user_age], empty.T)).T.tolist()


In [9]:
# 讀檔
movies = pd.read_csv(os.path.join(DATA_DIR, 'u.item'), sep='|', header=None, engine='python', encoding='latin-1')

# 電影數
nb_movies = len(movies)
print('The number of movies is: ', nb_movies)

#類別
movies_genre = np.matrix(movies.loc[:, 5:])
movies_genre_name = np.array(pd.read_csv(os.path.join(DATA_DIR, 'u.genre'), sep='|', header=None, engine='python', encoding='latin-1').loc[:, 0])

movies.columns = ['Index', 'Title', 'Release', 'The Not a Number column', 'Imdb'] + movies_genre_name.tolist()
movies.head()

The number of movies is:  1682


Unnamed: 0,Index,Title,Release,The Not a Number column,Imdb,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
#利用u1.base,u1.test當測試集與訓練集
#用戶index、電影index、評分、時間戳

training_set = np.array(pd.read_csv(os.path.join(DATA_DIR, 'u1.base'), delimiter='\t'), dtype='int')
testing_set = np.array(pd.read_csv(os.path.join(DATA_DIR, 'u1.test'), delimiter='\t'), dtype='int')

print('Example sample (user idx, movie idx, rating, timestamp: ', training_set[0])
print('Shape of original training and test set with shape:     ', training_set.shape, testing_set.shape)

Example sample (user idx, movie idx, rating, timestamp:  [        1         2         3 876893171]
Shape of original training and test set with shape:      (79999, 4) (19999, 4)


In [13]:
train_set = convert(training_set, nb_users, nb_movies)
test_set = convert(testing_set, nb_users, nb_movies)

print('Shape of final training set: (list of users x list of all movies):', len(train_set), len(train_set[0]))
print('Shape of final test set:     (list of users x list of all movies):', len(test_set), len(test_set[0]))

Shape of final training set: (list of users x list of all movies): 943 1682
Shape of final test set:     (list of users x list of all movies): 943 1682


In [38]:
train_matrix = np.array(train_set)
assert train_matrix.shape == (943, 1682)

binarized_train_matrix = np.where(train_matrix > 0 , 1, 0)

num_movies_watched = np.sum(binarized_train_matrix, axis=1) ## 每個用戶的電影總和

#確保我們的模型不僅不會過度擬合訓練數據集的特徵
def split(data, ratio, tensor=False):
    train = np.zeros((len(data), len(data[0]))).tolist()
    valid = np.zeros((len(data), len(data[0]))).tolist()

    for i in range(len(data)):
        for j in range(len(data[i])):
            if data[i][j] > 0:
                if np.random.binomial(1, ratio, 1):
                    train[i][j] = data[i][j]
                else:
                    valid[i][j] = data[i][j]

    return [train, valid]

train = split(train_set, 0.8)
test = test_set

In [18]:
#Learning loop：迭代優化過程，優化過程是利用SGD函數(梯度下降)，
#梯度下降是一種迭代方法，將估計模型與給定目標的差距，並將進行必要的模型更新，直到達到給定的停止標準。

def learn_to_recommend(data, features=10, lr=0.0002, epochs=101, weigth_decay=0.02, stopping=0.001):
    """
     Args:
       data: 每一個評價
       特徵：潛在變量的數量
       lr：梯度下降的學習率
        epochs：要執行的迭代次數或最大循環次數
       weigth_decay: 正規化
       stopping: 停止標準的time
      
     Returns:
       P：用戶的潛在矩陣
       Q: 項目的潛在矩陣
       loss_train: 在train上每次迭代後損失函數的不同值的向量
       loss_valid: 每次迭代後損失函數的向量
       
       """
     
    train, valid = data[0], data[1]
    nb_users, nb_items = len(train), len(train[0])

 # 列表的初始化
    loss_train, loss_valid = [], []

    P = np.random.rand(nb_users, features) * 0.1
    Q = np.random.rand(nb_items, features) * 0.1
    
    for e in range(epochs):        
        for u in range(nb_users):
            for i in range(nb_items):
                
# 在每個時期結束時，查看指標以確保我們的模型正在學習
                if train[u][i] > 0:
                    error_ui = train[u][i] - prediction(P, Q, u, i)
                    P, Q = sgd(error_ui, P, Q, u, i, features, lr, weigth_decay)
                               
        #指標來確保模型正在學習
        loss_train.append(loss(train, P, Q))
        loss_valid.append(loss(valid, P, Q))
        
        if e % 10 == 0:
            print('Epoch : ', "{:3.0f}".format(e+1), ' | Train :', "{:3.3f}".format(loss_train[-1]), 
                  ' | Valid :', "{:3.3f}".format(loss_valid[-1]))

            
          # 新的停止標準
        if e > 1:
            if abs(loss_valid[-1] - loss_valid[-2]) < stopping:
                break
        
    return P, Q, loss_train, loss_valid

In [19]:
 #prediction函數回傳用戶 u 對項目 i 的預測評價
    
def prediction(P, Q, u, i):
    """
    Args:
        P: 用戶矩陣
        Q: 項目矩陣
        u: 與用戶 u 關聯的索引
        i:與項目 i 關聯的索引
    Returns:
        pred: 用戶 u 對項目 i 的預測評價
    """
    
    return np.dot(P[u,:], Q[i,:])

#損失函數在預測模型的構建中起著決定性的作用。通過迭代調整潛在矩陣 P 和 Q 的值來優化這個成本函數。
#預測分數是利用均方誤差 (MSE) 的方式。

def loss(data, P, Q):
    """
    Args:
       data: ratings
       P: matrix of users
       Q: matrix of items   
    Returns:
        MSE: observed mean of squared errors 
    """
    errors_sum, nb_evaluations = 0., 0
    nb_users, nb_items = len(data), len(data[0])

    for u in range(nb_users):
        for i in range(nb_items):
        
            
            if data[u][i] > 0:
                errors_sum += pow(data[u][i] - prediction(P, Q, u, i), 2)
                nb_evaluations += 1
                
    return errors_sum / nb_evaluations

In [20]:
#隨機梯度下降（SGD）； 一種迭代方法，看每個用戶的所有非零評價。

def sgd(error, P, Q, id_user, id_item, features, lr, weigth_decay):
    """
   Args:
        error: 觀察和預測評估之間的差異
        P: 用戶矩陣
        Q: 項目矩陣
        id_user: id_user
        id_item: id_item
        features: 潛在變量的數量
        lr: 學習梯度下降
        weigth_decay: 控制正項影響
       
     Returns:
        P: P 的新估計
        Q: Q 的新估計
        
     """    
    
    
      # sgd 函數來更新我們的模型參數
    for f in range(features):
        P[id_user, f] = P[id_user, f] + lr * (2 * Q[id_item, f] * error - 2 * weigth_decay * P[id_user, f])
        Q[id_item, f] = Q[id_item, f] + lr * (2 * P[id_user, f] * error - 2 * weigth_decay * Q[id_item, f])
      
    return P, Q

In [21]:
features = 5
lr = 0.01
epochs = 101
weigth_decay = 0.02
stopping = 0.001

P, Q, loss_train, loss_valid = learn_to_recommend(train, features, lr, epochs, weigth_decay, stopping)

Epoch :    1  | Train : 7.016  | Valid : 7.186
Epoch :   11  | Train : 0.785  | Valid : 0.978


In [25]:
def rank_top_k(names, ratings, k=10):
  
 
   # 指數高到低
   ranked_ids = np.argsort(ratings)[::-1]
 
   return names[ranked_ids][:k], ratings[ranked_ids][:k]

In [39]:
#loss function是用 MSE所測量

user_id =1
top_k = 5

# Step 1:訓練集定義用戶的偏好。
user_train = np.array(train[0][user_id])


# Step 2:定義用戶還沒有看過哪些電影
#
movies_not_seen = np.where(user_train == 0, 1, 0)

# Step 3: 預測用戶對所有電影的評分。
estimates = np.dot(P[user_id, :], Q.T)


# Step 4:考慮用戶未看過的電影的估計評級
unseen_movie_estimates = estimates * movies_not_seen

# Step 5: 該用戶的前 k 個推薦。
recommendations, scores = rank_top_k(np.array(movies['Title']), unseen_movie_estimates, k=top_k)


# Step 6: 顯示推薦的標題和相關的分數
df = pd.DataFrame(np.matrix((recommendations, scores)).T, (np.arange(5) + 1).tolist(), 
                  columns=['Title', 'Predicted rating'])
df
#print(user_train)
#(數量)

Unnamed: 0,Title,Predicted rating
1,"Close Shave, A (1995)",4.894899
2,Pather Panchali (1955),4.841121
3,"Wrong Trousers, The (1993)",4.821045
4,"Shawshank Redemption, The (1994)",4.766396
5,Apocalypse Now (1979),4.659429


In [33]:
def recommend(user_id, data, P, Q, list_of_genre_names, movies_genre, genre):
    """
    args:
       user_id: 用戶身份
        data: 用戶項目評分
        P: 用戶矩陣
        Q: 項目矩陣
       list_of_genre_names: 電影總類
        movies_genre: 用戶對電影總類的偏好
        
    Returns:
        基於所選電影類型的最佳建議
    """

    #假設我們的用戶正在瀏覽動畫列表。 為了提供最相關的電影集，推薦限制在特定類型
    place = movies_genre_name.tolist().index(genre)   
    genre = np.array(movies_genre[:, place])
    predictions = np.array(np.dot(P[user_id, :], Q.T))
    
    return np.array(predictions) * np.array(genre.T)[0]

In [40]:
genre = "Film-Noir"
user_id = 1
top_k = 5
 
# 估計
estimates = recommend(user_id, train, P, Q, list_of_genre_names=movies_genre_name, movies_genre=movies_genre, genre=genre)
 
recommendations, scores = rank_top_k(np.array(movies['Title']), estimates, k=top_k)
 
# 印出結果
df = pd.DataFrame(np.matrix((recommendations, scores)).T, (np.arange(top_k) + 1).tolist(), columns = ['Title', 'Predicted rating'])
df

Unnamed: 0,Title,Predicted rating
1,Blade Runner (1982),4.575551
2,"Manchurian Candidate, The (1962)",4.494802
3,Laura (1944),4.454126
4,Chinatown (1974),4.384947
5,Crossfire (1947),4.374286


In [1]:
#print(movies_genre_name)