In [1]:
from fastFM import als
import numpy as np
import pandas as pd
from scipy.sparse import csc_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

In [2]:
np.random.seed(42)

## データの用意

In [3]:
all_df = pd.read_csv('./data/ml-100k/u.data', sep='\t', header=None)
all_df.columns = ['user', 'item', 'rating', 'timestamp']
all_df['timestamp'] = pd.to_datetime(all_df['timestamp'], unit='s')

In [4]:
all_df.head()

Unnamed: 0,user,item,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16


In [5]:
all_df.shape

(100000, 4)

In [6]:
all_df.describe()

Unnamed: 0,user,item,rating
count,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986
std,266.61442,330.798356,1.125674
min,1.0,1.0,1.0
25%,254.0,175.0,3.0
50%,447.0,322.0,4.0
75%,682.0,631.0,4.0
max,943.0,1682.0,5.0


In [7]:
train_df, test_df = train_test_split(all_df, test_size=0.1)

### 評価関数の定義

In [8]:
def rmse(pred, test_df):
    return np.sqrt(np.mean([np.power(p - a, 2) for p, a in zip(pred, test_df['rating'])]))

## ユーザー情報を考慮しない手法 (SVD, NMF)

In [9]:
def rmse_svd(pred_df, test_df):
    """
    @param mat 行列分解後、復元した user * item 行列 (DataFrame)
    @return MSE
    """
    
    rmse = np.sqrt(
        np.mean(
            [np.power(pred_df.loc[user][item] - rating, 2)
             for user, item, rating in test_user_item_rating
             if user in pred_df.index and item in pred_df.columns]
        )
    )
    return rmse

In [10]:
def svd(user_item_df, test_df):
    """
    SVDを学習させたあと、test_df に対して予測
    @param user_item_df user * item 行列 (DataFrame)
    @return pred
    """
    U, s, V = np.linalg.svd(user_item_df.values, full_matrices=False)
    mat = np.dot(np.dot(U, np.diag(s)),V)
    pred_df = pd.DataFrame(
        mat,
        index=user_item_df.index,
        columns=[ser[1] for ser in user_item_df.columns]
    )
    # 1~5の範囲に補正
    pred_df = pred_df.applymap(lambda x: (max(x, 1) if x < 5 else 5))
    
    test_user_item_rating = list(zip(test_df['user'], test_df['item'], test_df['rating']))
    pred = [pred_df.loc[user][item]
            if user in pred_df.index and item in pred_df.columns
            else 0
            for user, item, rating in test_user_item_rating]

    return pred

In [11]:
def nmf(user_item_df, test_df):
    nmf = NMF(n_components=5, random_state=42)
    user = nmf.fit_transform(user_item_df)
    item = nmf.components_
    rec_mat = np.dot(user, item)
    
    pred_df = pd.DataFrame(
        rec_mat,
        index=user_item_df.index,
        columns=[ser[1] for ser in user_item_df.columns]
    )
    
    pred_df = pred_df.applymap(lambda x: (max(x, 1) if x < 5 else 5))
    
    
    test_user_item_rating = list(zip(test_df['user'], test_df['item'], test_df['rating']))
    pred = [pred_df.loc[user][item]
            if user in pred_df.index and item in pred_df.columns
            else 0
            for user, item, rating in test_user_item_rating]

    return pred

In [12]:
# ユーザー*商品行列の作成
user_item_df = train_df.pivot_table(index='user', columns='item', fill_value=1)

In [13]:
user_item_df.head()

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating
item,1,2,3,4,5,6,7,8,9,10,...,1672,1673,1674,1675,1676,1678,1679,1680,1681,1682
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,5,3,4,3,3,1,4,1,5,3,...,1,1,1,1,1,1,1,1,1,1
2,4,1,1,1,1,1,1,1,1,2,...,1,1,1,1,1,1,1,1,1,1
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,4,3,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [14]:
pred_svd = svd(user_item_df, test_df)
print('SVD: {}'.format(rmse(pred_svd, test_df)))

SVD: 2.762173057576221


In [15]:
pred_nmf = nmf(user_item_df, test_df)
print('NMF: {}'.format(rmse(pred_nmf, test_df)))

NMF: 1.9622421584049703


## ユーザー情報を考慮する手法 (Factorization Machine, Collaborative Filtering)

In [16]:
def fm(train_df, test_df, user_df):
    # 使用する特徴量は適宜変更してください
    features = ['item', 'rating', 'age', 'gender', 'occupation', 'zip code']
    data = pd.get_dummies(
        pd.merge(train_df, user_df, left_on='user', right_on='id')[features],
        drop_first=True)
    machine = als.FMRegression(n_iter=1000, init_stdev=0.1, rank=2, l2_reg_w=0.1, l2_reg_V=0.5)
    X_train = data.loc[:, ~data.columns.isin(['rating'])]
    y_train = data['rating']
    machine.fit(csc_matrix(X_train), y_train)
    
    # predict 
    test_data = pd.get_dummies(
        pd.merge(test_df, user_df, left_on='user', right_on='id')[features],
        drop_first=True)
    X_test = data.loc[:, ~data.columns.isin(['rating'])]
    pred = machine.predict(csc_matrix(X_test))
    # 補正
    pred = [max(p, 1) if p < 5 else 5 for p in pred]
    return pred

In [17]:
user_df = pd.read_csv('data/ml-100k/u.user', sep='|', header=None)
user_df.columns = ['id', 'age', 'gender', 'occupation', 'zip code']

In [18]:
pred_fm = fm(train_df, test_df, user_df)
rmse(pred_fm, test_df)

1.2862326726827418