In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat,get_feature_names

In [24]:
data = pd.read_csv("movielens_sample.csv")
data.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zip
0,3299,235,4,968035345,Ed Wood (1994),Comedy|Drama,F,25,4,19119
1,3630,3256,3,966536874,Patriot Games (1992),Action|Thriller,M,18,4,77005
2,517,105,4,976203603,"Bridges of Madison County, The (1995)",Drama|Romance,F,25,14,55408
3,785,2115,3,975430389,Indiana Jones and the Temple of Doom (1984),Action|Adventure,M,18,19,29307
4,5848,909,5,957782527,"Apartment, The (1960)",Comedy|Drama,M,50,20,20009


In [3]:
sparse_features = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
target = ['rating']

In [4]:
# 对特征标签进行编码
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# 计算每个特征中的 不同特征值的个数
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]  #特征稀疏化
print(fixlen_feature_columns)
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

[SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x0000016E8B44E8C8>, embedding_name='movie_id', group_name='default_group', trainable=True), SparseFeat(name='user_id', vocabulary_size=193, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x0000016EEB51B948>, embedding_name='user_id', group_name='default_group', trainable=True), SparseFeat(name='gender', vocabulary_size=2, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x0000016E8485A208>, embedding_name='gender', group_name='default_group', trainable=True), SparseFeat(name='age', vocabulary_size=7, embedding_dim=4, use_hash=False, dtype='int32', embeddings_initializer=<tensorflow.python.keras.initializers.RandomNormal object at 0x0000016E8

In [5]:
# 将数据集切分成训练集和测试集
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name].values for name in feature_names}
test_model_input = {name:test[name].values for name in feature_names}


In [6]:
# 使用DeepFM进行训练
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2, )
# 使用DeepFM进行预测
pred_ans = model.predict(test_model_input, batch_size=256)
# 输出RMSE或MSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
print("test RMSE", rmse)

Train on 128 samples, validate on 32 samples
test RMSE 3.8248660107250814


In [7]:
from surprise import SVD,SVDpp
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import KFold, split
import pandas as pd
# import svd

In [42]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('movielens_sample.csv', reader=reader)
#train_set = data.build_full_trainset()
train_s,test_s = split.train_test_split(data, train_size=0.8)

In [38]:
data.shape

(200, 10)

In [39]:
reader = Reader(rating_scale=(1, 200))
data1 = Dataset.load_from_df(data[["movie_id", "occupation", 'rating']], reader)
train_s,test_s=split.train_test_split(data1, train_size=0.8)

In [43]:
# 使用biasSVD
algo1 = SVD()
# 使用funkSVD
algo2 = SVD(biased = False)
# 使用SVD++
algo3 = SVDpp()

In [44]:
print('biasSVD结果')
algo1.fit(train_s)
pre = algo1.test(test_s)
accuracy.rmse(pre,verbose=True)
print('funkSVD结果')
algo2.fit(train_s)
pre = algo2.test(test_s)
accuracy.rmse(pre,verbose=True)
print('SVD++结果')
algo3.fit(train_s)
pre = algo3.test(test_s)
accuracy.rmse(pre,verbose=True)

biasSVD结果
RMSE: 1.1872
funkSVD结果
RMSE: 1.1954
SVD++结果
RMSE: 1.1842


1.1842261505409808

In [46]:
def svd(train, k):
    utilMat = np.array(train)
    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    item_means = np.mean(masked_arr, axis=0)
    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0],1))
    # we remove the per item average from all entries.
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x
    # The magic happens here. U and V are user and item features
    U, s, V=np.linalg.svd(utilMat, full_matrices=False)
    s=np.diag(s)
    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]
    UsV = np.dot(np.dot(U,s), V)
    UsV = UsV + x
    print("svd done")
    return UsV

In [47]:
from funk_svd.svd import SVD

svd = SVD(learning_rate=0.001, regularization=0.005, n_epochs=1000, n_factors=15, min_rating=1, max_rating=max_dl)

ModuleNotFoundError: No module named 'funk_svd'