In [1]:
import pandas as pd
file='../PycharmProjects/data/ml-1m/ratings.dat'
data_df = pd.read_csv(file, sep="::", engine='python',
                     names=['UserId', 'MovieId', 'Rating', 'Timestamp'])

In [2]:
data_df.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
data_df['avg_score'] = data_df.groupby(by='UserId')['Rating'].transform('mean')

In [4]:
data_df.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,avg_score
0,1,1193,5,978300760,4.188679
1,1,661,3,978302109,4.188679
2,1,914,3,978301968,4.188679
3,1,3408,4,978300275,4.188679
4,1,2355,5,978824291,4.188679


In [5]:
data_df.tail()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,avg_score
1000204,6040,1091,1,956716541,3.577713
1000205,6040,1094,5,956704887,3.577713
1000206,6040,562,5,956704746,3.577713
1000207,6040,1096,4,956715648,3.577713
1000208,6040,1097,4,956715569,3.577713


In [6]:
user_num, item_num = data_df['UserId'].max() + 1, data_df['MovieId'].max() + 1

In [7]:
user_num, item_num

(6041, 3953)

In [8]:
def sparseFeature(feat, feat_num, embed_dim=4):
    """
    create dictionary for sparse feature
    :param feat: feature name
    :param feat_num: the total number of sparse features that do not repeat
    :param embed_dim: embedding dimension
    :return:
    """
    return {'feat': feat, 'feat_num': feat_num, 'embed_dim': embed_dim}

def denseFeature(feat):
    """
    create dictionary for dense feature
    :param feat: dense feature name
    :return:
    """
    return {'feat': feat}

In [9]:
latent_dim=4
feature_columns = [[denseFeature('avg_score')],
                       [sparseFeature('user_id', user_num, latent_dim),
                        sparseFeature('item_id', item_num, latent_dim)]]

In [10]:
feature_columns

[[{'feat': 'avg_score'}],
 [{'feat': 'user_id', 'feat_num': 6041, 'embed_dim': 4},
  {'feat': 'item_id', 'feat_num': 3953, 'embed_dim': 4}]]

In [11]:
# split train dataset and test dataset
watch_count = data_df.groupby(by='UserId')['MovieId'].agg('count')
watch_count

UserId
1        53
2       129
3        51
4        21
5       198
       ... 
6036    888
6037    202
6038     20
6039    123
6040    341
Name: MovieId, Length: 6040, dtype: int64

In [12]:
from tqdm import tqdm
test_df = pd.concat([
        data_df[data_df.UserId == i].iloc[int(0.8 * watch_count[i]):] for i in tqdm(watch_count.index)], axis=0)

100%|██████████| 6040/6040 [00:09<00:00, 630.25it/s]


In [13]:
test_df.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,avg_score
42,1,1962,4,978301753,4.188679
43,1,2692,4,978301570,4.188679
44,1,260,4,978300760,4.188679
45,1,1028,5,978301777,4.188679
46,1,1029,5,978302205,4.188679


In [14]:
test_df.groupby(by='UserId')['MovieId'].agg('count')

UserId
1        11
2        26
3        11
4         5
5        40
       ... 
6036    178
6037     41
6038      4
6039     25
6040     69
Name: MovieId, Length: 6040, dtype: int64

In [15]:
test_df = test_df.reset_index()
test_df.head()

Unnamed: 0,index,UserId,MovieId,Rating,Timestamp,avg_score
0,42,1,1962,4,978301753,4.188679
1,43,1,2692,4,978301570,4.188679
2,44,1,260,4,978300760,4.188679
3,45,1,1028,5,978301777,4.188679
4,46,1,1029,5,978302205,4.188679


In [16]:
train_df = data_df.drop(labels=test_df['index'])

In [18]:
train_df.head()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,avg_score
0,1,1193,5,978300760,4.188679
1,1,661,3,978302109,4.188679
2,1,914,3,978301968,4.188679
3,1,3408,4,978300275,4.188679
4,1,2355,5,978824291,4.188679


In [19]:
train_df = train_df.drop(['Timestamp'], axis=1).sample(frac=1.).reset_index(drop=True)

In [20]:
train_df.head()

Unnamed: 0,UserId,MovieId,Rating,avg_score
0,1133,3272,3,3.386905
1,2685,1185,3,3.596939
2,1078,296,4,3.984252
3,2025,1620,4,3.680672
4,4578,1500,3,3.730357


In [21]:
test_df = test_df.drop(['index', 'Timestamp'], axis=1).sample(frac=1.).reset_index(drop=True)

In [22]:
test_df.head()

Unnamed: 0,UserId,MovieId,Rating,avg_score
0,765,1020,3,3.574627
1,1125,2959,4,3.222414
2,268,3594,4,3.394636
3,4663,2396,4,3.29771
4,5745,110,5,3.708333


In [23]:
train_X = [train_df['avg_score'].values, train_df[['UserId', 'MovieId']].values]
# 只有2个元素，一个是avg_score列表，一个是['UserId', 'MovieId']列表

In [26]:
train_X[0]

array([3.38690476, 3.59693878, 3.98425197, ..., 3.91489362, 3.9608209 ,
       2.70130933])

In [25]:
train_X[1]

array([[1133, 3272],
       [2685, 1185],
       [1078,  296],
       ...,
       [5289, 2918],
       [4506, 1347],
       [4227, 2353]])

In [27]:
train_y = train_df['Rating'].values.astype('int32')
# 评分结果

In [28]:
train_y

array([3, 3, 4, ..., 3, 4, 4], dtype=int32)

In [30]:
test_X = [test_df['avg_score'].values, test_df[['UserId', 'MovieId']].values]
test_y = test_df['Rating'].values.astype('int32')