In [139]:
import os
os.chdir("/content/drive/MyDrive/Colab Notebooks/推荐系统/推荐系统算法复现/ml-100k")
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [140]:
# 加载rating数据
ratings = pd.read_csv('./u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
# 加载item信息
item = pd.read_csv('./u.item', sep='|', encoding='latin-1', names=['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

In [141]:
# 加载user数据
users = pd.read_csv('./u.user', sep='|', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])

# 去掉zip_code一列
users.drop('zip_code',axis = 1,inplace=True)

# 定义age转换函数，0-10岁为0，10-20岁为1，以此类推，最大取值为7
users.age = users.age//10

users

Unnamed: 0,user_id,age,gender,occupation
0,1,2,M,technician
1,2,5,F,other
2,3,2,M,writer
3,4,2,M,technician
4,5,3,F,other
...,...,...,...,...
938,939,2,F,student
939,940,3,M,administrator
940,941,2,M,student
941,942,4,F,librarian


接下来需要将age，gender，occupation三列拼接在一起，然后进行LabelEncode。最后得到的结果理想的形式为{'M':9,'F':8,'technician':29, ... ,'student':28}。这么做的目的是由于user的特征要进入同一个Embedding层，将users表的所有特征拼接在一起后进行统一的编码可以使得users表的特征共用一个Embedding层，这样就不用定义三个Embedding层

In [142]:
# 通过list将users的3个特征拼接在一起
concat_list = list(users.age) + list(users.gender) + list(users.occupation)
# 定义LabelEncoder
le = LabelEncoder()
# 将拼接的特征进行特征编码
encode_out = le.fit_transform(concat_list)

# 将拼接特征的前1/3的元素拆出来，得到编码后的age
encoded_age= pd.Series(encode_out[:int(len(encode_out)/3)])
# 将拼接特征的中间1/3的元素拆出来，得到编码后的gender
encoded_gender = pd.Series(encode_out[int(len(encode_out)/3):int(len(encode_out)*2/3)])
# 将拼接特征的后1/3的元素拆出来，得到编码后的occupation
encoded_occupation= pd.Series(encode_out[int(len(encode_out)*2/3):])

user_df = pd.DataFrame({'age': encoded_age, 'gender': encoded_gender, 'occupation': encoded_occupation})
user_df.index += 1
user_df

Unnamed: 0,age,gender,occupation
1,2,9,29
2,5,8,23
3,2,9,30
4,2,9,29
5,3,8,23
...,...,...,...
939,2,8,28
940,3,9,10
941,2,9,28
942,4,8,20


In [172]:
user_df.to_csv('user_df.csv')

In [157]:
# 加载item信息
item = pd.read_csv('./u.item', sep='|', encoding='latin-1', names=['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

item.drop(['movie_title', 'release_date', 'video_release_date', 'IMDb_URL'],axis = 1,inplace = True)
item


Unnamed: 0,item_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [170]:
# 我们希望在把user_df和user_df后面可以都共用一个Embedding层，所以编码值在user_df的最大编码值上面继续递增
item_df = {}

delta = 0
for i in item.columns[1:]:
  item_df[i] = item[i] + user_df.values.max() + delta
  delta += 1

item_df = pd.DataFrame(item_df)
item_df.index += 1
item_df

Unnamed: 0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
1,30,31,32,34,35,36,36,37,38,39,40,41,42,43,44,45,46,47,48
2,30,32,33,33,34,35,36,37,38,39,40,41,42,43,44,45,47,47,48
3,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,47,47,48
4,30,32,32,33,34,36,36,37,39,39,40,41,42,43,44,45,46,47,48
5,30,31,32,33,34,35,37,37,39,39,40,41,42,43,44,45,47,47,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,30,31,32,33,34,35,36,37,39,39,40,41,42,43,44,45,46,47,48
1679,30,31,32,33,34,35,36,37,38,39,40,41,42,43,45,45,47,47,48
1680,30,31,32,33,34,35,36,37,39,39,40,41,42,43,45,45,46,47,48
1681,30,31,32,33,34,36,36,37,38,39,40,41,42,43,44,45,46,47,48


In [173]:
item_df.to_csv("item_df.csv")