<a href="https://colab.research.google.com/github/thisiscd/RecommendSystem/blob/main/data_preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Data directory; it is a relative path, so it resolves against the working
# directory that is set by os.chdir below.
path = '../data/'
# Switch the working directory to the project folder on the mounted Drive
import os
os.chdir("/content/drive/My Drive/RecommendSystem/NCF")

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue Jun  6 01:37:27 2023

@author: chendu
"""

# Data loading and processing
import numpy as np
import pandas as pd

# Deep learning
from tensorflow.keras import models, layers, utils  #(2.6.0)

path = '../data/'

dtf_users = pd.read_csv(path + 'dtf_users.csv')
dtf_items = pd.read_csv(path + 'itemAttribute.csv')

embeddings_size = 50
# Number of distinct users and number of item-attribute rows (informational).
usr, prd = dtf_users['userid'].unique().size, dtf_items.shape[0]

print(usr)
print(prd)

# The embedding layers are indexed with the raw ids fed to model.fit, so their
# input_dim must be (largest id + 1). Using the counts above is only correct
# when ids are contiguous and start at 0; otherwise lookups go out of range.
usr_vocab = int(dtf_users['userid'].max()) + 1
prd_vocab = int(max(dtf_users['itemid'].max(), dtf_items['itemid'].max())) + 1

# Users: embedding output has shape (1, embeddings_size), reshaped to (embeddings_size,)
xusers_in = layers.Input(name="xusers_in", shape=(1,))
xusers_emb = layers.Embedding(name="xusers_emb", input_dim=usr_vocab, output_dim=embeddings_size)(xusers_in)
xusers = layers.Reshape(name='xusers', target_shape=(embeddings_size,))(xusers_emb)
# Products: embedding output has shape (1, embeddings_size), reshaped to (embeddings_size,)
xproducts_in = layers.Input(name="xproducts_in", shape=(1,))
xproducts_emb = layers.Embedding(name="xproducts_emb", input_dim=prd_vocab, output_dim=embeddings_size)(xproducts_in)
xproducts = layers.Reshape(name='xproducts', target_shape=(embeddings_size,))(xproducts_emb)
# Dot product of the user and product factor vectors, output shape (1,).
# normalize=True L2-normalizes both vectors first (see the Keras Dot docs).
xx = layers.Dot(name='xx', normalize=True, axes=1)([xusers, xproducts])
# Predicted score, shape (1,): a linear layer applied to the dot-product output
y_out = layers.Dense(name="y_out", units=1, activation='linear')(xx)
# Build and compile the model
model = models.Model(inputs=[xusers_in,xproducts_in], outputs=y_out, name="CollaborativeFiltering")
model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mean_absolute_percentage_error'])

utils.plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True)

train = dtf_users
# Training (30% of the rows are held out by Keras for validation)
training = model.fit(x=[train["userid"], train["itemid"]], y=train["rating"], epochs=100, batch_size=128, shuffle=True, verbose=1, validation_split=0.3)
model = training.model
# Prediction
# NOTE: the "test" rows are the training rows, so these predictions are not a
# held-out evaluation. A copy is taken so that adding the prediction column
# does not mutate `train` / `dtf_users` through an alias.
test = train.copy()
# predict() returns an (n, 1) array; flatten it to a 1-D column.
test["rating_hat"] = model.predict([test["userid"], test["itemid"]]).ravel()

print(test['rating_hat'])


19835
507172
Epoch 1/100

# 工具库导入

In [None]:
# -*- coding: utf-8 -*-

"""
工具库导入
"""

# 评估与预处理
from sklearn import metrics, preprocessing

# 数据读取与处理
import numpy as np
import pandas as pd

# 绘图
import matplotlib.pyplot as plt
import seaborn as sns

import time


In [None]:
def get_user_data(path):
    """Parse a block-structured ratings text file into a long-format DataFrame.

    The file is read as repeated blocks of the form::

        <user_id>|<item_count>
        <item_id>  <rating>        (one line per rated item)

    Parameters
    ----------
    path : str
        Location of the ratings text file.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``userid``, ``itemid`` and ``rating``, one row per
        rating line, in file order.

    Raises
    ------
    ValueError
        If a line cannot be parsed, or if the item counts announced in the
        header lines do not add up to the number of rating lines.
    """
    users = []
    items = []
    ratings = []
    # The context manager closes the handle even when parsing fails.
    with open(path, 'r') as ratings_file:
        for raw_line in ratings_file:
            line = raw_line.strip()
            if not line:
                # Tolerate blank lines, e.g. an empty line at the end of the file.
                continue
            if '|' in line:
                user_id, item_num = line.split('|')
                # Repeat the user id once per rating line announced by the header.
                users.extend([int(user_id)] * int(item_num))
            else:
                # Any run of whitespace separates the item id from the rating.
                item_id, rating = line.split()
                items.append(int(item_id))
                ratings.append(int(rating))

    if not (len(users) == len(items) == len(ratings)):
        raise ValueError(
            "header item counts announce %d ratings but %d rating lines were read"
            % (len(users), len(items))
        )

    dtf_users = pd.DataFrame(
        {'userid': users, 'itemid': items, 'rating': ratings},
        dtype=int,
    )
    return dtf_users

def get_item_attribute(path):
    """Read the '|'-separated item attribute file into a DataFrame.

    The file has no header row; its three fields are labelled
    ``itemid``, ``attribute1`` and ``attribute2``.
    """
    column_labels = ['itemid', 'attribute1', 'attribute2']
    item_frame = pd.read_table(path, sep='|', names=column_labels)
    return item_frame



# 读取数据

In [None]:
# Convert the raw txt files to CSV.
# Ratings: parse train.txt and write it out without the row index.
dtf_users = get_user_data(path + 'train.txt')
dtf_users.to_csv(path + 'dtf_users.csv', index=False)
# Item attributes: parse itemAttribute.txt and write it out without the row index.
dtf_items = get_item_attribute(path + 'itemAttribute.txt')
dtf_items.to_csv(path + 'itemAttribute.csv', index = False)



In [None]:
# Load the ratings and item-attribute CSV files
dtf_users = pd.read_csv(path + 'dtf_users.csv')
dtf_items = pd.read_csv(path + 'itemAttribute.csv')
# Use itemid as the row index of the item table
dtf_items = dtf_items.set_index('itemid')
# print(dtf_items.head(10))


# 数据分析 && 特征工程

In [None]:
"""
    数据分析 && 特征工程
"""

# Handling of missing attribute fields. TODO:
# The lines below are disabled experiments (dropping rows with missing
# attributes / casting to the nullable Int64 dtype); none of them is executed.
# print(dtf_items)        
# dtf_items = dtf_items[~dtf_items['attribute1'].isna()]
# dtf_items.dropna(thresh= 2)
# dtf_items['attribute1'].astype('Int64')
# dtf_items.astype('Int64')
# print(dtf_items)     



# 构建 Item-Attribute 矩阵

In [None]:
# Build the Item-Attribute matrix.
# NOTE: the code below is wrapped in a triple-quoted string, so it is an inert
# string literal and is not executed.
"""
attribute = [ a for a in dtf_items['attribute1'].unique()]
attribute.extend([a for a in dtf_items['attribute2'].unique()])
columns = list(set(attribute))
# 将属性切分出来作为标签
for col in columns:
    dtf_items[col] = dtf_items.apply(lambda x: 1 if col in x["attribute1"] or col in x["attribute2"] else 0, axis=1)
dtf_items.to_csv(path + 'dtf_items.csv')
"""
# Build and display a heatmap (also disabled: inert string literal)
"""
#print(dtf_items.head(4))
fig, ax = plt.subplots(figsize=(20,5))
sns.heatmap(dtf_items==0, vmin=0, vmax=1, cbar=False, ax=ax).set_title("Items x Attributes")
plt.show()
"""


# user-item矩阵

In [None]:
# User-item matrix: one row per userid, one column per itemid, ratings as values.
# NOTE: this cell rebinds `dtf_users` from the long (userid, itemid, rating)
# format to the wide matrix, so it must be run once per load of `dtf_users`;
# running it again on the wide frame fails because the 'userid'/'itemid'
# columns no longer exist.
# set_index returns a new frame, so no defensive copy of dtf_users is needed.
dtf_users = dtf_users.set_index(['userid', 'itemid'])['rating'].unstack('itemid')
# Items listed in dtf_items that no user has rated.
missing_cols = list(set(dtf_items.index) - set(dtf_users.columns))

# Add the unrated items as all-NaN columns and sort every column by item id
# in a single reindex, instead of building a separate NaN frame, concatenating
# it and then re-selecting the sorted columns.
dtf_users = dtf_users.reindex(columns=sorted(list(dtf_users.columns) + missing_cols))

"""
print(dtf_users.head(5))
fig, ax = plt.subplots(figsize=(20,5))
sns.heatmap(dtf_users==0, vmin=0, vmax=1, cbar=False, ax=ax).set_title("Users x Items")
plt.show()    
"""


# 数据幅度缩放

In [None]:
# Feature scaling.
# NOTE: the MinMaxScaler code below is wrapped in a triple-quoted string, so it
# is an inert string literal and is not executed.
"""
dtf_users = pd.DataFrame(
                        preprocessing.MinMaxScaler(feature_range=(0.5,1)).fit_transform(dtf_users.values), 
                        columns=dtf_users.columns, 
                        index=dtf_users.index
                        )
"""
# print(dtf_users.head(5))

# 划分训练集和测试集

In [None]:
# Split the user-item matrix by columns: the first 80% of the columns form the
# training set and the remaining columns form the test set.
split = int(0.8*dtf_users.shape[1])
# `split` is a column count, not a column label, so slice by position (iloc).
# Label-based .loc slicing only yields an 80/20 split when the column labels
# happen to be exactly 0..N-1.
dtf_train = dtf_users.iloc[:, :split]
dtf_test = dtf_users.iloc[:, split:]

dtf_train.to_csv(path + 'trainset.csv')
dtf_test.to_csv(path + 'testset.csv')