## 1. 数据预处理

In [18]:
import ast
import os
import pickle

import numpy as np
import pandas as pd

pickle是Python内置的序列化库，可以把大多数Python对象序列化为二进制字节流（bytes），
也可以把字节流反序列化还原为对象；注意它生成的是二进制数据而不是文本，并且不要对不可信来源的数据做反序列化。

In [19]:
# 写一个方法，用来把json文件一条条的读入并放到一个DF中
def to_df(path):
    """Load a line-delimited file of dict literals into a DataFrame.

    Every non-blank line of the file is parsed as one dict literal and
    becomes one row. Rows are indexed 0..n-1 in file order; blank lines are
    skipped and do not consume an index.

    :param path: path of the text file to read
    :return: ``pandas.DataFrame`` with one row per parsed line
    :raises ValueError: if a line is not a plain literal
    :raises SyntaxError: if a line is not syntactically valid
    """
    with open(path, 'r') as fin:
        # ast.literal_eval accepts only literals (dict/list/str/number/...),
        # so a crafted line in the data file cannot run arbitrary code the
        # way the built-in eval() would.
        records = {
            i: ast.literal_eval(line)
            for i, line in enumerate(raw for raw in fin if raw.strip())
        }
    return pd.DataFrame.from_dict(records, orient='index')

### （1）读取行为数据文件与元数据文件并存储为pkl格式文件
用到了pickle库的`dump`函数，该函数用于将 Python 对象序列化后写入二进制文件。

API：
```python
dump(obj, file, protocol=None, *, fix_imports=True)
```
obj:要转换的 Python 对象；
file:二进制文件对象，必须以"wb"的方式进行操作。
protocol:pickle 的转码协议

In [20]:
# File paths for the raw inputs and their pickle caches
base_path = 'E:/master/其它资料/推荐系统/DIN_data/'
electronics_path = base_path + 'Electronics_5.json'
reviews_pkl_path = base_path + 'reviews.pkl'
meta_path = base_path + 'meta_Electronics.json'
meta_pkl_path = base_path + 'meta.pkl'

# Each pkl file is built only once; an existing file is treated as a valid cache.
if not os.path.exists(reviews_pkl_path):
    # Parse the raw behaviour data into a DataFrame and cache it as pkl
    reviews_df = to_df(electronics_path)
    with open(reviews_pkl_path, 'wb') as f:
        pickle.dump(
            reviews_df,
            f,
            pickle.HIGHEST_PROTOCOL
        )
elif not os.path.exists(meta_pkl_path):
    # reviews.pkl is cached but meta.pkl still has to be built: the meta
    # filter below needs the raw reviews, so load them from the cache instead
    # of relying on a variable that may be undefined (fresh kernel) or stale.
    reviews_df = pd.read_pickle(reviews_pkl_path)

if not os.path.exists(meta_pkl_path):
    # Parse the raw metadata into a DataFrame
    meta_df = to_df(meta_path)
    # Keep only the items that appear in the behaviour data
    meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())]
    # Renumber the index to 0..n-1 after filtering
    meta_df = meta_df.reset_index(drop=True)
    # Cache the filtered metadata as pkl
    with open(meta_pkl_path, 'wb') as f:
        pickle.dump(
            meta_df,
            f,
            pickle.HIGHEST_PROTOCOL
        )

### （2）对reviews和meta数据进行处理

In [21]:
# Load the cached behaviour data and metadata, each from its own pkl file
reviews_df = pd.read_pickle(reviews_pkl_path)
meta_df = pd.read_pickle(meta_pkl_path)

# Behaviour data: keep only user id, item id and timestamp.
# .copy() makes the subset an independent frame, so the in-place column
# assignments done later do not raise SettingWithCopyWarning.
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']].copy()
# Metadata: keep only item id and its categories
meta_df = meta_df[['asin', 'categories']].copy()
# An item can carry several categories; keep a single one, namely the last
# element of the last entry (assumes `categories` is a list of lists — TODO confirm)
meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])


In [22]:
def build_map(df, col_name):
    """Label-encode one column of ``df`` in place.

    The distinct values of ``df[col_name]`` are sorted and numbered
    0..k-1; the column is then overwritten with those numbers.

    :param df: DataFrame to encode (modified in place)
    :param col_name: name of the column to encode
    :return: ``(m, key)`` where ``m`` is the dict value -> number and
        ``key`` is the sorted list of distinct values, so ``key[n]`` is the
        original value that was encoded as ``n``
    """
    # Distinct values of the column, in ascending order
    distinct = df[col_name].unique().tolist()
    distinct.sort()
    # value -> position in the sorted list
    lookup = {value: position for position, value in enumerate(distinct)}
    # Overwrite the column with the encoded numbers
    df[col_name] = df[col_name].map(lambda value: lookup[value])
    return lookup, distinct

In [23]:
# Encode the user ids of reviews_df
revi_map, revi_key = build_map(reviews_df, 'reviewerID')
# Encode the item ids of meta_df
asin_map, asin_key = build_map(meta_df, 'asin')
# Encode the item categories of meta_df
cate_map, cate_key = build_map(meta_df, 'categories')

# Dataset statistics derived from the returned mappings:
# number of users, items and categories, plus the number of samples
user_count = len(revi_map)
item_count = len(asin_map)
cate_count = len(cate_map)
example_count = len(reviews_df)

In [None]:
# A cell only displays its last expression, so show all four counts as one tuple
user_count, item_count, cate_count, example_count

In [28]:
# Preview the first five rows of the encoded metadata
meta_df.head(5)

Unnamed: 0,asin,categories
0,0,738
1,1,157
2,2,571
3,3,707
4,7,799


In [29]:
# Preview the first five rows of the behaviour data
reviews_df.head(5)

Unnamed: 0,reviewerID,asin,unixReviewTime
0,176008,528881469,1370131200
1,173739,528881469,1290643200
2,134504,528881469,1283990400
3,24476,528881469,1290556800
4,57419,528881469,1317254400


In [30]:
# Order the metadata by encoded item id and renumber the index,
# then preview the first five rows
meta_df = meta_df.sort_values('asin').reset_index(drop=True)
meta_df.head(5)

Unnamed: 0,asin,categories
0,0,738
1,1,157
2,2,571
3,3,707
4,4,714


In [31]:
# Replace the item ids of reviews_df with the numbers assigned in asin_map.
# NOTE(review): this overwrites the column, so running the cell a second time
# looks up already-encoded ids in asin_map — presumably a KeyError; confirm.
reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])
# Sort by user id then review time, renumber the index and fix the column order
reviews_df = (
    reviews_df
    .sort_values(['reviewerID', 'unixReviewTime'])
    .reset_index(drop=True)
    [['reviewerID', 'asin', 'unixReviewTime']]
)

reviews_df.head(5)

Unnamed: 0,reviewerID,asin,unixReviewTime
0,0,13179,1400457600
1,0,17993,1400457600
2,0,28326,1400457600
3,0,29247,1400457600
4,0,62275,1400457600


In [32]:
# Category of every item as an int32 array, in the row order of meta_df
cate_list = meta_df['categories'].values.astype('int32')
cate_list

array([738, 157, 571, ...,  63, 674, 351])

In [33]:
# Persist everything later stages need into one pkl file.
# The four objects are written back to back, so a reader has to call
# pickle.load four times, in this same order.
remap_pkl_path = base_path + 'remap.pkl'
with open(remap_pkl_path, 'wb') as f:
    # 1) behaviour data
    pickle.dump(reviews_df, f, protocol=pickle.HIGHEST_PROTOCOL)
    # 2) category of each item
    pickle.dump(cate_list, f, protocol=pickle.HIGHEST_PROTOCOL)
    # 3) dataset statistics
    counts = (user_count, item_count, cate_count, example_count)
    pickle.dump(counts, f, protocol=pickle.HIGHEST_PROTOCOL)
    # 4) distinct original values (list position == encoded number)
    keys = (asin_key, cate_key, revi_key)
    pickle.dump(keys, f, protocol=pickle.HIGHEST_PROTOCOL)