# Description:
这是实验的数据预处理模块，此次实验使用的亚马逊产品数据集里面的Electronics子集， 具体详情描述可以参考：[http://jmcauley.ucsd.edu/data/amazon/](http://jmcauley.ucsd.edu/data/amazon/)。 这里用的2014年的那两个per-category dataset。大体思路分为两个部分：
1. 把原始的json数据转成pd的形式， 从meta数据集中只保留在reviews文件中出现过的商品
2. 把pd数据转成pkl数据， 后面用这个生成数据

In [1]:
import numpy as np
import pandas as pd
import pickle
import gc
import random
from tqdm import tqdm

random.seed(2020)

# Convert_pd

In [3]:
def to_df(file_path):
    """
        转换为DataFrame结构
        file_path: 文件路径
        return: DtaFrame
    """
    with open(file_path, 'r') as fin:
        df = {}
        i = 0
        for line in tqdm(fin):
            #print(line)
            df[i] = eval(line)   #   直接针对字符串运行
            i += 1
            
            if i > 1000000:   # 笔记本内存不够了， 先提取少量一部分, 如果电脑允许，这里可以去掉
                break
        df = pd.DataFrame.from_dict(df, orient='index')
        return df            

In [4]:
# 处理review
reviews_df = to_df('./raw_data/reviews_Electronics.json')

1000000it [00:37, 26838.86it/s]


In [5]:
reviews_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AKM1MP6P0OYPR,132793040,"Vicki Gibson ""momo4""","[1, 1]",Corey Barker does a great job of explaining Bl...,5.0,Very thorough,1365811200,"04 13, 2013"
1,A2CX7LUOHB2NDG,321732944,Bernie,"[0, 0]",While many beginner DVDs try to teach you ever...,5.0,Adobe Photoshop CS5 Crash Course with master P...,1341100800,"07 1, 2012"
2,A2NWSAGRHCP8N5,439886341,bowmans2007,"[1, 1]",It never worked. My daughter worked to earn th...,1.0,absolutely horrible,1367193600,"04 29, 2013"
3,A2WNBOD3WNDNKT,439886341,JAL,"[1, 1]",Some of the functions did not work properly. ...,3.0,Disappointing,1374451200,"07 22, 2013"
4,A1GI0U4ZRJA8WN,439886341,Truthfull,"[4, 4]",Do not waste your money on this thing it is te...,1.0,TERRIBLE DONT WASTE YOUR MONEY,1334707200,"04 18, 2012"


In [6]:
with open('./raw_data/reviews.pkl', 'wb') as f:
    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)

In [7]:
unique_asin = reviews_df['asin'].unique()

In [8]:
del reviews_df
gc.collect()

309

In [9]:
# 处理meta_Electroics  从meta数据集中只保留在reviews文件中出现过的商品
meta_df = to_df('./raw_data/meta_Electronics.json')
meta_df = meta_df[meta_df['asin'].isin(unique_asin)]
meta_df = meta_df.reset_index(drop=True)

498196it [00:42, 11777.44it/s]


In [15]:
meta_df.head()

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand
0,132793040,http://ecx.images-amazon.com/images/I/31JIPhp%...,The Kelby Training DVD Mastering Blend Modes i...,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Mastering Blend Modes in A...,,,,
1,321732944,http://ecx.images-amazon.com/images/I/31uogm6Y...,,"[[Electronics, Computers & Accessories, Cables...",Kelby Training DVD: Adobe Photoshop CS5 Crash ...,,,,
2,439886341,http://ecx.images-amazon.com/images/I/51k0qa8f...,Digital Organizer and Messenger,"[[Electronics, Computers & Accessories, PDAs, ...",Digital Organizer and Messenger,8.15,{'Electronics': 144944},"{'also_viewed': ['0545016266', 'B009ECM8QY', '...",
3,511189877,http://ecx.images-amazon.com/images/I/41HaAhbv...,The CLIKR-5 UR5U-8780L remote control is desig...,"[[Electronics, Accessories & Supplies, Audio &...",CLIKR-5 Time Warner Cable Remote Control UR5U-...,23.36,,"{'also_viewed': ['B001KC08A4', 'B00KUL8O0W', '...",
4,528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B006ZOI9OY', 'B00C7FKT2A', '...",


In [10]:
pickle.dump(meta_df, open('./raw_data/meta.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)

# remap_id
这里再次进行处理， 基于上面的pkl文件， 处理如下：
1. reviews_df保留'reviewerID'【用户ID】, 'asin'【产品ID】, 'unixReviewTime'【浏览时间】三列
2. meta_df保留'asin'【产品ID】, 'categories'【种类】两列

In [12]:
reviews = pd.read_pickle('./raw_data/reviews.pkl')
reviews_df = reviews[['reviewerID', 'asin', 'unixReviewTime']]

meta = pd.read_pickle('./raw_data/meta.pkl')
meta_df = meta[['asin', 'categories']]

del reviews, meta
gc.collect()

0

In [13]:
# meta_df只保留最后一个
meta_df['categories'] = meta_df['categories'].map(lambda x: x[-1][-1])

In [14]:
meta_df.head()

Unnamed: 0,asin,categories
0,132793040,Monitor Accessories
1,321732944,Monitor Accessories
2,439886341,PDAs & Handhelds
3,511189877,TV Remote Controls
4,528881469,Trucking GPS


In [15]:
reviews_df.head()

Unnamed: 0,reviewerID,asin,unixReviewTime
0,AKM1MP6P0OYPR,132793040,1365811200
1,A2CX7LUOHB2NDG,321732944,1341100800
2,A2NWSAGRHCP8N5,439886341,1367193600
3,A2WNBOD3WNDNKT,439886341,1374451200
4,A1GI0U4ZRJA8WN,439886341,1334707200


In [16]:
print(meta_df.shape, reviews_df.shape)

(59634, 2) (1000001, 3)


In [17]:
# 上面的这个数太大了还是， 所以这里在进行采样一波， 按照用户的reviewerID采样， 采样出10万的用户数据来
select_user_id = np.random.choice(reviews_df['reviewerID'].unique(), size=100000, replace=False)
reviews_df = reviews_df[reviews_df['reviewerID'].isin(select_user_id)]
meta_df = meta_df[meta_df['asin'].isin(reviews_df['asin'].unique())]

In [18]:
print(meta_df.shape, reviews_df.shape)

(27266, 2) (132639, 3)


In [19]:
def build_map(df, col_name):
    """
    制作一个映射， 键为列名， 值为序列数字
    df: review_df / meta_df
    col_name: 列名
    return: 字典， 键
    """
    key = sorted(df[col_name].unique().tolist())
    m = dict(zip(key, range(len(key))))          # 这个是建立字典的常用操作， 得记住这个写法 [值， 索引]
    df[col_name] = df[col_name].map(lambda x: m[x])        # 这地方是把原来的值变为索引了？
    return m, key

In [20]:
# 给物品ID， 物品种类， 用户ID，建立值 -> 索引的映射
asin_map, asin_key = build_map(meta_df, 'asin')
cate_map, cate_key = build_map(meta_df, 'categories')
revi_map, revi_key = build_map(reviews_df, 'reviewerID')

In [21]:
user_count, item_count, cate_count, example_count = len(revi_map), len(asin_map), len(cate_map), reviews_df.shape[0]
print(user_count, item_count, cate_count, example_count)

100000 27266 650 132639


In [22]:
# 按物品id排序， 并重置索引
meta_df = meta_df.sort_values('asin').reset_index(drop=True)

In [23]:
# reviews_df文件物品id进行映射， 并按照用户id，浏览时间进行排序重置索引
reviews_df['asin'] = reviews_df['asin'].map(lambda x: asin_map[x])
reviews_df = reviews_df.sort_values(['reviewerID', 'unixReviewTime']).reset_index(drop=True)
reviews_df = reviews_df[['reviewerID', 'asin', 'unixReviewTime']]

In [24]:
# 各个物品对应的类别
cate_list = np.array(meta_df['categories'], dtype='int32')

In [25]:
# 保存所需数据为pkl文件
with open('./dataset/remap.pkl', 'wb') as f:
    pickle.dump(reviews_df, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump(cate_list, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((user_count, item_count, cate_count, example_count), f, pickle.HIGHEST_PROTOCOL)
    pickle.dump((asin_key, cate_key, revi_key), f, pickle.HIGHEST_PROTOCOL)