In [28]:
import json
import pprint
import pandas as pd 
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import torch
from torch.utils.data import Dataset, DataLoader

# Read the raw user records from the JSON file that sits next to the notebook.
with open('user_info.json', mode='r', encoding='utf-8') as file:
    user_data = json.loads(file.read())

# Pretty-print the parsed records for a quick visual sanity check.
pprint.pprint(user_data)

# Tabular view of the records; the feature cells below read their inputs from it.
df = pd.DataFrame(data=user_data)

[{'Age': 24,
  'Birth': 1995,
  'Browse': [1, 2, 2, 4, 5, 6],
  'BrowserTime': 300,
  'City': '北京',
  'Collection': [5, 3, 3, 4],
  'Datetime': '2024-09-01 14:30',
  'Education': '本科',
  'Gender': '男性',
  'Humidity': 60,
  'Interest': '音乐',
  'Major': '计算机科学与技术',
  'Marital': '未婚',
  'Month': 9,
  'Name': '张三',
  'Noise': 39,
  'Parttime': '下午',
  'PosType': '科技',
  'Renewal': [1, 3],
  'Reservation': [1, 3],
  'Search': '深度学习与推荐系统',
  'Temp': 25,
  'UserId': 0,
  'Weather': '晴',
  'Weekdays': '周一',
  'Windscale': 2},
 {'Age': 45,
  'Birth': 1982,
  'Browse': [12, 9, 9, 10, 11],
  'BrowserTime': 180,
  'City': '武汉',
  'Collection': [9, 10, 11, 12, 13],
  'Datetime': '2024-09-02 10:45',
  'Education': '硕士',
  'Gender': '女性',
  'Humidity': 70,
  'Interest': '阅读',
  'Major': '市场营销',
  'Marital': '已婚',
  'Month': 9,
  'Name': '李四',
  'Noise': 50,
  'Parttime': '上午',
  'PosType': '历史',
  'Renewal': [10, 11],
  'Reservation': [9, 10],
  'Search': '我国古代四大名著',
  'Temp': 11,
  'UserId': 1,
  'W

## 首先处理DIN网络的输入

### 预定义一些要用到的函数

In [29]:
# Create an empty DataFrame that collects the processed features;
# the cells below add one column per DIN input feature.
din_input = pd.DataFrame()

In [30]:
def label_encode_column(df, column_name):
    """Label-encode one column of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on ``df[column_name]`` and the resulting
    integer codes are stored in a new column named ``<column_name>_encoded``.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column_name: Name of the column to encode.

    Returns:
        The fitted ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df[column_name])
    df[column_name + '_encoded'] = codes
    return encoder

def pad_sequence_list(seq_list, max_len):
    """Force every sequence in ``seq_list`` to exactly ``max_len`` items.

    Sequences shorter than ``max_len`` are right-padded with ``0``; all others
    are cut down to their first ``max_len`` items.

    Args:
        seq_list: Iterable of lists.
        max_len: Target length of every returned list.

    Returns:
        A new list of new lists, one per input sequence.
    """
    padded = []
    for seq in seq_list:
        shortfall = max_len - len(seq)
        if shortfall > 0:
            # Too short: append zeros on the right.
            padded.append(seq + [0] * shortfall)
        else:
            # Long enough: keep only the leading max_len items.
            padded.append(seq[:max_len])
    return padded

### 输入din的第一部分：user behaviors

In [31]:
# Renewal : 用户续订的书籍列表
# Reservation ： 用户预约的书籍列表
# Collection: 用户收藏的书籍列表
# Browse : 用户浏览过的书籍，弱交互
import torch
from torch.nn.utils.rnn import pad_sequence

# Fixed length of the padded behaviour sequence.
max_seq_length = 50  # adjust to the data if needed

# TODO: this assumes the user's action sequence has already been obtained.
# Concatenate the four interaction lists of each user, then split the result
# into: history sequence, book currently being viewed, next positive-sample book.
history = (
    df['Browse']
    + df['Renewal']
    + df['Collection']
    + df['Reservation']
)

# Second-to-last item -> current book, last item -> positive sample
# (None when the merged list is too short to supply one).
din_input['curBook'] = history.apply(
    lambda books: books[-2] if len(books) >= 2 else None
)
din_input['positiveSample'] = history.apply(
    lambda books: books[-1] if len(books) >= 1 else None
)

# Everything before the last two items is the history; slicing a list of two
# or fewer items already yields an empty list.
filterHistory = history.apply(lambda books: books[:-2])

# NOTE(review): pad_sequence_list keeps the FIRST max_seq_length items of an
# over-long history - confirm that dropping the latest items is intended.
concatResult = pad_sequence_list(filterHistory, max_seq_length)
din_input['bookHistory'] = concatResult
print(din_input)

   curBook  positiveSample                                        bookHistory
0        1               3  [1, 2, 2, 4, 5, 6, 1, 3, 5, 3, 3, 4, 0, 0, 0, ...
1        9              10  [12, 9, 9, 10, 11, 10, 11, 9, 10, 11, 12, 13, ...
2        4               5  [2, 3, 4, 5, 3, 6, 7, 2, 3, 3, 4, 5, 3, 4, 0, ...
3       13              14  [13, 14, 12, 12, 13, 14, 15, 12, 15, 0, 0, 0, ...
4       12              13  [10, 12, 13, 14, 15, 12, 14, 15, 10, 12, 13, 1...


### 情景化特征：连续值的处理（时长相关）

In [32]:
# Check which raw feature columns are currently available.
print(df.columns)
# Peek at the Month value stored under row label 0.
print(df.loc[0, 'Month'])

Index(['Name', 'UserId', 'Gender', 'Age', 'Birth', 'Education', 'Major',
       'Marital', 'Interest', 'Search', 'Browse', 'Collection', 'Reservation',
       'Renewal', 'Datetime', 'Month', 'Weekdays', 'Parttime', 'PosType',
       'Weather', 'Temp', 'Humidity', 'Windscale', 'Noise', 'BrowserTime',
       'City'],
      dtype='object')
9


In [33]:
# 1. BrowserTime: dwell time in seconds, taken as a signal of the user's current
#    overall desire to borrow. It is the dwell time on the last book the user
#    browsed (together with that book's opening time).
#    Scale it into the [0, 1] range.
scaler_browser = MinMaxScaler()
BrowserTime_norm = scaler_browser.fit_transform(df[['BrowserTime']])
# fit_transform returns a 2-D array; flatten it into a single column.
din_input['BrowserTime'] = BrowserTime_norm.flatten()

# 2. Datetime: absolute browsing time. Each day is one cycle, so the time of day
#    is expressed in minutes and divided by the number of minutes in a day.
timestamps = pd.to_datetime(df['Datetime'])
df['Datetime'] = timestamps  # the raw column is replaced by parsed timestamps
Hour = timestamps.dt.hour
Minute = timestamps.dt.minute
times = Hour * 60 + Minute
din_input['Datetime'] = times / (24 * 60)
print(din_input)

   curBook  positiveSample                                        bookHistory   
0        1               3  [1, 2, 2, 4, 5, 6, 1, 3, 5, 3, 3, 4, 0, 0, 0, ...  \
1        9              10  [12, 9, 9, 10, 11, 10, 11, 9, 10, 11, 12, 13, ...   
2        4               5  [2, 3, 4, 5, 3, 6, 7, 2, 3, 3, 4, 5, 3, 4, 0, ...   
3       13              14  [13, 14, 12, 12, 13, 14, 15, 12, 15, 0, 0, 0, ...   
4       12              13  [10, 12, 13, 14, 15, 12, 14, 15, 10, 12, 13, 1...   

   BrowserTime  Datetime  
0         0.60  0.604167  
1         0.12  0.447917  
2         0.40  0.833333  
3         1.00  0.510417  
4         0.00  0.770833  


### 情景化特征：其他离散的 context feature 处理

In [34]:
import pandas as pd

# Categorical context features are turned into integer ids below.
# Series.map with a dict yields NaN for any label missing from the dict.

# 'Month' feature: shift months 1-12 down to 0-11.
din_input['Month'] = df['Month'] - 1

# 'Weekdays' feature: map the weekday labels (Monday .. Sunday) to 0-6.
weekdays_mapping = {
    '周一': 0,
    '周二': 1,
    '周三': 2,
    '周四': 3,
    '周五': 4,
    '周六': 5,
    '周日': 6
}
din_input['Weekdays'] = df['Weekdays'].map(weekdays_mapping)

# 'Parttime' feature: map the part-of-day labels
# (early morning, morning, noon, afternoon, night) to 0-4.
parttime_mapping = {
    '清晨': 0,
    '上午': 1,
    '中午': 2,
    '下午': 3,
    '夜晚': 4
}
din_input['Parttime'] = df['Parttime'].map(parttime_mapping)

# 'PosType' feature: map the category labels
# (literature, history, fiction, technology, other) to 0-4.
postype_mapping = {
    '文学': 0,
    '历史': 1,
    '小说': 2,
    '科技': 3,
    '其他': 4
}
din_input['PosType'] = df['PosType'].map(postype_mapping)

# 'Weather' feature: map the weather labels to 0-6.
weather_mapping = {
    '晴': 0,
    '多云': 1,
    '雨': 2,
    '雪': 3,
    '冰雹': 4,
    '霜冻': 5,
    '雾': 6
}
din_input['Weather'] = df['Weather'].map(weather_mapping)

# 'City' feature: strip surrounding whitespace, then map to an integer id.
# The vectorised .str accessor is used instead of a lambda so that a missing
# city in the text column becomes NaN rather than raising AttributeError.
city_mapping = {'北京': 0, '上海': 1, '广州': 2, '深圳': 3, '杭州': 4, '成都': 5, '重庆': 6, '西安': 7, '南京': 8, '苏州': 9, '武汉': 10}
din_input['City'] = df['City'].str.strip().map(city_mapping)

# 'Search' feature: the raw query text is copied through unchanged.
din_input['Search'] = df['Search']

# 'Temp' feature: bucket the temperature reading into levels 0-3.
def temp_mapping(temprature, bucket_width=10, max_bucket=3):
    """Bucket a temperature reading into an integer level in ``[0, max_bucket]``.

    The reading is divided by ``bucket_width`` and truncated, then clamped so
    the result always stays inside the documented 0-3 range. Without the clamp
    a reading of 40 or more produced level 4+, and a reading of -10 or less
    produced a negative level.

    Args:
        temprature: Numeric temperature reading.
        bucket_width: Width of one bucket; defaults to 10.
        max_bucket: Highest level returned; defaults to 3.

    Returns:
        int: The bucket index, between 0 and ``max_bucket`` inclusive.
    """
    level = int(temprature / bucket_width)
    # Clamp out-of-range readings into the first / last bucket.
    return min(max(level, 0), max_bucket)
# Apply temp_mapping to every row of the raw 'Temp' column.
din_input['Temp'] = df['Temp'].map(temp_mapping)

# 'Humidity' feature: bucket the humidity value into levels 0-4.
def encode_humidity(humidity, bucket_width=20, max_bucket=4):
    """Bucket a humidity value into an integer level in ``[0, max_bucket]``.

    The value is divided by ``bucket_width`` and truncated, then clamped so the
    result always stays inside the documented 0-4 range. Without the clamp a
    humidity of exactly 100 produced level 5.

    Args:
        humidity: Numeric humidity value.
        bucket_width: Width of one bucket; defaults to 20.
        max_bucket: Highest level returned; defaults to 4.

    Returns:
        int: The bucket index, between 0 and ``max_bucket`` inclusive.
    """
    level = int(humidity / bucket_width)
    # Clamp out-of-range values into the first / last bucket.
    return min(max(level, 0), max_bucket)
# Apply encode_humidity to every row of the raw 'Humidity' column.
din_input['Humidity'] = df['Humidity'].apply(encode_humidity)

# 'Windscale' feature: bucket the wind-scale value into levels 0-3.
def windscale_mapping(windscale, bucket_width=4, max_bucket=3):
    """Bucket a wind-scale value into an integer level in ``[0, max_bucket]``.

    The value is divided by ``bucket_width`` and truncated, then clamped so the
    result always stays inside the documented 0-3 range. Without the clamp a
    value of 16 or more produced level 4+, and a value of -4 or less produced
    a negative level.

    Args:
        windscale: Numeric wind-scale value.
        bucket_width: Width of one bucket; defaults to 4.
        max_bucket: Highest level returned; defaults to 3.

    Returns:
        int: The bucket index, between 0 and ``max_bucket`` inclusive.
    """
    level = int(windscale / bucket_width)
    # Clamp out-of-range values into the first / last bucket.
    return min(max(level, 0), max_bucket)
# Apply windscale_mapping to every row of the raw 'Windscale' column.
din_input['Windscale'] = df['Windscale'].map(windscale_mapping)

# 'Noise' feature: bucket the noise reading into levels 0-4.
def noise_mapping(noise, bucket_width=20, max_bucket=4):
    """Bucket a noise reading into an integer level in ``[0, max_bucket]``.

    The reading is divided by ``bucket_width`` and truncated, then clamped so
    the result always stays inside the documented 0-4 range. Without the clamp
    a reading of 100 or more produced level 5+.

    Args:
        noise: Numeric noise reading.
        bucket_width: Width of one bucket; defaults to 20.
        max_bucket: Highest level returned; defaults to 4.

    Returns:
        int: The bucket index, between 0 and ``max_bucket`` inclusive.
    """
    level = int(noise / bucket_width)
    # Clamp out-of-range readings into the first / last bucket.
    return min(max(level, 0), max_bucket)

# Apply noise_mapping to every row of the raw 'Noise' column.
din_input['Noise'] = df['Noise'].map(noise_mapping)

# Inspect the processed din_input: first rows, then the column list.
print(din_input.head())
print(din_input.columns)

   curBook  positiveSample                                        bookHistory   
0        1               3  [1, 2, 2, 4, 5, 6, 1, 3, 5, 3, 3, 4, 0, 0, 0, ...  \
1        9              10  [12, 9, 9, 10, 11, 10, 11, 9, 10, 11, 12, 13, ...   
2        4               5  [2, 3, 4, 5, 3, 6, 7, 2, 3, 3, 4, 5, 3, 4, 0, ...   
3       13              14  [13, 14, 12, 12, 13, 14, 15, 12, 15, 0, 0, 0, ...   
4       12              13  [10, 12, 13, 14, 15, 12, 14, 15, 10, 12, 13, 1...   

   BrowserTime  Datetime  Month  Weekdays  Parttime  PosType  Weather  City   
0         0.60  0.604167      8         0         3        3        0     0  \
1         0.12  0.447917      8         1         1        1        1    10   
2         0.40  0.833333      8         2         4        0        2     1   
3         1.00  0.510417      8         3         2        0        0     2   
4         0.00  0.770833      8         4         3        1        1     3   

      Search  Temp  Humidity  Windscal

### 读取 user profile 特征

In [35]:
# User-profile columns are copied from the raw frame without any transformation,
# in this order.
# NOTE(review): 'Name' is a personal identifier - confirm it is needed downstream.
profile_columns = [
    'UserId',
    'Name',
    'Gender',
    'Age',
    'Education',
    'Major',
    'Marital',
    'Interest',
]
for profile_column in profile_columns:
    din_input[profile_column] = df[profile_column]

# Show the full list of collected feature columns.
print(din_input.columns)

Index(['curBook', 'positiveSample', 'bookHistory', 'BrowserTime', 'Datetime',
       'Month', 'Weekdays', 'Parttime', 'PosType', 'Weather', 'City', 'Search',
       'Temp', 'Humidity', 'Windscale', 'Noise', 'UserId', 'Name', 'Gender',
       'Age', 'Education', 'Major', 'Marital', 'Interest'],
      dtype='object')
