In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Jupyter display settings: allow one cell output to render up to
# 10000 rows and 10000 columns of a DataFrame.
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 10000)

In [19]:
## Data processing
# Read the raw train and test tables from the local data directory.
train_data, test_data = map(pd.read_csv, ('./data/train.csv', './data/test.csv'))

# Per-user aggregates over the training rows: rating mean/count/std/var and
# mean/max of both vote columns, followed by two derived ratios.
user_features = (
    train_data
    .groupby('user_id')
    .agg(
        user_rating_mean=('rating', 'mean'),
        user_rating_count=('rating', 'count'),
        user_rating_std=('rating', 'std'),
        user_rating_var=('rating', 'var'),
        user_votes_mean=('votes', 'mean'),
        user_votes_max=('votes', 'max'),
        user_helpful_votes_mean=('helpful_votes', 'mean'),
        user_helpful_votes_max=('helpful_votes', 'max'),
    )
    .reset_index()
    .assign(
        # Share of votes that were helpful; NaN when the mean vote count is 0.
        user_votes_helpful_rate=lambda df: df['user_helpful_votes_mean'] / df['user_votes_mean'],
        # Mean rating weighted by the helpful-vote share computed just above.
        user_helpful_rating=lambda df: df['user_votes_helpful_rate'] * df['user_rating_mean'],
    )
)
user_features.head()

Unnamed: 0,user_id,user_rating_mean,user_rating_count,user_rating_std,user_rating_var,user_votes_mean,user_votes_max,user_helpful_votes_mean,user_helpful_votes_max,user_votes_helpful_rate,user_helpful_rating
0,0,4.572139,201,0.604996,0.36602,3.970149,18,3.059701,18,0.770677,3.523641
1,1,4.902222,225,0.326295,0.106468,2.342222,32,1.337778,27,0.571157,2.799941
2,2,3.816456,158,1.138994,1.297307,1.56962,25,0.582278,9,0.370968,1.415782
3,3,3.829493,434,1.086748,1.181022,7.096774,69,5.02765,47,0.708442,2.712972
4,4,4.227964,329,0.962566,0.926533,5.43465,34,4.431611,34,0.815436,3.447635


In [21]:

# Per-product aggregates over the training rows: rating mean/count/std/var and
# mean/max of both vote columns, followed by two derived ratios.
product_features = (
    train_data
    .groupby('product_id')
    .agg(
        product_rating_mean=('rating', 'mean'),
        product_rating_count=('rating', 'count'),
        # std/var are NaN for products that have a single rating.
        product_rating_std=('rating', 'std'),
        product_rating_var=('rating', 'var'),
        product_votes_mean=('votes', 'mean'),
        product_votes_max=('votes', 'max'),
        product_helpful_votes_mean=('helpful_votes', 'mean'),
        product_helpful_votes_max=('helpful_votes', 'max'),
    )
    .reset_index()
    .assign(
        # Share of votes that were helpful; NaN when the mean vote count is 0.
        product_votes_helpful_rate=lambda df: df['product_helpful_votes_mean'] / df['product_votes_mean'],
        # Mean rating weighted by the helpful-vote share computed just above.
        product_helpful_rating=lambda df: df['product_votes_helpful_rate'] * df['product_rating_mean'],
    )
)
product_features.head()


Unnamed: 0,product_id,product_rating_mean,product_rating_count,product_rating_std,product_rating_var,product_votes_mean,product_votes_max,product_helpful_votes_mean,product_helpful_votes_max,product_votes_helpful_rate,product_helpful_rating
0,0,4.4,5,0.894427,0.8,8.0,34,7.4,32,0.925,4.07
1,1,4.5,4,1.0,1.0,15.25,23,13.25,22,0.868852,3.909836
2,3,4.076923,26,1.163549,1.353846,6.846154,56,4.923077,51,0.719101,2.93172
3,4,4.0,1,,,7.0,7,4.0,4,0.571429,2.285714
4,5,5.0,1,,,0.0,0,0.0,0,,


In [22]:
# Attach the per-user and per-product aggregates to every train/test row.
#
# This cell rebinds `train_data` / `test_data`, so any aggregate columns left
# over from a previous run are dropped first. Without that, re-running the
# cell would merge the same columns again and pandas would emit suffixed
# duplicates (`*_x` / `*_y`), breaking later column lookups by name.
derived_cols = [
    *user_features.columns.drop('user_id'),
    *product_features.columns.drop('product_id'),
]

train_data = (
    train_data
    .drop(columns=derived_cols, errors='ignore')
    .merge(user_features, on='user_id')
    .merge(product_features, on='product_id')
    # Fill missing values (e.g. NaN std/var for single-rating groups) with 0.
    .fillna(0)
)
test_data = (
    test_data
    .drop(columns=derived_cols, errors='ignore')
    # Left joins keep every test row even when its user/product has no
    # aggregates in the training data.
    .merge(user_features, on='user_id', how='left')
    .merge(product_features, on='product_id', how='left')
    # Missing aggregates become 0; another fill strategy may be more suitable.
    .fillna(0)
)

# Persist the processed train/test tables. With pandas' default settings the
# DataFrame index is written as the first, unnamed CSV column.
train_data.to_csv('./data/train_processed.csv')
test_data.to_csv('./data/test_processed.csv')
test_data

Unnamed: 0,ID,user_id,product_id,product_name,user_rating_mean,user_rating_count,user_rating_std,user_rating_var,user_votes_mean,user_votes_max,user_helpful_votes_mean,user_helpful_votes_max,user_votes_helpful_rate,user_helpful_rating,product_rating_mean,product_rating_count,product_rating_std,product_rating_var,product_votes_mean,product_votes_max,product_helpful_votes_mean,product_helpful_votes_max,product_votes_helpful_rate,product_helpful_rating
0,0,1916,185507,Maria [Australia],4.458700,112483,1.043160,1.088182,7.315808,681,5.298009,657,0.724187,3.228931,5.000000,1.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
1,1,1759,143430,"The Mitchells: Five for Victory (Van Stockum, ...",4.184919,557,1.462944,2.140204,6.154399,91,4.113106,42,0.668320,2.796864,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000
2,2,1980,155861,Superman Cartoons of Max & Dav,3.956067,478,1.295110,1.677311,5.575314,52,3.414226,41,0.612383,2.422627,4.714286,7.0,0.487950,0.238095,12.285714,57.0,11.428571,56.0,0.930233,4.385382
3,3,1502,71535,The Rainbow Fish Anniversary Edition,4.310484,28404,1.139541,1.298554,3.643290,380,1.713843,147,0.470411,2.027699,3.666667,6.0,1.751190,3.066667,22.833333,44.0,14.166667,27.0,0.620438,2.274939
4,4,1255,13521,Confessions (Oxford World's Classics),3.716019,412,1.665285,2.773173,15.849515,162,9.116505,94,0.575191,2.137423,4.647059,17.0,0.492592,0.242647,13.411765,79.0,10.529412,71.0,0.785088,3.648349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223548,223548,1916,214704,"Godzilla, King of the Monsters",4.458700,112483,1.043160,1.088182,7.315808,681,5.298009,657,0.724187,3.228931,3.933333,15.0,1.222799,1.495238,5.266667,24.0,3.866667,21.0,0.734177,2.887764
223549,223549,1502,169489,Houses of the Holy,4.310484,28404,1.139541,1.298554,3.643290,380,1.713843,147,0.470411,2.027699,4.451613,31.0,0.809885,0.655914,4.419355,46.0,2.225806,15.0,0.503650,2.242053
223550,223550,874,150908,Frankenstein and the Monster from Hell,4.414830,4855,0.674531,0.454992,10.290834,229,9.061792,206,0.880569,3.887564,3.125000,8.0,1.246423,1.553571,4.750000,10.0,3.500000,8.0,0.736842,2.302632
223551,223551,922,198737,X-Men,4.636000,1250,0.722164,0.521521,5.146400,42,3.952800,42,0.768071,3.560777,4.118280,93.0,0.942354,0.888032,8.010753,196.0,5.096774,141.0,0.636242,2.620221


In [18]:
# Feature columns and target variable.
features = [
    # per-user aggregates
    'user_rating_mean', 'user_rating_count',
    'user_votes_mean', 'user_votes_max',
    'user_helpful_votes_mean', 'user_helpful_votes_max',
    # raw product id followed by per-product aggregates
    'product_id',
    'product_rating_mean', 'product_rating_count',
    'product_votes_mean', 'product_votes_max',
    'product_helpful_votes_mean', 'product_helpful_votes_max',
]
# Smaller alternative feature set, kept for reference:
# features = ['user_rating_mean', 'user_votes_mean', 'product_rating_mean', 'product_votes_mean']

X = train_data.loc[:, features]
y = train_data.loc[:, 'rating']
X

Unnamed: 0,user_rating_mean,user_rating_count,user_votes_mean,user_votes_max,user_helpful_votes_mean,user_helpful_votes_max,product_id,product_rating_mean,product_rating_count,product_votes_mean,product_votes_max,product_helpful_votes_mean,product_helpful_votes_max
0,4.251799,556,7.611511,88,5.462230,70,154533,4.555556,9,4.444444,10,4.0,8
1,4.458700,112483,7.315808,681,5.298009,657,154533,4.555556,9,4.444444,10,4.0,8
2,4.785417,480,2.343750,20,1.685417,16,154533,4.555556,9,4.444444,10,4.0,8
3,4.299320,147,11.204082,94,10.224490,92,154533,4.555556,9,4.444444,10,4.0,8
4,4.143437,739,3.657645,51,2.652233,46,154533,4.555556,9,4.444444,10,4.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
745884,4.128788,132,2.977273,14,2.265152,13,140202,3.000000,1,4.000000,4,3.0,3
745885,4.128788,132,2.977273,14,2.265152,13,79222,5.000000,1,0.000000,0,0.0,0
745886,4.128788,132,2.977273,14,2.265152,13,1185,4.000000,1,2.000000,2,1.0,1
745887,4.128788,132,2.977273,14,2.265152,13,71558,5.000000,1,2.000000,2,2.0,2


In [5]:
# Fit a MinMaxScaler on the feature matrix; `fit` returns the scaler itself.
scaler = MinMaxScaler().fit(X)

# Apply the fitted scaling to the same matrix.
normalized_data = scaler.transform(X)

array([[7.62842181e-01, 4.90758275e-03, 1.37024614e-01, ...,
        2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
       [8.28423978e-01, 1.00000000e+00, 1.31421749e-01, ...,
        2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
       [9.31983403e-01, 4.23190107e-03, 3.72131582e-02, ...,
        2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
       ...,
       [7.23851417e-01, 1.13799020e-03, 4.92168973e-02, ...,
        5.34759358e-04, 3.44827586e-03, 3.94166338e-04],
       [7.23851417e-01, 1.13799020e-03, 4.92168973e-02, ...,
        5.34759358e-04, 6.89655172e-03, 7.88332676e-04],
       [7.23851417e-01, 1.13799020e-03, 4.92168973e-02, ...,
        8.02139037e-04, 1.03448276e-02, 1.18249901e-03]])

In [9]:
# Hand-copied 12x3 sample of scaled values used as a small float64 test array.
a = np.asarray(
    [
        (7.62842181e-01, 4.90758275e-03, 1.37024614e-01),
        (2.67379679e-03, 1.37931034e-02, 3.15333071e-03),
        (8.28423978e-01, 1.00000000e+00, 1.31421749e-01),
        (2.67379679e-03, 1.37931034e-02, 3.15333071e-03),
        (9.31983403e-01, 4.23190107e-03, 3.72131582e-02),
        (2.67379679e-03, 1.37931034e-02, 3.15333071e-03),
        (7.23851417e-01, 1.13799020e-03, 4.92168973e-02),
        (5.34759358e-04, 3.44827586e-03, 3.94166338e-04),
        (7.23851417e-01, 1.13799020e-03, 4.92168973e-02),
        (5.34759358e-04, 6.89655172e-03, 7.88332676e-04),
        (7.23851417e-01, 1.13799020e-03, 4.92168973e-02),
        (8.02139037e-04, 1.03448276e-02, 1.18249901e-03),
    ],
    dtype=np.float64,
)

In [10]:
import torch

# Turn the NumPy sample into a tensor. `from_numpy` shares memory with `a`,
# so `clone()` makes an independent copy; the float64 dtype is preserved.
b = torch.from_numpy(a).clone()
b

tensor([[7.6284e-01, 4.9076e-03, 1.3702e-01],
        [2.6738e-03, 1.3793e-02, 3.1533e-03],
        [8.2842e-01, 1.0000e+00, 1.3142e-01],
        [2.6738e-03, 1.3793e-02, 3.1533e-03],
        [9.3198e-01, 4.2319e-03, 3.7213e-02],
        [2.6738e-03, 1.3793e-02, 3.1533e-03],
        [7.2385e-01, 1.1380e-03, 4.9217e-02],
        [5.3476e-04, 3.4483e-03, 3.9417e-04],
        [7.2385e-01, 1.1380e-03, 4.9217e-02],
        [5.3476e-04, 6.8966e-03, 7.8833e-04],
        [7.2385e-01, 1.1380e-03, 4.9217e-02],
        [8.0214e-04, 1.0345e-02, 1.1825e-03]], dtype=torch.float64)

In [4]:
# Three fields, each with a different number of categories.
field_dims = [5, 10, 15]
# 0 followed by the running totals of every field size except the last one.
offsets = np.concatenate(([0], np.cumsum(field_dims)[:-1]))
offsets

array([ 0,  5, 15])

In [9]:
# Show the running totals of the field sizes.
print(np.asarray(field_dims).cumsum())

[ 5 15 30]


In [None]:
import torch
import numpy as np


class FeaturesEmbedding(torch.nn.Module):
    """Embed several categorical fields through one shared embedding table.

    The class is defined in this cell because nothing else in the notebook
    defines or imports `FeaturesEmbedding`; without it the demo below raises
    NameError.

    Args:
        field_dims: number of categories of each field, e.g. ``[5, 10, 15]``.
        embed_dim: size of every embedding vector.
    """

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        # One table with a row for every category of every field.
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim)
        # First row of each field inside the shared table: 0 followed by the
        # running totals of all field sizes except the last.
        field_starts = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        # Stored as a buffer so it moves with the module across devices.
        self.register_buffer("offsets", torch.from_numpy(field_starts))
        torch.nn.init.xavier_uniform_(self.embedding.weight.data)

    def forward(self, x):
        """Look up embeddings for per-field ids.

        Args:
            x: integer tensor of shape ``(batch_size, num_fields)`` whose
                column ``j`` holds ids in ``[0, field_dims[j])``.

        Returns:
            Tensor of shape ``(batch_size, num_fields, embed_dim)``.
        """
        # Shift each field's ids into its own row range of the shared table.
        return self.embedding(x + self.offsets.unsqueeze(0))


field_dims = [5, 10, 15]  # three fields, each with a different number of categories
embed_dim = 4

# Build the model
model = FeaturesEmbedding(field_dims, embed_dim)

# Dummy input holding one id per field
x = torch.tensor([[0, 1, 2], [1, 3, 4]])  # (batch_size=2, num_fields=3)

# Forward pass to obtain the embedding vectors
output = model(x)
print(output.shape)  # torch.Size([2, 3, 4])