In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Jupyter display configuration.
# Raise pandas' display limits: up to 10000 rows and 10000 columns are
# rendered in a single cell output before pandas switches to a truncated view.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)

In [2]:
## Data processing
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

# Source column -> statistics computed for it, in output-column order.
_FEATURE_STATS = {
    'rating': ['mean', 'count'],
    'votes': ['mean', 'max'],
    'helpful_votes': ['mean', 'max'],
}


def _aggregate_features(frame, key, prefix):
    """Aggregate the rating/vote statistics of ``frame`` per ``key``.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain ``key`` and every column named in ``_FEATURE_STATS``.
    key : str
        Column to group by (``'user_id'`` or ``'product_id'`` here).
    prefix : str
        Prefix of the generated column names.

    Returns
    -------
    pd.DataFrame
        One row per distinct ``key`` value. Columns are ``key`` followed by
        ``<prefix>_<column>_<stat>`` for every (column, stat) pair of
        ``_FEATURE_STATS``, e.g. ``user_rating_mean``.
    """
    # Named aggregation ties every output name to its (column, stat) pair,
    # so names cannot drift out of sync with the aggregation spec the way a
    # positional ``.columns = [...]`` assignment can.
    named_aggs = {
        f'{prefix}_{column}_{stat}': (column, stat)
        for column, stats in _FEATURE_STATS.items()
        for stat in stats
    }
    return frame.groupby(key).agg(**named_aggs).reset_index()


# Per-user and per-product statistics, both derived from the training set only.
# NOTE(review): each aggregate includes the row's own `rating`; if `rating`
# is later used as the prediction target, the train-side features leak it.
user_features = _aggregate_features(train_data, 'user_id', 'user')
product_features = _aggregate_features(train_data, 'product_id', 'product')

# Default (inner) joins: every key present in the feature tables comes from
# train_data itself.
train_data = (
    train_data
    .merge(user_features, on='user_id')
    .merge(product_features, on='product_id')
)

# Attach the same train-derived features to the test set. Left joins keep
# test rows whose user/product never appears in train; their features are NaN.
test_data = (
    test_data
    .merge(user_features, on='user_id', how='left')
    .merge(product_features, on='product_id', how='left')
)

# Fill missing values with 0 (this applies to every column of test_data, not
# only the merged features).
# NOTE(review): 0 is also what unseen users/products get for *_rating_mean;
# a train-wide mean may be a more suitable fill value - confirm.
test_data = test_data.fillna(0)



In [None]:
# Persist the processed train and test frames.
# to_csv is called with its defaults, so the row index is written too.
for frame, out_path in (
    (train_data, './data/train_processed.csv'),
    (test_data, './data/test_processed.csv'),
):
    frame.to_csv(out_path)

In [3]:
# Define the model inputs (features) and the target variable.
user_feature_cols = [
    'user_rating_mean',
    'user_rating_count',
    'user_votes_mean',
    'user_votes_max',
    'user_helpful_votes_mean',
    'user_helpful_votes_max',
]
product_feature_cols = [
    'product_id',
    'product_rating_mean',
    'product_rating_count',
    'product_votes_mean',
    'product_votes_max',
    'product_helpful_votes_mean',
    'product_helpful_votes_max',
]
features = user_feature_cols + product_feature_cols
# Smaller alternative feature set, kept for reference:
# features = ['user_rating_mean', 'user_votes_mean', 'product_rating_mean', 'product_votes_mean']

X = train_data.loc[:, features]
y = train_data.loc[:, 'rating']
# Bare last expression so the notebook renders the feature frame.
X

Unnamed: 0,user_rating_mean,user_rating_count,user_votes_mean,user_votes_max,user_helpful_votes_mean,user_helpful_votes_max,product_id,product_rating_mean,product_rating_count,product_votes_mean,product_votes_max,product_helpful_votes_mean,product_helpful_votes_max
0,4.251799,556,7.611511,88,5.462230,70,154533,4.555556,9,4.444444,10,4.0,8
1,4.458700,112483,7.315808,681,5.298009,657,154533,4.555556,9,4.444444,10,4.0,8
2,4.785417,480,2.343750,20,1.685417,16,154533,4.555556,9,4.444444,10,4.0,8
3,4.299320,147,11.204082,94,10.224490,92,154533,4.555556,9,4.444444,10,4.0,8
4,4.143437,739,3.657645,51,2.652233,46,154533,4.555556,9,4.444444,10,4.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
745884,4.128788,132,2.977273,14,2.265152,13,140202,3.000000,1,4.000000,4,3.0,3
745885,4.128788,132,2.977273,14,2.265152,13,79222,5.000000,1,0.000000,0,0.0,0
745886,4.128788,132,2.977273,14,2.265152,13,1185,4.000000,1,2.000000,2,1.0,1
745887,4.128788,132,2.977273,14,2.265152,13,71558,5.000000,1,2.000000,2,2.0,2


In [5]:
# Build a MinMaxScaler, learn each feature column's min/max from X, then
# apply the scaling to that same X.
scaler = MinMaxScaler()
scaler.fit(X)
normalized_data = scaler.transform(X)

array([[7.62842181e-01, 4.90758275e-03, 1.37024614e-01, ...,
        2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
       [8.28423978e-01, 1.00000000e+00, 1.31421749e-01, ...,
        2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
       [9.31983403e-01, 4.23190107e-03, 3.72131582e-02, ...,
        2.67379679e-03, 1.37931034e-02, 3.15333071e-03],
       ...,
       [7.23851417e-01, 1.13799020e-03, 4.92168973e-02, ...,
        5.34759358e-04, 3.44827586e-03, 3.94166338e-04],
       [7.23851417e-01, 1.13799020e-03, 4.92168973e-02, ...,
        5.34759358e-04, 6.89655172e-03, 7.88332676e-04],
       [7.23851417e-01, 1.13799020e-03, 4.92168973e-02, ...,
        8.02139037e-04, 1.03448276e-02, 1.18249901e-03]])