#### 2.1.4.1 读取数据


注意：运行前请先将下方代码中的数据路径 `root_path`（原始数据目录）和 `sample_data_path`（采样数据保存目录）修改为本地实际路径。

In [32]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Data loading helper
def load_data(frac=1.0,
              root_path="G:/DataSets/Ali_Display_Ad_Click/original_data",
              total_rows=723268134):
    """
    Load the user, sample, behavior-log and ad-feature tables.

    Only the leading ``int(frac * total_rows)`` rows of ``behavior_log.csv``
    are read (a head-of-file cut, not a random sample). ``raw_sample`` and
    ``user_profile`` are then restricted to the users that appear in that
    portion of the behavior log. ``ad_feature`` is returned unfiltered.

    Parameters:
    - frac: float, fraction of the behavior log to read; default 1.0.
    - root_path: str, directory holding ``ad_feature.csv``,
      ``user_profile.csv``, ``behavior_log.csv`` and ``raw_sample.csv``.
    - total_rows: int, number of data rows (header excluded) assumed for
      ``behavior_log.csv``. The default is the count the original author
      measured once, so the file does not have to be scanned on every run.

    Returns (in this order):
    - user_profile: user table, columns renamed to ``user_id`` /
      ``new_user_class_level``, filtered to sampled users
    - raw_sample: sample table, ``user`` renamed to ``user_id``, filtered to
      sampled users
    - behavior_log: leading rows of the behavior log, ``user``/``cate``
      renamed to ``user_id``/``cate_id``
    - ad_feature: ad feature table as read from disk

    Raises:
    - ValueError: if ``frac`` is negative.
    """
    # Fail fast: without this check a negative fraction is only rejected by
    # pandas after the first two CSV files have already been loaded.
    if frac < 0:
        raise ValueError(f"frac must be non-negative, got {frac!r}")

    # Ad features and user profiles are always loaded in full.
    ad_feature = pd.read_csv(f'{root_path}/ad_feature.csv')
    print("[log] ad_feature读取完毕")
    user_profile = pd.read_csv(f"{root_path}/user_profile.csv")
    print("[log] user_profile读取完毕")

    # The raw header has a trailing space in 'new_user_class_level ' and uses
    # 'userid'; normalise both. Missing values are intentionally left as NaN.
    user_profile = user_profile.rename(
        columns={'new_user_class_level ': 'new_user_class_level', 'userid': 'user_id'}
    )
    print("[log] 数据预处理完毕")

    # Read only the first n_sample rows of the behavior log.
    n_sample = int(frac * total_rows)
    print(f">>> 行为日志加载前 {frac*100:.2f}%（{n_sample} 行）...")
    behavior_log = pd.read_csv(f'{root_path}/behavior_log.csv', nrows=n_sample)
    # Use the same key names (user_id, cate_id) as the other tables.
    behavior_log = behavior_log.rename(columns={'user': 'user_id', 'cate': 'cate_id'})
    print("[log] behavior_log加载完毕")

    # Users that occur in the loaded part of the behavior log.
    sampled_users = behavior_log['user_id'].unique()
    print(f">>> 采样用户数量：{len(sampled_users)}")

    # Keep only samples and profiles that belong to those users.
    raw_sample = pd.read_csv(f'{root_path}/raw_sample.csv')
    raw_sample = raw_sample[raw_sample['user'].isin(sampled_users)]
    raw_sample = raw_sample.rename(columns={'user': 'user_id'})
    print("[log] raw_sample读取并筛选完毕")

    user_profile = user_profile[user_profile['user_id'].isin(sampled_users)]
    print("[log] user_profile筛选完毕")

    return user_profile, raw_sample, behavior_log, ad_feature

# Fraction of the behavior log to load
sampling_fraction = 0.001

# Load the four tables, restricted to the users seen in the loaded log portion
user_profile, raw_sample, behavior_log, ad_feature = load_data(frac=sampling_fraction)

# Print the shape of every loaded table
_summaries = (
    (">>> 用户数据：", user_profile),
    (">>> 样本数据：", raw_sample),
    (">>> 采样后的用户行为数据：", behavior_log),
    (">>> 广告特征数据：", ad_feature),
)
for _caption, _frame in _summaries:
    print(_caption, _frame.shape)

# Write the sampled tables to the sample directory (created if missing)
sample_data_path = "G:/DataSets/Ali_Display_Ad_Click/sample_data"
os.makedirs(sample_data_path, exist_ok=True)

_exports = (
    (user_profile, f"{sample_data_path}/user_profile.csv", "[log] user_profile数据已保存"),
    (raw_sample, f"{sample_data_path}/raw_sample.csv", "[log] raw_sample数据已保存"),
    (behavior_log, f"{sample_data_path}/behavior_log.csv", "[log] behavior_log数据已保存"),
    (ad_feature, f"{sample_data_path}/ad_feature.csv", "[log] ad_feature数据已保存"),
)
for _frame, _target, _notice in _exports:
    _frame.to_csv(_target, index=False)  # index is not written to the file
    print(_notice)

# Remove the loop helpers so they do not keep extra references to the frames
del _summaries, _exports, _caption, _frame, _target, _notice

# Show the first rows of the user table
print(user_profile.head())


[log] ad_feature读取完毕
[log] user_profile读取完毕
[log] 数据预处理完毕
>>> 行为日志加载前 0.10%（723268 行）...
[log] behavior_log加载完毕
>>> 采样用户数量：194813
[log] raw_sample读取并筛选完毕
[log] user_profile筛选完毕
>>> 用户数据： (185915, 9)
>>> 样本数据： (7114606, 6)
>>> 采样后的用户行为数据： (723268, 5)
>>> 广告特征数据： (846811, 6)
[log] user_profile数据已保存
[log] raw_sample数据已保存
[log] behavior_log数据已保存
[log] ad_feature数据已保存
    user_id  cms_segid  cms_group_id  final_gender_code  age_level  \
0       234          0             5                  2          5   
2       612          0             8                  1          2   
6      5777         44             5                  2          5   
7      6211          0             9                  1          3   
15    10812          0             4                  2          4   

    pvalue_level  shopping_level  occupation  new_user_class_level  
0            NaN               3           0                   3.0  
2            2.0               3           0                   NaN  
6     

In [33]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

sample_data_path = "G:/DataSets/Ali_Display_Ad_Click/sample_data"

# Read the previously saved sample tables back from disk, one after another
_sources = (
    (f"{sample_data_path}/user_profile.csv", "[log] user_profile数据加载完毕"),
    (f"{sample_data_path}/raw_sample.csv", "[log] raw_sample数据加载完毕"),
    (f"{sample_data_path}/behavior_log.csv", "[log] behavior_log数据加载完毕"),
    (f"{sample_data_path}/ad_feature.csv", "[log] ad_feature数据加载完毕"),
)
_loaded = []
for _path, _notice in _sources:
    _loaded.append(pd.read_csv(_path))
    print(_notice)

user_profile, raw_sample, behavior_log, ad_feature = _loaded
# Remove the helpers so the list does not keep the frames alive
del _sources, _loaded, _path, _notice

# Print the shape of every loaded table
for _caption, _frame in (
    (">>> 用户数据：", user_profile),
    (">>> 样本数据：", raw_sample),
    (">>> 用户行为数据：", behavior_log),
    (">>> 广告特征数据：", ad_feature),
):
    print(_caption, _frame.shape)
del _caption, _frame


[log] user_profile数据加载完毕
[log] raw_sample数据加载完毕
[log] behavior_log数据加载完毕
[log] ad_feature数据加载完毕
>>> 用户数据： (185915, 9)
>>> 样本数据： (7114606, 6)
>>> 用户行为数据： (723268, 5)
>>> 广告特征数据： (846811, 6)


In [34]:
# Preview the first rows of the user profile table
user_profile.head()


Unnamed: 0,user_id,cms_segid,cms_group_id,final_gender_code,age_level,pvalue_level,shopping_level,occupation,new_user_class_level
0,234,0,5,2,5,,3,0,3.0
1,612,0,8,1,2,2.0,3,0,
2,5777,44,5,2,5,2.0,3,0,2.0
3,6211,0,9,1,3,,3,0,2.0
4,10812,0,4,2,4,,2,0,


In [35]:
# Preview the first rows of the sample table
raw_sample.head()


Unnamed: 0,user_id,time_stamp,adgroup_id,pid,nonclk,clk
0,555266,1494307136,11,430539_1007,1,0
1,117840,1494036743,11,430548_1007,1,0
2,623911,1494625301,11,430548_1007,1,0
3,623911,1494451608,11,430548_1007,1,0
4,286630,1494218579,13,430539_1007,1,0


In [36]:
# Preview the first rows of the behavior log
behavior_log.head()


Unnamed: 0,user_id,time_stamp,btag,cate_id,brand
0,558157,1493741625,pv,6250,91286
1,558157,1493741626,pv,6250,91286
2,558157,1493741627,pv,6250,91286
3,728690,1493776998,pv,11800,62353
4,332634,1493809895,pv,1101,365477


In [37]:
# Preview the first rows of the ad feature table
ad_feature.head()

Unnamed: 0,adgroup_id,cate_id,campaign_id,customer,brand,price
0,63133,6406,83237,1,95471.0,170.0
1,313401,6406,83237,1,87331.0,199.0
2,248909,392,83237,1,32233.0,38.0
3,208458,392,83237,1,174374.0,139.0
4,110847,7211,135256,2,145952.0,32.99


#### 2.1.4.2 缺失值&编码

In [38]:
# 查看描述性统计信息和基本信息
# print(">>> 用户数据描述性统计信息")
# print(user_profile.describe(include='all'))  # 包括所有列
# 
# print("\n>>> 样本数据描述性统计信息")
# print(raw_sample.describe(include='all'))
# 
# print("\n>>> 用户行为数据描述性统计信息")
# print(behavior_log.describe(include='all'))
# 
# print("\n>>> 广告特征数据描述性统计信息")
# print(ad_feature.describe(include='all'))

# Missing-value report
def missing_ratio(df, name):
    """
    Print the number and percentage of missing values for each column of ``df``.

    Parameters:
    - df: DataFrame to inspect
    - name: label printed in the report header

    Nothing is returned; the report is only written to stdout.
    """
    print(f"\n>>> {name} 缺失值比例")
    row_count = df.shape[0]
    null_counts = df.isna().sum()
    # Percentage of missing entries per column, rounded to two decimals
    null_percent = null_counts.div(row_count).mul(100).round(2)
    report = pd.DataFrame({'缺失值数量': null_counts, '缺失比例 (%)': null_percent})
    print(report)

# Report missing-value statistics for each of the four tables
for _table, _title in (
    (user_profile, "用户数据"),
    (raw_sample, "样本数据"),
    (behavior_log, "用户行为数据"),
    (ad_feature, "广告特征数据"),
):
    missing_ratio(_table, _title)
del _table, _title

# pvalue_level, new_user_class_level and brand contain missing values; they are left as-is for now



>>> 用户数据 缺失值比例
                      缺失值数量  缺失比例 (%)
user_id                   0      0.00
cms_segid                 0      0.00
cms_group_id              0      0.00
final_gender_code         0      0.00
age_level                 0      0.00
pvalue_level          97605     52.50
shopping_level            0      0.00
occupation                0      0.00
new_user_class_level  47115     25.34

>>> 样本数据 缺失值比例
            缺失值数量  缺失比例 (%)
user_id         0       0.0
time_stamp      0       0.0
adgroup_id      0       0.0
pid             0       0.0
nonclk          0       0.0
clk             0       0.0

>>> 用户行为数据 缺失值比例
            缺失值数量  缺失比例 (%)
user_id         0       0.0
time_stamp      0       0.0
btag            0       0.0
cate_id         0       0.0
brand           0       0.0

>>> 广告特征数据 缺失值比例
              缺失值数量  缺失比例 (%)
adgroup_id        0      0.00
cate_id           0      0.00
campaign_id       0      0.00
customer          0      0.00
brand        246330     29.09
price   

In [39]:
# import os
# import numpy as np
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# 
# # 加载数据函数
# def load_data(frac=1.0):
#     """
#     加载用户、样本、行为日志和广告特征数据，并对行为日志进行按比例随机采样。
#     
#     参数：
#     - frac: float, 采样比例，默认为 1.0（加载全部数据）
#     
#     返回：
#     - user: 用户数据
#     - sample: 样本数据
#     - user_behavior: 用户行为日志数据
#     - ad_feature: 广告特征数据
#     """
#     root_path = "G:/DataSets/Ali_Display_Ad_Click/original_data"
#     
#     # 加载数据
#     raw_sample = pd.read_csv(f'{root_path}/raw_sample.csv')
#     print("raw_sample读取完毕")
#     ad_feature = pd.read_csv(f'{root_path}/ad_feature.csv')
#     print("ad_feature读取完毕")
#     user_profile = pd.read_csv(f"{root_path}/user_profile.csv")
#     print("user_profile读取完毕")
#     
#     # 加载行为日志数据时进行采样
#     print(f"行为日志按 {frac*100:.2f}% 比例采样加载...")
#     behavior_log = pd.read_csv(
#         f'{root_path}/behavior_log.csv', 
#         skiprows=lambda i: i > 0 and np.random.rand() > frac,  # 按随机数过滤
#     )
#     print("behavior_log加载完毕")
#     
#     # 修正列名并处理缺失值
#     user_profile = user_profile.rename({'new_user_class_level ': 'new_user_class_level'}, axis=1)
#     user_profile['new_user_class_level'] = user_profile['new_user_class_level'].fillna(-1)
#     user_profile['pvalue_level'] = user_profile['pvalue_level'].fillna(-1)
#     ad_feature['brand'] = ad_feature['brand'].fillna(-1)
#     
#     return user_profile, raw_sample, behavior_log, ad_feature
# 
# # 设置采样比例
# sampling_fraction = 0.0001
# 
# # 加载数据
# user_profile, raw_sample, behavior_log, ad_feature = load_data(frac=sampling_fraction)
# 
# # 查看加载的数据摘要
# print("用户数据：", user_profile.shape)
# print("样本数据：", raw_sample.shape)
# print("采样后的用户行为数据：", behavior_log.shape)
# print("广告特征数据：", ad_feature.shape)
# 
# # 显示部分数据
# print(user_profile.head())


In [40]:
# # 数据预处理函数
# def data_label_encode(user_behavior, ad_feature):
#     """
#     对类别特征进行编码并清洗行为数据中的无效时间戳。
#     """
#     # 初始化类别编码器
#     cate_lbe = LabelEncoder()
#     cate_ids = np.concatenate((ad_feature['cate_id'].unique(), user_behavior['cate_id'].unique()))
#     cate_lbe.fit(cate_ids)
#     
#     # 对类别特征进行编码
#     ad_feature['cate_id'] = cate_lbe.transform(ad_feature['cate_id']) + 1  # +1是为了避免0值（0往往被当做特殊值处理）
#     user_behavior['cate_id'] = cate_lbe.transform(user_behavior['cate_id']) + 1
#     
#     # 品牌特征编码
#     brand_lbe = LabelEncoder()
#     brand_ids = np.concatenate((ad_feature['brand'].unique(), user_behavior['brand'].unique()))
#     brand_lbe.fit(brand_ids)
#     ad_feature['brand'] = brand_lbe.transform(ad_feature['brand']) + 1
#     user_behavior['brand'] = brand_lbe.transform(user_behavior['brand']) + 1
#     
#     # 移除时间戳为无效值的行为记录
#     user_behavior = user_behavior[user_behavior['time_stamp'] > 0]
#     
#     return user_behavior, ad_feature
# 
# # 数据预处理
# behavior_log, ad_feature = data_label_encode(behavior_log, ad_feature)
# 
# # 查看预处理后的结果
# print("用户行为数据：", behavior_log.shape)
# print("广告特征数据：", ad_feature.shape)
# behavior_log.head()


In [41]:
# def time_transform(df):
#     df['date'] = pd.to_datetime(df['time_stamp'], unit='s')  # unit='s'表示时间戳是秒
#     df['date_ymd'] = df['date'].dt.date
#     df['year'] = df['date'].dt.year
#     df['month'] = df['date'].dt.month
#     df['day'] = df['date'].dt.day
#     df['weekday'] = df['date'].dt.weekday
#     
#     return df

def time_transform(df):
    """
    Replace the ``time_stamp`` column of ``df`` with ``day`` and ``hour`` columns.

    ``time_stamp`` is parsed as seconds since the Unix epoch. The frame is
    modified in place and the same object is returned. A ``date`` column is
    written as an intermediate and removed again, so any pre-existing ``date``
    column is lost.

    Year, month, minute and second are not kept; per the original author's
    note all records are from 2017-05 and finer resolution is not needed.

    NOTE(review): no timezone is applied, so ``day``/``hour`` are UTC-based.
    Confirm whether local time was intended.
    """
    moments = pd.to_datetime(df['time_stamp'], unit='s')  # unit='s': values are seconds
    df['date'] = moments  # intermediate column, dropped below
    df['day'] = moments.dt.day
    df['hour'] = moments.dt.hour
    # Remove the raw timestamp together with the intermediate datetime column
    df.drop(['time_stamp', 'date'], axis=1, inplace=True)

    return df


# # user_behavior_data_pv只保留pv的数据和sample之后的user数据，用于后续的特征工程
# user_behavior_data_pv = behavior_log[behavior_log['btag']=='pv']
# user_behavior_data_pv = user_behavior_data_pv[user_behavior_data_pv['user'].isin(user_profile['user_id'].unique())].reset_index(drop=True)
# user_behavior_data_pv = time_transform(user_behavior_data_pv)
# Replace time_stamp with day/hour columns in both timestamped tables
raw_sample, behavior_log = map(time_transform, (raw_sample, behavior_log))

In [42]:
# # 抽样函数
# def data_sample(frac, user, sample, user_behavior, ad_feature):
#     """
#     按指定比例对用户进行抽样，并筛选与抽样用户相关的行为和样本数据。
#     """
#     # 抽样用户
#     sel_user = user.sample(frac=frac, random_state=1024).reset_index(drop=True)
#     sel_user_ids = sel_user.user_id.unique()
#     
#     # 筛选相关数据
#     sel_sample = sample[sample['user'].isin(sel_user_ids)].reset_index(drop=True)
#     sel_user_behavior = user_behavior[user_behavior['user_id'].isin(sel_user_ids)].reset_index(drop=True)
#     sel_ad_feature = ad_feature.copy()  # 广告特征直接复制即可
#     
#     return sel_user, sel_sample, sel_user_behavior, sel_ad_feature
# 
# # 设置抽样比例
# frac = 0.001
# 
# # 进行数据抽样
# sel_user, sel_sample, sel_user_behavior, sel_ad_feature = data_sample(
#     frac, user, sample, user_behavior, ad_feature)
# 
# # 查看抽样后的结果
# print("抽样后的用户数据：", sel_user.shape)
# print("抽样后的样本数据：", sel_sample.shape)
# print("抽样后的用户行为数据：", sel_user_behavior.shape)


In [43]:
# # 数据保存函数
# def save_data(sel_user, sel_sample, sel_user_behavior, sel_ad_feature):
#     """
#     将抽样和预处理后的数据保存为二进制文件格式。
#     """
#     exit(1)
#     os.makedirs('data1/final_data', exist_ok=True)  # 确保输出目录存在
#     sel_user.to_pickle('data1/final_data/user_data.pkl')
#     sel_sample.to_pickle('data1/final_data/sample_data.pkl')
#     sel_user_behavior.to_pickle('data1/final_data/user_behavior_data.pkl')
#     sel_ad_feature.to_pickle('data1/final_data/ad_data.pkl')
# 
# # 保存数据
# save_data(sel_user, sel_sample, sel_user_behavior, sel_ad_feature)
# 
# print("数据已保存到 data1/final_data 文件夹中。")


In [44]:

# Write the processed tables to the processed-data directory (created if missing)
processed_data_path = "G:/DataSets/Ali_Display_Ad_Click/processed_data"
os.makedirs(processed_data_path, exist_ok=True)

_exports = (
    (user_profile, f"{processed_data_path}/user_profile.csv", "[log] user_profile数据已保存"),
    (raw_sample, f"{processed_data_path}/raw_sample.csv", "[log] raw_sample数据已保存"),
    (behavior_log, f"{processed_data_path}/behavior_log.csv", "[log] behavior_log数据已保存"),
    (ad_feature, f"{processed_data_path}/ad_feature.csv", "[log] ad_feature数据已保存"),
)
for _frame, _target, _notice in _exports:
    _frame.to_csv(_target, index=False)  # index is not written to the file
    print(_notice)

# Remove the loop helpers so they do not keep extra references to the frames
del _exports, _frame, _target, _notice

# Show the first rows of the user table
print(user_profile.head())

[log] user_profile数据已保存
[log] raw_sample数据已保存
[log] behavior_log数据已保存
[log] ad_feature数据已保存
   user_id  cms_segid  cms_group_id  final_gender_code  age_level  \
0      234          0             5                  2          5   
1      612          0             8                  1          2   
2     5777         44             5                  2          5   
3     6211          0             9                  1          3   
4    10812          0             4                  2          4   

   pvalue_level  shopping_level  occupation  new_user_class_level  
0           NaN               3           0                   3.0  
1           2.0               3           0                   NaN  
2           2.0               3           0                   2.0  
3           NaN               3           0                   2.0  
4           NaN               2           0                   NaN  
