# 数据预处理与特征工程

## 数据一览

In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!unzip /content/drive/Shareddrives/137074046的5T空间/competitions/tianchi_competitions/腾讯大数据多任务/wechat_algo_data.zip -d /content/drive/Shareddrives/137074046的5T空间/competitions/tianchi_competitions/腾讯大数据多任务/

In [None]:
# 数据处理工具库
import numpy as np
import pandas as pd

In [None]:
# Directory holding the extracted dataset. Later cells build file names by plain
# string concatenation (path + 'file.csv'), so the trailing slash is required.
path='/content/drive/Shareddrives/137074046的5T空间/competitions/tianchi_competitions/腾讯大数据多任务/wechat_algo_data/'

In [None]:
# Feed table (feed_info.csv); preview the first rows.
feed_df = pd.read_csv(f"{path}feed_info.csv")
feed_df.head(n=3)

In [None]:
# User action table (user_action.csv); preview the first rows.
action_df = pd.read_csv(f'{path}user_action.csv')
action_df.head(n=3)

In [None]:
# Rows to predict (test_a.csv); preview the first rows.
test = pd.read_csv(f'{path}test_a.csv')
test.head(n=3)

## 数据合并与预处理

In [None]:
import os
import copy
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Split a ';'-separated list field and map each token to an integer id.
def split(column):
    """Encode one ';'-separated list field as a list of integer ids.

    Tokens are looked up in the module-level ``key2index`` dict, which must be
    bound before this function is called. Unseen tokens are added to it with
    the next free id; ids start at 1, so 0 never denotes a real token.

    Args:
        column: raw cell value. Anything that is not a ``str`` (for example a
            NaN from a missing value) is treated as an empty list.

    Returns:
        list[int]: ids of the tokens in their original order. Empty tokens
        (from an empty string or a stray/trailing ';') are skipped instead of
        being assigned an id of their own.
    """
    if not isinstance(column, str):
        return []
    ids = []
    for key in column.strip().split(';'):
        if not key:
            # '' is not a real tag/keyword; do not let it consume an id.
            continue
        if key not in key2index:
            key2index[key] = len(key2index) + 1
        ids.append(key2index[key])
    return ids

In [None]:
def preprocess(sample,dense_features):
    '''
    Feature engineering on the merged sample table (modified in place and returned).

    * numeric features: missing values become 0, then log(x + 1) is applied;
    * id features (authorid, bgm_song_id, bgm_singer_id): shifted by +1 so that
      0 can stand for "unknown", missing values become 0, then cast to int.

    Args:
        sample: DataFrame containing the columns in ``dense_features`` plus
            authorid, bgm_song_id, bgm_singer_id and videoplayseconds.
        dense_features: names of the numeric columns to log-transform.

    Returns:
        The same DataFrame object with the columns above transformed.
    '''
    id_features = ["authorid", "bgm_song_id", "bgm_singer_id"]

    # videoplayseconds is always log-transformed, whether or not the caller
    # lists it in dense_features. Build one de-duplicated column list so it is
    # transformed exactly once (listing it used to apply the log twice).
    log_features = list(dense_features)
    if "videoplayseconds" not in log_features:
        log_features.append("videoplayseconds")
    sample[log_features] = np.log(sample[log_features].fillna(0.0) + 1.0)

    # NaN + 1 stays NaN, so ids that are missing still end up as 0 ("unknown").
    sample[id_features] = (sample[id_features] + 1).fillna(0).astype(int)
    return sample

In [None]:
# Append the prediction rows to the action log, tagged with date_ = 15 so they
# can be separated again after feature encoding.
test['date_'] = 15
action_df = pd.concat((action_df, test), axis=0)

In [None]:
# Label columns
target = ["read_comment", "like", "click_avatar", "forward"]
# Sparse (categorical id) features
sparse_features = ['userid', 'feedid', 'authorid', 'bgm_song_id', 'bgm_singer_id']
# Variable-length sequence features
varlen_features = ['manual_tag_list','manual_keyword_list']
# Dense (numeric) features
dense_features = ['videoplayseconds']

In [None]:
# Keep only the feed columns used as features, left-join them onto every
# action row by feedid, run the feature preprocessing, then fix the column order.
feed_df = feed_df.loc[:, [
    'feedid',
    'authorid',
    'videoplayseconds',
    'bgm_song_id',
    'bgm_singer_id',
    'manual_tag_list',
    'manual_keyword_list',
]]
data = pd.merge(action_df, feed_df, on='feedid', how='left')
data = preprocess(data, dense_features)
data = data[[*dense_features, *sparse_features, *varlen_features, 'date_', *target]]

In [None]:
# Encode the variable-length list features.
# split() reads and fills the module-level key2index dict, so a fresh dict is
# bound before each field and a copy of it is stored in `encoder` afterwards.
encoder = {}
for f in ['manual_keyword_list','manual_tag_list']:
    key2index = {}
    f_list = [split(raw_value) for raw_value in data[f].values]
    f_length = np.array([len(ids) for ids in f_list])
    max_len = f_length.max()
    print(f'{f} 字段最长的取值序列长度为 {max_len}')
    # Note: padding is appended at the end of each sequence (padding='post').
    padded = pad_sequences(f_list, padding='post', maxlen=max_len)
    data[f] = list(padded)
    encoder[f] = dict(key2index)

In [None]:
# Encode the sparse id features: each distinct raw value gets a code starting
# at 1, in order of first appearance; values missing from the mapping fall back to 0.
for featid in sparse_features:
    print(f"编码ID字段：{featid}")
    encoder[featid] = {
        raw_id: code
        for code, raw_id in enumerate(data[featid].unique(), start=1)
    }
    data[featid] = data[featid].apply(lambda raw_id: encoder[featid].get(raw_id, 0))

In [None]:
# Preview the encoded table.
data.head(10)

In [None]:
# Quick summary of the assembled table.
print('数据维度：', data.shape)
print('数据字段：', data.columns.tolist())
print('不同的date_取值: ', data['date_'].unique())
# Shuffle the rows; lower `frac` here to subsample when resources are limited.
# A fixed random_state keeps the row order (and any subsample) identical
# across kernel restarts.
data = data.sample(frac = 1.0, random_state = 42)

In [None]:
# Create the data_and_feature folder under `path` (it can also be created by hand).
# The shell form `!mkdir $path+data_and_feature` only expands `$path`, so it
# created a directory literally named "+data_and_feature", and it failed when
# re-run because the directory already existed.
os.makedirs(os.path.join(path, 'data_and_feature'), exist_ok=True)

In [None]:
# Preview the table again after the shuffle above.
data.head(10)

In [None]:
# Build the train / validation / test sets from date_:
#   date_ < 14  -> train
#   date_ == 14 -> validation
#   date_ == 15 -> test (the value assigned to the prediction rows earlier)
# date_ is only the split key, so it is dropped from every subset.
train = data.loc[data['date_'] < 14].drop(columns=['date_'])
val = data.loc[data['date_'] == 14].drop(columns=['date_'])
test = data.loc[data['date_'] == 15].drop(columns=['date_'])

In [None]:
import gc
import joblib
# Drop the large intermediate tables now that train/val/test exist, then force a
# garbage-collection pass.
del action_df, feed_df, data
gc.collect()

In [None]:
# Show only the vocabulary size of each encoded field. The mappings themselves
# hold one entry per distinct id/token and would flood the cell output.
{field: len(mapping) for field, mapping in encoder.items()}

In [None]:
# Persist the three splits and the encoders with joblib.
# The files carry a .txt suffix but are joblib dumps, not plain text.
joblib.dump(train, f'{path}train.txt')
joblib.dump(val, f'{path}val.txt')
joblib.dump(test, f'{path}test.txt')
joblib.dump(encoder, f'{path}encoder.txt')

In [None]:
# Sanity check: (rows, columns) of the training set.
train.shape

In [None]:
# NOTE(review): scratch computation; the result is not used by any cell visible here.
np.sqrt(0.5)