In [None]:
import math
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot
from util import timeit
from joblib import Parallel, delayed
from IPython.core.display import clear_output


# Apply the jtplot (jupyterthemes) style to subsequent matplotlib figures.
jtplot.style()

# Raise pandas' display limits so wide feature frames are not truncated.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)

from IPython.core.display import display, HTML
# Inject CSS that stretches the notebook container to 96% of the page width.
display(HTML("<style>.container { width:96% !important; }</style>"))

In [None]:
# Load the space-separated round-1 files; paths come from the local `config` module.
train_df = pd.read_csv(cf.round1_train_file_path, sep=' ')
test_a_df = pd.read_csv(cf.round1_test_a_file_path, sep=' ')
test_b_df = pd.read_csv(cf.round1_test_b_file_path, sep=' ')
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two test
# splits in the same way (original indexes are kept).
test_df = pd.concat([test_a_df, test_b_df])

# Map every distinct category-list string seen in train to a dense integer id.
category_df = train_df['item_category_list'].unique()
category_ids = pd.DataFrame({'item_category_list': category_df,
                             'item_category_id': np.arange(len(category_df))})
train_df = train_df.merge(category_ids, on='item_category_list')
# NOTE(review): this is an inner join, so test rows whose category list never
# occurs in train are dropped here - confirm that is intended.
test_df = test_df.merge(category_ids, on='item_category_list')

# Timestamps are shifted by +8 hours and -365 days before conversion
# (presumably a timezone / anonymisation correction - TODO confirm).
time_offset = 8 * 60 * 60 - 365 * 24 * 60 * 60


def _add_derived_columns(df):
    """Add the time-derived and bucketed user-level columns to ``df`` in place.

    Adds ``context_datetime``, ``context_day``, ``context_hour``,
    ``user_age_level2`` (1 for levels 1004-1007, otherwise 2) and
    ``user_star_level2`` (1 for -1/3000/3001, 3 for 3010, otherwise 2).

    Returns the same frame so the call can be used in an assignment.
    """
    df['context_datetime'] = pd.to_datetime(df['context_timestamp'] + time_offset, unit='s')
    df['context_day'] = df['context_datetime'].dt.day
    df['context_hour'] = df['context_datetime'].dt.hour
    df['user_age_level2'] = np.where(df['user_age_level'].isin([1004, 1005, 1006, 1007]), 1, 2)
    df['user_star_level2'] = np.select(
        [df['user_star_level'].isin([-1, 3000, 3001]), df['user_star_level'] == 3010],
        [1, 3],
        default=2)
    # Disabled experiment (minute of day):
    # df['context_minustamp'] = df['context_datetime'].map(lambda x: x.hour * 60 + x.minute)
    return df


train_df = _add_derived_columns(train_df)
test_df = _add_derived_columns(test_df)

# Label statistics are computed on train only; label-free distributions use
# train + test, de-duplicated by instance_id.
stat_df = train_df
all_df = pd.concat([train_df, test_df]).drop_duplicates(subset='instance_id')

In [None]:
def min_max_normalize(df, name_list):
    """Min-max scale the given columns of ``df`` with a small smoothing term.

    Each column ``c`` becomes ``(c - min + 0.001) / (max - min + 0.001)``; the
    0.001 keeps the denominator non-zero when a column is constant.

    Args:
        df: DataFrame to modify; the listed columns are replaced in place.
        name_list: iterable of column names to scale (may be a one-shot iterator).

    Returns:
        The same ``df`` object, for chaining.
    """
    for name in name_list:
        column = df[name]
        low = column.min()
        spread = column.max() - low
        # Replace the whole column instead of writing through .loc[:, name]:
        # the result is float, and writing floats into an integer column via
        # .loc is deprecated / rejected by recent pandas versions.
        df[name] = (column - low + 0.001) / (spread + 0.001)
    return df

def min_max_normalize_log(df, name_list):
    """Log-scale the given columns and divide by the log of the column maximum.

    Each column ``c`` becomes ``log(c + 1) / log(max(c) + 1)``.

    Args:
        df: DataFrame to modify; the listed columns are replaced in place.
        name_list: iterable of column names to scale.

    Returns:
        The same ``df`` object, for chaining.
    """
    for name in name_list:
        column = df[name]
        logged = np.log(column + 1)
        denominator = np.log(column.max() + 1)
        if denominator == 0:
            # The column maximum is 0, so dividing would give 0/0 = NaN for
            # every row; keep the (zero) log values instead.
            df[name] = logged
        else:
            df[name] = logged / denominator
    return df

def normalize_log(df, name_list):
    """Replace each listed column ``c`` of ``df`` with ``log(c + 1)``.

    Args:
        df: DataFrame to modify; the listed columns are replaced in place.
        name_list: iterable of column names to transform.

    Returns:
        The same ``df`` object, for chaining.
    """
    for name in name_list:
        # Column replacement (not .loc[:, name]) so an integer column can
        # become float without tripping pandas' dtype-upcast checks.
        df[name] = np.log(df[name] + 1)
    return df

> 建立基础特征数据

In [None]:
def gen_base_feature(df):
    """Select the raw/base feature columns from ``df``.

    ``is_trade`` is placed first when ``df`` has it (train data); frames
    without that column get the remaining columns only.
    """
    label_columns = ['is_trade'] if 'is_trade' in df else []
    context_columns = ['instance_id', 'context_id', 'context_timestamp',
                       'context_day', 'item_property_list']
    id_columns = ['user_id', 'item_id', 'shop_id',
                  'item_brand_id', 'item_city_id', 'item_category_id']
    level_columns = [
        'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level',
        'user_gender_id', 'user_age_level', 'user_age_level2', 'user_occupation_id',
        'user_star_level', 'user_star_level2', 'context_page_id',
        'shop_review_num_level', 'shop_star_level',
        'shop_review_positive_rate', 'shop_score_service',
        'shop_score_delivery', 'shop_score_description',
    ]
    return df[label_columns + context_columns + id_columns + level_columns]

def gen_base_combine_feature(df):
    """Build pairwise cross features from item/user/shop level columns.

    The distinct rows of the source columns are taken, then for every
    (name, first, second) entry a column ``name = first * 24 + second`` is
    appended.

    Returns:
        DataFrame with the de-duplicated source columns followed by the
        combined columns.
    """
    source_columns = [
        'item_id', 'user_id', 'shop_id',
        'item_sales_level', 'item_price_level', 'item_collected_level', 'item_pv_level',
        'user_gender_id', 'user_age_level', 'user_star_level', 'user_occupation_id',
        'shop_review_num_level', 'shop_star_level',
    ]
    # (output column, column multiplied by 24, column added)
    crosses = [
        # item x item
        ('price_sale', 'item_sales_level', 'item_price_level'),
        ('collect_sale', 'item_sales_level', 'item_collected_level'),
        ('collect_price', 'item_price_level', 'item_collected_level'),
        ('collect_pv', 'item_pv_level', 'item_collected_level'),
        ('sale_pv', 'item_pv_level', 'item_sales_level'),
        # user x user, shop x shop
        ('gender_age', 'user_gender_id', 'user_age_level'),
        ('gender_occ', 'user_gender_id', 'user_occupation_id'),
        ('gender_star', 'user_gender_id', 'user_star_level'),
        ('review_star', 'shop_review_num_level', 'shop_star_level'),
        # item price x user
        ('price_gender', 'user_gender_id', 'item_price_level'),
        ('price_occ', 'item_price_level', 'user_occupation_id'),
        ('price_star', 'item_price_level', 'user_star_level'),
    ]

    combined = df[source_columns].drop_duplicates()
    for name, scaled, added in crosses:
        combined[name] = combined[scaled] * 24 + combined[added]
    return combined


def process_intersection_id(train_df, test_df, stat_df = None, all_df = None):
    """Flag which user/item/shop/brand ids occur in both train and test.

    For each id column a ``<column>_is`` column is added: 1 when the id is
    present in both frames, -1 otherwise. ``stat_df`` and ``all_df`` are
    accepted for signature parity with the other ``process_*`` functions and
    are not used.

    Returns:
        (train_flags, test_flags): the distinct id combinations of each frame
        with the flag columns appended, all cast to int64.
    """
    id_columns = ['user_id', 'item_id', 'shop_id', 'item_brand_id']
    train_ids = train_df[id_columns].drop_duplicates()
    test_ids = test_df[id_columns].drop_duplicates()

    def attach_flag(frame, column, shared_ids):
        # Left-join a (column, column_is=1) lookup of the shared ids; ids
        # missing from the lookup come back as NaN and are filled with -1 below.
        lookup = frame.loc[frame[column].isin(shared_ids)][[column]]
        lookup.loc[:, column + '_is'] = 1
        return frame.merge(lookup.drop_duplicates(), how='left')

    for column in id_columns:
        shared_ids = set(train_ids[column].tolist()) & set(test_ids[column].tolist())
        train_ids = attach_flag(train_ids, column, shared_ids)
        test_ids = attach_flag(test_ids, column, shared_ids)

    train_flags = train_ids.fillna(-1).astype('int64')
    test_flags = test_ids.fillna(-1).astype('int64')
    return train_flags, test_flags

    
def process_base_feature(train_df, test_df, stat_df=None, all_df = None):
    """Return the base feature columns for train and test as ``(train, test)``.

    ``stat_df`` and ``all_df`` are unused; they keep the signature uniform
    with the other ``process_*`` functions.
    """
    return gen_base_feature(train_df), gen_base_feature(test_df)

def process_base_combine_feature(train_df, test_df, stat_df=None, all_df = None):
    """Return the combined cross features for train and test as ``(train, test)``.

    A tuple is built eagerly instead of returning ``map(...)``: on Python 3
    ``map`` is lazy, so the work (and any error) would be deferred until the
    caller unpacks it, and the result could be neither pickled nor iterated
    twice. ``stat_df`` and ``all_df`` are unused.
    """
    return gen_base_combine_feature(train_df), gen_base_combine_feature(test_df)

# train_base_ft, test_base_ft = process_base_feature(train_df, test_df)
# train_base_ft1, test_base_ft1 = process_base_combine_feature(train_df, test_df)
# train_id_is, test_id_is = process_intersection_id(train_df, test_df)

> 建立用户特征数据

In [None]:
def gen_user_ot_feature(stat_df, extend_days):
    """Per-day user trade/query counts and trade rates, leaving the target day out.

    For every day in ``stat_df`` plus every day in ``extend_days``, statistics
    are computed from two slices of ``stat_df``: 'all' (every row whose
    ``context_day`` differs from the target day) and 'last' (rows of the
    previous day). For each grouping (user; user + hour) the number of trades,
    the number of rows and their ratio are merged onto the distinct
    (user, hour) rows of the 'all' slice, which are stamped with the target day.

    Args:
        stat_df: labelled frame with ``user_id``, ``context_day``,
            ``context_hour`` and ``is_trade``.
        extend_days: extra days (e.g. the test days) to produce features for.

    Returns:
        DataFrame of the per-day feature rows, de-duplicated. ``*_cnt``
        columns are log-scaled by ``min_max_normalize_log`` within each day.
    """
    stat_user_df = stat_df[['user_id', 'context_day', 'context_hour', 'is_trade']]

    feature_frames = []
    stat_days = set(stat_user_df['context_day'].unique())
    extend_days = set(extend_days)
    for day in stat_days | extend_days:
        dfs = {'all':stat_user_df.loc[stat_user_df['context_day'] != day],
               'last':stat_user_df.loc[stat_user_df['context_day'] == day - 1]
              }
        user_features = dfs['all'].drop(columns=['is_trade']).drop_duplicates()
        user_features['context_day'] = day
        groupbys = {'u' : ['user_id'],
                    'uh' : ['user_id', 'context_hour']}

        for key, groupby in groupbys.items():
            for dkey, user_df in dfs.items():
                if user_df.empty:
                    continue
                # NOTE(review): `key` is rebound here, so on the second pass of
                # this inner loop the prefixes stack (e.g. 'last_all_u' instead
                # of 'last_u'). Kept as-is because it defines the emitted column
                # names - confirm before renaming.
                key = dkey + '_' + key
                trade_cnt_key = 'utrade_' + key + '_cnt'
                query_cnt_key = 'uquery_' + key + '_cnt'
                user_rate = user_df[groupby + ['is_trade']].rename(columns={'is_trade':trade_cnt_key})
                user_rate[query_cnt_key] = 1
                # Summing is_trade gives the trade count; summing the 1s gives the row count.
                user_rate = user_rate.groupby(groupby, as_index=False).sum()
                user_rate['urate_' + key] = user_rate[trade_cnt_key] / user_rate[query_cnt_key]
                user_features = user_features.merge(user_rate, how='left')

        # A real list: filter() is a one-shot iterator on Python 3 and cannot be
        # used as a column indexer and then passed on again.
        cnt_columns = [name for name in user_features.columns if name.endswith('_cnt')]
        if cnt_columns and day not in stat_days:
            # Days outside stat_df exclude nothing from 'all', so their counts
            # cover one more day than in-sample days; scale by (n - 1) / n.
            user_features[cnt_columns] = user_features[cnt_columns] * (len(stat_days) - 1) / len(stat_days)
        user_features = min_max_normalize_log(user_features, cnt_columns)
        feature_frames.append(user_features)

    return pd.concat(feature_frames).drop_duplicates()

def gen_user_prob_feature(stat_df, columns):
    """Count/share features describing how user attributes co-occur.

    For each "parent" column (user_id, gender, age level, occupation) and each
    of its "child" attribute columns, two columns are added per row:
    ``<child>_<tag>_cnt`` - the number of rows sharing that (child, parent)
    pair - and ``<child>_<tag>_prob`` - that count divided by the number of
    rows of the parent value.

    Args:
        stat_df: frame containing ``instance_id`` and every name in ``columns``.
        columns: the user columns to carry into the result; must include
            user_id, user_gender_id, user_age_level, user_occupation_id and
            user_star_level.

    Returns:
        DataFrame with ``columns`` plus the count/prob columns (one row per
        input row); ``*_cnt`` columns are scaled by ``min_max_normalize``.
    """
    def count_rows(frame, keys, name):
        # SeriesGroupBy.agg({'name': 'count'}) (dict renaming) was removed in
        # pandas 1.0; count() followed by rename() gives the same frame.
        counts = frame.groupby(keys, as_index=False)['instance_id'].count()
        return counts.rename(columns={'instance_id': name})

    def add_shares(frame, parent, children, tag, total_name):
        # Rows per parent value, then rows per (child, parent) and their ratio;
        # the temporary parent total is dropped at the end.
        frame = pd.merge(frame, count_rows(frame, [parent], total_name), on=[parent], how='left')
        for col in children:
            cnt_name = str(col) + '_' + tag + '_cnt'
            frame = pd.merge(frame, count_rows(frame, [col, parent], cnt_name),
                             on=[col, parent], how='left')
            frame[str(col) + '_' + tag + '_prob'] = frame[cnt_name] / frame[total_name]
        del frame[total_name]
        return frame

    data = stat_df[['instance_id'] + columns]

    print('用户有多少性别')
    data = add_shares(data, 'user_id',
                      ['user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level'],
                      'user', 'user_cnt')

    print('性别的年龄段，职业有多少')
    data = add_shares(data, 'user_gender_id',
                      ['user_age_level', 'user_occupation_id', 'user_star_level'],
                      'user_gender', 'user_gender_cnt')

    print('user_age_level对应的user_occupation_id，user_star_level')
    data = add_shares(data, 'user_age_level',
                      ['user_occupation_id', 'user_star_level'],
                      'user_age', 'user_age_cnt')

    print('user_occupation_id对应的user_star_level')
    data = add_shares(data, 'user_occupation_id',
                      ['user_star_level'],
                      'user_occ', 'user_occ_cnt')

    del data['instance_id']

    # A list, not filter(): keeps the column set reusable on Python 3.
    cnt_columns = [name for name in data.columns if name.endswith('_cnt')]
    data = min_max_normalize(data, cnt_columns)
    return data


def process_user_ot_feature(train_df, test_df, stat_df, all_df = None):
    """Attach the per-day user trade statistics to train and test.

    The statistics come from ``gen_user_ot_feature(stat_df, <test days>)`` and
    are inner-joined to each frame on their shared columns.

    Returns:
        (train_features, test_features) as an eagerly built tuple; the lazy
        ``map`` object returned on Python 3 deferred the merges and could not
        be pickled or iterated twice. ``all_df`` is unused.
    """
    stat_user_df = gen_user_ot_feature(stat_df, test_df['context_day'].unique())
    base_columns = ['user_id', 'context_day', 'context_hour']
    return tuple(df[base_columns].merge(stat_user_df).drop_duplicates()
                 for df in (train_df, test_df))


def process_user_prob_feature(train_df, test_df, stat_df, all_df):
    """Attach the user attribute count/share features to train and test.

    The features are computed by ``gen_user_prob_feature`` over ``all_df`` and
    inner-joined to each frame on their shared columns. ``stat_df`` is unused.

    Returns:
        (train_features, test_features) as an eagerly built tuple rather than
        a lazy ``map`` object.
    """
    base_columns = ['user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
    stat_user_df = gen_user_prob_feature(all_df, base_columns).drop_duplicates()
    return tuple(df[base_columns].merge(stat_user_df).drop_duplicates()
                 for df in (train_df, test_df))


# train_user_ct1, test_user_ct1 = process_user_ot_feature(train_df, test_df, stat_df)
# train_user_ct2, test_user_ct2 = process_user_prob_feature(train_df, test_df, stat_df, all_df)

> 建立用户-商品特征数据

In [None]:
def gen_user_item_ot_feature(stat_df, extend_days):
    """Per-day user x item-attribute trade/query counts and rates.

    For every day in ``stat_df`` plus every day in ``extend_days``, statistics
    are computed from two slices: 'all' (rows whose ``context_day`` differs
    from the target day) and 'last' (rows of the previous day). For each
    grouping (user with shop / item / brand / category / city / price level)
    the trade count, row count and their ratio are merged onto the distinct
    rows of the 'all' slice, which are stamped with the target day.

    Args:
        stat_df: labelled frame with the user/item id columns,
            ``context_day`` and ``is_trade``.
        extend_days: extra days (e.g. the test days) to produce features for.

    Returns:
        DataFrame of the per-day feature rows, de-duplicated. ``*_cnt``
        columns are log-scaled by ``min_max_normalize_log`` within each day.
    """
    stat_user_item_df = stat_df[['user_id', 'shop_id', 'item_id', 'item_brand_id', 'item_category_id', 'item_city_id', 'item_price_level', 'context_day', 'is_trade']]

    feature_frames = []
    stat_days = set(stat_user_item_df['context_day'].unique())
    extend_days = set(extend_days)
    for day in stat_days | extend_days:
        dfs = {'all':stat_user_item_df.loc[stat_user_item_df['context_day'] != day],
               'last':stat_user_item_df.loc[stat_user_item_df['context_day'] == day - 1]
              }
        user_item_features = dfs['all'].drop(columns=['is_trade']).drop_duplicates()
        user_item_features['context_day'] = day
        groupbys = {'us' : ['user_id', 'shop_id'],
                    'ui' : ['user_id', 'item_id'],
                    'ub' : ['user_id', 'item_brand_id'],
                    'ucg' : ['user_id', 'item_category_id'],
                    'uct' : ['user_id', 'item_city_id'],
                    'uip' : ['user_id', 'item_price_level']}

        for key, groupby in groupbys.items():
            for dkey, user_item_df in dfs.items():
                if user_item_df.empty:
                    continue
                # NOTE(review): `key` is rebound here, so on the second pass of
                # this inner loop the prefixes stack (e.g. 'last_all_us' instead
                # of 'last_us'). Kept as-is because it defines the emitted
                # column names - confirm before renaming.
                key = dkey + '_' + key
                trade_cnt_key = 'uitrade_' + key + '_cnt'
                query_cnt_key = 'uiquery_' + key + '_cnt'
                user_item_rate = user_item_df[groupby + ['is_trade']].rename(columns={'is_trade':trade_cnt_key})
                user_item_rate[query_cnt_key] = 1
                # Summing is_trade gives the trade count; summing the 1s gives the row count.
                user_item_rate = user_item_rate.groupby(groupby, as_index=False).sum()
                user_item_rate['uirate_' + key] = user_item_rate[trade_cnt_key] / user_item_rate[query_cnt_key]
                user_item_features = user_item_features.merge(user_item_rate, how='left')

        # A real list: filter() is a one-shot iterator on Python 3 and cannot be
        # used as a column indexer and then passed on again.
        cnt_columns = [name for name in user_item_features.columns if name.endswith('_cnt')]
        if cnt_columns and day not in stat_days:
            # Days outside stat_df exclude nothing from 'all', so their counts
            # cover one more day than in-sample days; scale by (n - 1) / n.
            user_item_features[cnt_columns] = user_item_features[cnt_columns] * (len(stat_days) - 1) / len(stat_days)
        user_item_features = min_max_normalize_log(user_item_features, cnt_columns)
        feature_frames.append(user_item_features)

    return pd.concat(feature_frames).drop_duplicates()

def gen_user_item_prob_feature(stat_df, columns):
    """Count/share features for user x item, user x shop and shop x item pairs.

    For each "parent" column and each "child" column two columns are added per
    row: ``<child>_<tag>_cnt`` - the number of rows sharing that
    (child, parent) pair - and ``<child>_<tag>_prob`` - that count divided by
    the number of rows of the parent value. Parents are user_id, gender, age
    level and occupation (against item and shop columns), then shop_id and
    shop_review_num_level (against item columns).

    Args:
        stat_df: frame containing ``instance_id`` and every name in ``columns``.
        columns: columns to carry into the result; must include every parent
            and child column used below.

    Returns:
        DataFrame with ``columns`` plus the count/prob columns (one row per
        input row); ``*_cnt`` columns are scaled by ``min_max_normalize``.
    """
    item_cols = ['item_id',
                 'item_brand_id', 'item_city_id', 'item_price_level',
                 'item_sales_level', 'item_collected_level', 'item_pv_level']
    shop_cols = ['shop_id', 'shop_review_num_level', 'shop_star_level']

    def count_rows(frame, keys, name):
        # SeriesGroupBy.agg({'name': 'count'}) (dict renaming) was removed in
        # pandas 1.0; count() followed by rename() gives the same frame.
        counts = frame.groupby(keys, as_index=False)['instance_id'].count()
        return counts.rename(columns={'instance_id': name})

    def add_total(frame, parent, total_name):
        # Number of rows per parent value, broadcast back to every row.
        return pd.merge(frame, count_rows(frame, [parent], total_name), on=[parent], how='left')

    def add_shares(frame, parent, children, tag, total_name):
        # Rows per (child, parent) pair and their share of the parent total.
        for col in children:
            cnt_name = str(col) + '_' + tag + '_cnt'
            frame = pd.merge(frame, count_rows(frame, [col, parent], cnt_name),
                             on=[col, parent], how='left')
            frame[str(col) + '_' + tag + '_prob'] = frame[cnt_name] / frame[total_name]
        return frame

    data = stat_df[['instance_id'] + columns]

    # user x item: the four parent totals are kept until the user x shop
    # section below has used them as well.
    data = add_total(data, 'user_id', 'user_cnt')
    print('一个user有多少item_id,item_brand_id……')
    data = add_shares(data, 'user_id', item_cols, 'user', 'user_cnt')

    print('一个user_gender有多少item_id,item_brand_id……')
    data = add_total(data, 'user_gender_id', 'user_gender_cnt')
    data = add_shares(data, 'user_gender_id', item_cols, 'user_gender', 'user_gender_cnt')

    print('一个user_age_level有多少item_id,item_brand_id……')
    data = add_total(data, 'user_age_level', 'user_age_cnt')
    data = add_shares(data, 'user_age_level', item_cols, 'user_age', 'user_age_cnt')

    print('一个user_occupation_id有多少item_id,item_brand_id…')
    data = add_total(data, 'user_occupation_id', 'user_occ_cnt')
    data = add_shares(data, 'user_occupation_id', item_cols, 'user_occ', 'user_occ_cnt')

    # user x shop
    print('一个user有多少shop_id,shop_review_num_level……')
    data = add_shares(data, 'user_id', shop_cols, 'user', 'user_cnt')
    del data['user_cnt']

    print('一个user_gender有多少shop_id,shop_review_num_level……')
    data = add_shares(data, 'user_gender_id', shop_cols, 'user_gender', 'user_gender_cnt')
    del data['user_gender_cnt']

    print('一个user_age_level有多少shop_id,shop_review_num_level……')
    data = add_shares(data, 'user_age_level', shop_cols, 'user_age', 'user_age_cnt')
    del data['user_age_cnt']

    print('一个user_occupation_id有多少shop_id,shop_review_num_level……')
    data = add_shares(data, 'user_occupation_id', shop_cols, 'user_occ', 'user_occ_cnt')
    del data['user_occ_cnt']

    # shop x item
    print('一个shop有多少item_id,item_brand_id,item_city_id,item_price_level……')
    data = add_total(data, 'shop_id', 'shop_cnt')
    data = add_shares(data, 'shop_id', item_cols, 'shop', 'shop_cnt')
    del data['shop_cnt']

    print('一个shop_review_num_level有多少item_id,item_brand_id,item_city_id,item_price_level……')
    data = add_total(data, 'shop_review_num_level', 'shop_rev_cnt')
    data = add_shares(data, 'shop_review_num_level', item_cols, 'shop_rev', 'shop_rev_cnt')
    del data['shop_rev_cnt']

    del data['instance_id']

    # A list, not filter(): keeps the column set reusable on Python 3.
    cnt_columns = [name for name in data.columns if name.endswith('_cnt')]
    data = min_max_normalize(data, cnt_columns)
    return data


def process_user_item_ot_feature(train_df, test_df, stat_df, all_df = None):
    """Attach the per-day user x item trade statistics to train and test.

    The statistics come from ``gen_user_item_ot_feature(stat_df, <test days>)``
    and are inner-joined to each frame on their shared columns.

    Returns:
        (train_features, test_features) as an eagerly built tuple; the lazy
        ``map`` object returned on Python 3 deferred the merges and could not
        be pickled or iterated twice. ``all_df`` is unused.
    """
    stat_user_item_df = gen_user_item_ot_feature(stat_df, test_df['context_day'].unique())
    base_columns = ['user_id', 'shop_id', 'item_id', 'item_brand_id', 'item_category_id', 'context_day', 'item_city_id', 'item_price_level']
    return tuple(df[base_columns].merge(stat_user_item_df).drop_duplicates()
                 for df in (train_df, test_df))


def process_user_item_prob_feature(train_df, test_df, stat_df, all_df):
    """Attach the user/shop/item count-share features to train and test.

    The features are computed by ``gen_user_item_prob_feature`` over
    ``all_df`` and inner-joined to each frame on their shared columns.
    ``stat_df`` is unused.

    Returns:
        (train_features, test_features) as an eagerly built tuple rather than
        a lazy ``map`` object.
    """
    base_columns = ['user_id', 'item_id', 'item_brand_id','item_city_id','item_price_level', 'user_age_level', 'shop_id', 'shop_star_level',
                    'user_gender_id', 'user_occupation_id', 'item_sales_level','item_collected_level','item_pv_level', 'shop_review_num_level']
    stat_user_item_df = gen_user_item_prob_feature(all_df, base_columns).drop_duplicates()
    return tuple(df[base_columns].merge(stat_user_item_df).drop_duplicates()
                 for df in (train_df, test_df))

# train_user_ft, test_user_ft = process_user_item_ot_feature(train_df, test_df, stat_df)
# train_user_ft, test_user_ft = process_user_item_prob_feature(train_df, test_df, stat_df, all_df)

> 建立商品特征数据

In [None]:
def gen_item_ot_feature(stat_df, extend_days):
    """Per-day item/brand/category/city trade/query counts and rates.

    For every day in ``stat_df`` plus every day in ``extend_days``, statistics
    are computed from two slices: 'all' (rows whose ``context_day`` differs
    from the target day) and 'last' (rows of the previous day). For each
    grouping (item, brand, category, city, and the first three crossed with
    hour and with user gender) the trade count, row count and their ratio are
    merged onto the distinct rows of the 'all' slice, which are stamped with
    the target day.

    Args:
        stat_df: labelled frame with the item id columns, ``context_day``,
            ``context_hour``, ``user_gender_id`` and ``is_trade``.
        extend_days: extra days (e.g. the test days) to produce features for.

    Returns:
        DataFrame of the per-day feature rows, de-duplicated. ``*_cnt``
        columns are log-scaled by ``min_max_normalize_log`` within each day.
    """
    stat_item_df = stat_df[['item_id', 'item_brand_id', 'item_category_id', 'item_city_id', 'context_day', 'context_hour', 'user_gender_id', 'is_trade']]

    feature_frames = []
    stat_days = set(stat_item_df['context_day'].unique())
    extend_days = set(extend_days)
    for day in stat_days | extend_days:
        dfs = {'all':stat_item_df.loc[stat_item_df['context_day'] != day],
               'last':stat_item_df.loc[stat_item_df['context_day'] == day - 1]
              }
        item_features = dfs['all'].drop(columns=['is_trade']).drop_duplicates()
        item_features['context_day'] = day
        groupbys = {'i' : ['item_id'],
                    'ib' : ['item_brand_id'],
                    'icg' : ['item_category_id'],
                    'ict' : ['item_city_id'],
                    'ih' : ['item_id', 'context_hour'],
                    'ibh' : ['item_brand_id', 'context_hour'],
                    'icgh' : ['item_category_id', 'context_hour'],
                    'iug' : ['item_id', 'user_gender_id'],
                    'ibug' : ['item_brand_id', 'user_gender_id'],
                    'icug' : ['item_category_id', 'user_gender_id']}
        for key, groupby in groupbys.items():
            for dkey, item_df in dfs.items():
                if item_df.empty:
                    continue
                # NOTE(review): `key` is rebound here, so on the second pass of
                # this inner loop the prefixes stack (e.g. 'last_all_i' instead
                # of 'last_i'). Kept as-is because it defines the emitted column
                # names - confirm before renaming.
                key = dkey + '_' + key
                trade_cnt_key = 'itrade_' + key + '_cnt'
                query_cnt_key = 'iquery_' + key + '_cnt'
                item_rate = item_df[groupby + ['is_trade']].rename(columns={'is_trade':trade_cnt_key})
                item_rate[query_cnt_key] = 1
                # Summing is_trade gives the trade count; summing the 1s gives the row count.
                item_rate = item_rate.groupby(groupby, as_index=False).sum()
                item_rate['irate_' + key] = item_rate[trade_cnt_key] / item_rate[query_cnt_key]
                item_features = item_features.merge(item_rate, how='left')

        # A real list: filter() is a one-shot iterator on Python 3 and cannot be
        # used as a column indexer and then passed on again.
        cnt_columns = [name for name in item_features.columns if name.endswith('_cnt')]
        if cnt_columns and day not in stat_days:
            # Days outside stat_df exclude nothing from 'all', so their counts
            # cover one more day than in-sample days; scale by (n - 1) / n.
            item_features[cnt_columns] = item_features[cnt_columns] * (len(stat_days) - 1) / len(stat_days)
        item_features = min_max_normalize_log(item_features, cnt_columns)
        feature_frames.append(item_features)

    return pd.concat(feature_frames).drop_duplicates()

def gen_item_property_feature(df):
    """Split ``item_property_list`` into its first five ';'-separated entries.

    Works on the distinct property-list strings of ``df`` and adds columns
    ``property_0`` .. ``property_4``; positions beyond the end of a list are
    filled with a single space.
    """
    properties = df[['item_property_list']].drop_duplicates()
    # Split each string once and reuse the pieces for all five positions.
    pieces = properties['item_property_list'].map(lambda text: text.split(";"))
    for position in range(5):
        properties['property_%d' % position] = pieces.map(
            lambda parts, position=position: parts[position] if len(parts) > position else " ")
    return properties


def gen_item_prob_feature(stat_df, columns):
    """Count/share features describing how item attributes co-occur.

    For each "parent" column (item, brand, city, price level, sales level,
    collected level) and each of its "child" columns two columns are added per
    row: ``<child>_<tag>_cnt`` - the number of rows sharing that
    (child, parent) pair - and ``<child>_<tag>_prob`` - that count divided by
    the number of rows of the parent value.

    Args:
        stat_df: frame containing ``instance_id`` and every name in ``columns``.
        columns: the item columns to carry into the result; must include every
            parent and child column used below.

    Returns:
        DataFrame with ``columns`` plus the count/prob columns (one row per
        input row); ``*_cnt`` columns are scaled by ``min_max_normalize``.
    """
    def count_rows(frame, keys, name):
        # SeriesGroupBy.agg({'name': 'count'}) (dict renaming) was removed in
        # pandas 1.0; count() followed by rename() gives the same frame.
        counts = frame.groupby(keys, as_index=False)['instance_id'].count()
        return counts.rename(columns={'instance_id': name})

    def add_shares(frame, parent, children, tag, total_name):
        # Rows per parent value, then rows per (child, parent) and their ratio;
        # the temporary parent total is dropped at the end.
        frame = pd.merge(frame, count_rows(frame, [parent], total_name), on=[parent], how='left')
        for col in children:
            cnt_name = str(col) + '_' + tag + '_cnt'
            frame = pd.merge(frame, count_rows(frame, [col, parent], cnt_name),
                             on=[col, parent], how='left')
            frame[str(col) + '_' + tag + '_prob'] = frame[cnt_name] / frame[total_name]
        del frame[total_name]
        return frame

    print('一个item有多少brand,price salse collected level……')
    data = stat_df[['instance_id'] + columns]
    data = add_shares(data, 'item_id',
                      ['item_brand_id', 'item_city_id', 'item_price_level',
                       'item_sales_level', 'item_collected_level', 'item_pv_level'],
                      'item', 'item_cnt')

    print('一个brand有多少price salse collected level……')
    data = add_shares(data, 'item_brand_id',
                      ['item_city_id', 'item_price_level',
                       'item_sales_level', 'item_collected_level', 'item_pv_level'],
                      'brand', 'item_brand_cnt')

    print('一个city有多少item_price_level，item_sales_level，item_collected_level，item_pv_level')
    data = add_shares(data, 'item_city_id',
                      ['item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level'],
                      'city', 'item_city_cnt')

    print('一个price有多少item_sales_level，item_collected_level，item_pv_level')
    # The pair counts here are grouped by item_price_level. Earlier code
    # grouped them by item_city_id, which made *_price_cnt a copy of
    # *_city_cnt and divided it by an unrelated per-price total.
    data = add_shares(data, 'item_price_level',
                      ['item_sales_level', 'item_collected_level', 'item_pv_level'],
                      'price', 'item_price_cnt')

    print('一个item_sales_level有多少item_collected_level，item_pv_level')
    data = add_shares(data, 'item_sales_level',
                      ['item_collected_level', 'item_pv_level'],
                      'salse', 'item_salse_cnt')

    print('一个item_collected_level有多少item_pv_level')
    data = add_shares(data, 'item_collected_level',
                      ['item_pv_level'],
                      'coll', 'item_coll_cnt')

    del data['instance_id']

    # A list, not filter(): keeps the column set reusable on Python 3.
    cnt_columns = [name for name in data.columns if name.endswith('_cnt')]
    data = min_max_normalize(data, cnt_columns)
    return data

def process_item_prob_feature(train_df, test_df, stat_df, all_df):
    """Attach the item attribute count/share features to train and test.

    The features are computed by ``gen_item_prob_feature`` over ``all_df`` and
    inner-joined to each frame on their shared columns. ``stat_df`` is unused.

    Returns:
        (train_features, test_features) as an eagerly built tuple rather than
        a lazy ``map`` object.
    """
    base_columns = ['item_id', 'item_brand_id','item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level']
    stat_item_df = gen_item_prob_feature(all_df, base_columns).drop_duplicates()
    return tuple(df[base_columns].merge(stat_item_df).drop_duplicates()
                 for df in (train_df, test_df))

def process_item_ot_feature(train_df, test_df, stat_df, all_df = None):
    """Attach the per-day item trade statistics to train and test.

    The statistics come from ``gen_item_ot_feature(stat_df, <test days>)`` and
    are inner-joined to each frame on their shared columns.

    Returns:
        (train_features, test_features) as an eagerly built tuple; the lazy
        ``map`` object returned on Python 3 deferred the merges and could not
        be pickled or iterated twice. ``all_df`` is unused.
    """
    stat_item_df = gen_item_ot_feature(stat_df, test_df['context_day'].unique())
    base_columns = ['item_id', 'item_brand_id', 'item_category_id', 'item_city_id', 'context_day', 'context_hour', 'user_gender_id']
    return tuple(df[base_columns].merge(stat_item_df).drop_duplicates()
                 for df in (train_df, test_df))

def process_item_property_feature(train_df, test_df, stat_df = None, all_df = None):
    """Return the split item-property columns for train and test as ``(train, test)``.

    Built eagerly as a tuple instead of a lazy ``map`` object. ``stat_df`` and
    ``all_df`` are unused.
    """
    return gen_item_property_feature(train_df), gen_item_property_feature(test_df)

# train_item_ft, test_item_ft = process_item_ot_feature(train_df, test_df, stat_df)
# train_item_ft, test_item_ft = process_item_property_feature(train_df, test_df, stat_df)
# train_item_ft, test_item_ft = process_item_prob_feature(train_df, test_df, stat_df, all_df)

> 建立店铺特征

In [None]:
def gen_shop_ot_feature(stat_df, extend_days):
    """Build per-day shop trade/query statistics from the other days' data.

    For every day in ``stat_df`` plus every day in ``extend_days`` two subsets
    of ``stat_df`` are aggregated:

    * ``all``  - every row whose ``context_day`` differs from the target day;
    * ``last`` - the rows of the previous day (``day - 1``), skipped if empty.

    Each subset is grouped by shop (``s``), shop+hour (``sh``), shop+gender
    (``sug``) and shop+age bucket (``sua``).  For a subset ``d`` and grouping
    ``g`` the columns ``strade_<d>_<g>_cnt`` (sum of ``is_trade``),
    ``squery_<d>_<g>_cnt`` (row count) and ``srate_<d>_<g>`` (their ratio) are
    left-joined onto the distinct shop/context rows of the ``all`` subset.

    Parameters
    ----------
    stat_df : DataFrame
        Must contain ``shop_id``, ``context_day``, ``context_hour``,
        ``user_gender_id``, ``user_age_level2`` and ``is_trade``.
    extend_days : iterable
        Extra days (e.g. the test days) to generate features for.

    Returns
    -------
    DataFrame
        Concatenation of the per-day feature frames, de-duplicated.
    """
    stat_shop_df = stat_df[['shop_id', 'context_day', 'context_hour', 'user_gender_id', 'user_age_level2', 'is_trade']]

    # Ordered pairs instead of dicts, so column names and column order never
    # depend on dict iteration order.
    groupbys = [('s', ['shop_id']),
                ('sh', ['shop_id', 'context_hour']),
                ('sug', ['shop_id', 'user_gender_id']),
                ('sua', ['shop_id', 'user_age_level2'])]

    feature_frames = []
    stat_days = set(stat_shop_df['context_day'].unique())
    extend_days = set(extend_days)
    for day in stat_days | extend_days:
        all_df = stat_shop_df.loc[stat_shop_df['context_day'] != day]
        last_df = stat_shop_df.loc[stat_shop_df['context_day'] == day - 1]
        dfs = [('all', all_df), ('last', last_df)]

        shop_features = all_df.drop(columns=['is_trade']).drop_duplicates()
        shop_features.loc[:, 'context_day'] = day

        for key, groupby in groupbys:
            for dkey, shop_df in dfs:
                if shop_df.empty:
                    continue
                # Use a dedicated name: reassigning `key` here used to leak the
                # previous subset's prefix into the next one ("last_all_s").
                feature_key = dkey + '_' + key
                trade_cnt_key = 'strade_' + feature_key + '_cnt'
                query_cnt_key = 'squery_' + feature_key + '_cnt'
                shop_rate = shop_df[groupby + ['is_trade']].rename(columns={'is_trade': trade_cnt_key})
                shop_rate.loc[:, query_cnt_key] = 1
                shop_rate = shop_rate.groupby(groupby, as_index=False).sum()
                shop_rate.loc[:, 'srate_' + feature_key] = shop_rate[trade_cnt_key] / shop_rate[query_cnt_key]
                # Default merge keys: the shared columns, i.e. the group-by columns.
                shop_features = shop_features.merge(shop_rate, how='left')

        # Materialise as a list: the names are used twice below, and a lazy
        # `filter` object would be exhausted after the first use on Python 3.
        cnt_columns = [col for col in shop_features.columns.values if col.endswith('_cnt')]
        if day not in stat_days:
            # A day outside the statistics window sees all len(stat_days) days in
            # its "all" subset instead of len(stat_days) - 1; rescale the counts.
            shop_features.loc[:, cnt_columns] = shop_features.loc[:, cnt_columns] * (len(stat_days) - 1) / len(stat_days)
        shop_features = min_max_normalize_log(shop_features, cnt_columns)
        feature_frames.append(shop_features)

    return pd.concat(feature_frames).drop_duplicates()
    
    

def process_shop_score_qcut_feature(train_df, test_df, stat_df=None, all_df=None):
    """Bucket the shop score columns into quantile bins shared by train and test.

    Bin edges are computed with ``pd.qcut`` on the concatenation of the train
    and test rows (duplicate edges dropped), then applied to each frame with
    ``pd.cut``.  Every score column ``c`` gains an integer column ``c_qcut``
    holding the zero-based bin index.

    Compared with cutting against fixed string labels:

    * ``include_lowest=True`` keeps the minimum value inside the first bin
      instead of turning it into NaN (which cannot be cast to ``int``);
    * ``labels=False`` returns the bin index directly, so the number of bins
      that survive ``duplicates='drop'`` no longer has to match a hard-coded
      label count.

    ``stat_df`` and ``all_df`` are unused; they keep the ``process_*`` signature.

    Returns
    -------
    (DataFrame, DataFrame)
        De-duplicated train and test frames with ``shop_id``, the four raw
        score columns and the four ``*_qcut`` columns.
    """
    # (column, requested number of quantiles), in output column order.
    quantile_spec = [('shop_review_positive_rate', 24),
                     ('shop_score_service', 11),
                     ('shop_score_delivery', 11),
                     ('shop_score_description', 11)]
    columns = ['shop_id'] + [name for name, _ in quantile_spec]

    # Explicit copies: new columns are added below and must not touch the inputs.
    tr_shop_df = train_df[columns].copy()
    te_shop_df = test_df[columns].copy()
    a_shop_df = pd.concat([tr_shop_df, te_shop_df])

    for name, quantiles in quantile_spec:
        _, bins = pd.qcut(a_shop_df[name], quantiles, retbins=True, duplicates='drop')
        for shop_df in (tr_shop_df, te_shop_df):
            shop_df[name + '_qcut'] = pd.cut(shop_df[name], bins=bins, labels=False,
                                             include_lowest=True).astype(int)

    return tr_shop_df.drop_duplicates(), te_shop_df.drop_duplicates()

def process_shop_ot_feature(train_df, test_df, stat_df, all_df=None):
    """Attach the over-time shop statistics to the train and test shop keys.

    ``gen_shop_ot_feature`` is fed ``stat_df`` and the distinct ``context_day``
    values of ``test_df``; its output is joined back on the columns it shares
    with ``base_columns`` (pandas' default merge keys).  ``all_df`` is unused.

    Returns a two-element sequence: (train features, test features).
    """
    test_days = test_df['context_day'].unique()
    shop_stats = gen_shop_ot_feature(stat_df, test_days)
    base_columns = ['shop_id', 'context_day', 'context_hour', 'user_gender_id', 'user_age_level2']
    return [frame[base_columns].merge(shop_stats).drop_duplicates()
            for frame in (train_df, test_df)]

# train_shop_ct1, test_shop_ct1 = process_shop_score_qcut_feature(train_df, test_df)
# Eagerly build the over-time shop features for the train and test frames.
# NOTE(review): `process_shop_ot_feature` is also listed in `fast_proc_func_list`
# further down, so this work is repeated there; `train_shop_ct2` / `test_shop_ct2`
# are not read again in the visible cells - confirm whether this call is still needed.
train_shop_ct2, test_shop_ct2 = process_shop_ot_feature(train_df, test_df, stat_df)

> 建立上下文特征数据

In [None]:
def predict_category_hit(row):
    """Return the (capped) rank at which an item category is first predicted.

    ``row['predict_category_property']`` and ``row['item_category_list']`` are
    both ';'-separated strings.  Item categories are scanned from the second
    level onward; for each one the predictions are scanned in order, and the
    first prediction that contains the category as a substring decides the
    result: its position, capped at 5 so that rare high positions share a bucket.

    Returns 0 when nothing matches - the same value as a match at position 0.
    """
    predictions = row['predict_category_property'].split(';')
    # Skip the top-level category; only the more specific levels are compared.
    sub_categories = row['item_category_list'].split(';')[1:]
    for category in sub_categories:
        for position, prediction in enumerate(predictions):
            if category in prediction:
                return min(position, 5)
    return 0

def gen_context_time_feature(df):
    """Derive time-of-day features plus per-user, per-item and per-hour query counts.

    Parameters
    ----------
    df : DataFrame
        Must contain ``user_id``, ``item_id``, ``context_id``,
        ``context_timestamp``, ``context_datetime``, ``context_hour``,
        ``context_page_id`` and ``context_day``.

    Returns
    -------
    DataFrame
        The selected context columns with the derived time columns, joined
        with the user, item and hourly query-count frames (each normalised by
        ``min_max_normalize``), with ``context_datetime`` and
        ``context_timestamp`` dropped and duplicate rows removed.
    """
    context_df = df[['user_id', 'item_id', 'context_id', 'context_timestamp', 'context_datetime', 'context_hour', 'context_page_id', 'context_day']]
    # Calendar / clock features derived from the datetime column.
    # NOTE(review): context_df is a column selection of df; assigning via .loc
    # below may emit SettingWithCopyWarning - confirm it is a copy.
    context_df.loc[:,'context_week'] = context_df.loc[:,'context_datetime'].map(lambda x:x.weekday())
    context_df.loc[:,'context_minute'] = context_df.loc[:,'context_datetime'].map(lambda x:x.minute)
    # Hour of day with the minutes as a fraction.
    context_df.loc[:,'context_tmhour'] = context_df.loc[:,'context_hour'] + context_df.loc[:,'context_minute'] / 60
    # NOTE(review): `x / 5` and `x / 20` on the integer minute are floor divisions
    # only under Python 2; on Python 3 they yield floats. Confirm the intended
    # semantics (bucket index vs. scaled value) before porting.
    context_df.loc[:,'context_minute_5'] = context_df.loc[:,'context_minute'].map(lambda x:x / 5)
    context_df.loc[:,'context_tmhour_5'] = context_df.loc[:,'context_tmhour'].map(lambda x:x / 5)
    context_df.loc[:,'context_minute_20'] = context_df.loc[:,'context_minute'].map(lambda x:x / 20)
    context_df.loc[:,'context_tmhour_20'] = context_df.loc[:,'context_tmhour'].map(lambda x:x / 20)
    # Cyclic encoding of the time of day over a 24-hour period, shifted by 12 hours.
    context_df.loc[:,'context_tmhour_sin'] = context_df.loc[:,'context_tmhour'].map(lambda x: math.sin((x-12)/24*2*math.pi))
    context_df.loc[:,'context_tmhour_cos'] = context_df.loc[:,'context_tmhour'].map(lambda x: math.cos((x-12)/24*2*math.pi))
    # 1 for weekday index < 5, 2 otherwise; derived from the weekday only, no
    # holiday calendar is consulted here.
    context_df.loc[:,'context_isworkday'] = context_df.loc[:,'context_week'].map(lambda x: 1 if x < 5 else 2)
    
    # Number of queries per user per day.
    user_query_day = context_df[['user_id', 'context_day']]
    user_query_day.loc[:,'u_day_query_cnt'] = 1
    user_query_day = user_query_day.groupby(['user_id', 'context_day'], as_index=False).sum()

    # Number of queries per user per day and hour.
    user_query_hour = context_df[['user_id', 'context_day', 'context_hour']]
    user_query_hour.loc[:,'u_hour_query_cnt'] = 1
    user_query_hour = user_query_hour.groupby(['user_id', 'context_day', 'context_hour'], as_index=False).sum()
   
    # Default merge keys are the shared columns (user_id, context_day).
    user_query_features = user_query_hour.merge(user_query_day)
    # NOTE(review): `filter` returns a list on Python 2 but a one-shot iterator
    # on Python 3; verify how min_max_normalize consumes it before porting.
    cnt_columns = filter(lambda x:x.endswith('_cnt'), user_query_features.columns.values)
    user_query_features = min_max_normalize(user_query_features, cnt_columns)
    
    # Number of queries per item per day.
    item_query_day = context_df[['item_id', 'context_day']]
    item_query_day.loc[:,'i_day_query_cnt'] = 1
    item_query_day = item_query_day.groupby(['item_id', 'context_day'], as_index=False).sum()

    # Number of queries per item per day and hour.
    item_query_hour = context_df[['item_id', 'context_day', 'context_hour']]
    item_query_hour.loc[:,'i_hour_query_cnt'] = 1
    item_query_hour = item_query_hour.groupby(['item_id', 'context_day', 'context_hour'], as_index=False).sum()
   
    # Default merge keys are the shared columns (item_id, context_day).
    item_query_features = item_query_hour.merge(item_query_day)
    cnt_columns = filter(lambda x:x.endswith('_cnt'), item_query_features.columns.values)
    item_query_features = min_max_normalize(item_query_features, cnt_columns)
    
    # Total queries per hour, computed and normalised separately for each day.
    feature_frames = []
    for name, day_df in context_df.groupby('context_day', as_index=False):
        query_hour = day_df[['context_hour']]
        query_hour.loc[:,'hour_query_cnt'] = 1
        query_hour = query_hour.groupby('context_hour', as_index=False).sum()
        # `name` is the group key, i.e. the context_day value of this group.
        query_hour.loc[:,'context_day'] = name
        
        query_features = query_hour.drop_duplicates()
        cnt_columns = filter(lambda x:x.endswith('_cnt'), query_features.columns.values)
        query_features = min_max_normalize(query_features, cnt_columns)
        feature_frames.append(query_features)
        
    query_features = pd.concat(feature_frames).drop_duplicates()
    # Each merge joins on the columns the two frames have in common.
    return context_df.merge(user_query_features).merge(item_query_features).merge(query_features).drop(columns=['context_datetime', 'context_timestamp']).drop_duplicates()

def gen_context_predict_feature(df):
    """Compute ``category_predict_hit`` for every row of ``df``.

    ``predict_category_hit`` is applied row-wise to the category and
    predicted-category columns; the resulting series is joined (by index) onto
    the ``context_id`` column.

    Returns a frame with ``context_id`` and ``category_predict_hit``.
    """
    hits = df[['item_category_list', 'predict_category_property']].apply(predict_category_hit, axis=1)
    hits.name = 'category_predict_hit'
    return df[['context_id']].join(hits)

def process_context_time_feature(train_df, test_df, stat_df=None, all_df=None):
    """Run ``gen_context_time_feature`` on the train and test frames.

    ``stat_df`` and ``all_df`` are ignored; they keep the common ``process_*``
    signature.  Returns (train features, test features).
    """
    return [gen_context_time_feature(frame) for frame in (train_df, test_df)]

def process_context_predict_feature(train_df, test_df, stat_df=None, all_df=None):
    """Run ``gen_context_predict_feature`` on the train and test frames.

    ``stat_df`` and ``all_df`` are ignored; they keep the common ``process_*``
    signature.  Returns (train features, test features).
    """
    return [gen_context_predict_feature(frame) for frame in (train_df, test_df)]

# train_item_ct1, test_item_ct1 = process_context_time_feature(train_df, test_df)
# train_item_ct2, test_item_ct2 = process_context_predict_feature(train_df, test_df)
# TODO: 建立上下文预测属性数据

In [None]:
def rolling_count_by_group(grp, args):
    """Count queries inside time windows around every row of one group.

    Parameters
    ----------
    grp : DataFrame
        Rows of a single group; must contain the ``by`` column,
        ``context_timestamp`` and ``context_id``.
    args : sequence
        ``[by, windows]`` - the grouping column name and a list of window
        lengths in the unit of ``context_timestamp``.  A positive window ``w``
        counts rows with timestamp in ``[t, t + w)``; a non-positive window
        counts rows in ``[t + w, t)`` (so it excludes the row itself).

    Returns
    -------
    DataFrame
        One row per ``(by, context_timestamp)`` with a float column
        ``<by[:4]>_qr_<window>_cnt`` per window.  Only rows with a non-null
        ``context_id`` are counted.
    """
    by = args[0]
    windows = args[1]
    grp = grp.sort_values("context_timestamp")

    timestamps = grp["context_timestamp"].values
    # prefix[i] = number of rows with a non-null context_id among the first i
    # sorted rows, so any half-open index range is counted with one subtraction.
    has_context = grp["context_id"].notnull().values.astype(np.int64)
    prefix = np.concatenate(([0], np.cumsum(has_context)))
    # Index of the first row whose timestamp is >= each row's own timestamp.
    at_current = np.searchsorted(timestamps, timestamps, side='left')

    frame_names = []
    for window in windows:
        frame_name = by[:4] + '_qr_' + str(window) + '_cnt'
        at_shifted = np.searchsorted(timestamps, timestamps + window, side='left')
        if window > 0:
            lower, upper = at_current, at_shifted   # [t, t + window)
        else:
            lower, upper = at_shifted, at_current   # [t + window, t)
        # Binary search replaces a boolean mask per row (O(n log n) vs O(n^2)).
        grp[frame_name] = (prefix[upper] - prefix[lower]).astype(float)
        frame_names.append(frame_name)

    # Rows sharing a timestamp carry identical counts; max() just collapses them.
    # A real list is required as column selector (a `map` object is not on Python 3).
    return grp.groupby([by, 'context_timestamp'], as_index=False)[frame_names].max()

@timeit
def query_rolling_rate(df, by, windows):
    """Apply ``rolling_count_by_group`` to every ``by`` group of ``df``.

    ``by`` and ``windows`` are forwarded to the per-group function packed as
    the single positional argument ``[by, windows]``.  Returns whatever
    ``groupby(...).apply`` assembles from the per-group results.
    """
    return df.groupby(by).apply(rolling_count_by_group, [by, windows])

def gen_context_time_rolling_feature(df):
    """Build rolling query-count features per item and per user, day by day.

    For every ``context_day`` the item-level and user-level rolling counts are
    computed with ``query_rolling_rate``, merged on their shared columns,
    de-duplicated and normalised with ``min_max_normalize``.  The per-day
    frames are then concatenated and de-duplicated.

    NOTE(review): the two rolling frames appear to share only
    ``context_timestamp`` as a column, so the default merge joins on the
    timestamp alone - confirm this is intended.
    """
    # Look-back (negative) and look-ahead (positive) windows, in the unit of
    # context_timestamp.
    windows = [-300, 300, -1200, 1200, -3600, 3600]
    context_df = df[['user_id', 'item_id', 'context_id', 'context_timestamp', 'context_day']]

    per_day_frames = []
    for _, day_df in context_df.groupby('context_day', as_index=False):
        item_rolling = query_rolling_rate(
            day_df[['item_id', 'context_id', 'context_timestamp', 'context_day']], 'item_id', windows)
        user_rolling = query_rolling_rate(
            day_df[['user_id', 'context_id', 'context_timestamp', 'context_day']], 'user_id', windows)

        day_features = item_rolling.merge(user_rolling).drop_duplicates()
        cnt_columns = [col for col in day_features.columns.values if col.endswith('_cnt')]
        per_day_frames.append(min_max_normalize(day_features, cnt_columns))

    return pd.concat(per_day_frames).drop_duplicates()

def process_context_time_rolling_feature(train_df, test_df, stat_df=None, all_df=None):
    """Run ``gen_context_time_rolling_feature`` on the train and test frames.

    ``stat_df`` and ``all_df`` are ignored; they keep the common ``process_*``
    signature.  Returns (train features, test features).
    """
    return [gen_context_time_rolling_feature(frame) for frame in (train_df, test_df)]

# train_item_ctr, test_item_ctr = process_context_time_rolling_feature(train_df, test_df)

### 开始处理特征数据

In [None]:
# Row counts of the train and test frames (single-argument print() produces the
# same output on Python 2 and Python 3).
print('%d %d' % (len(train_df), len(test_df)))

# Distinct users in each frame and how many appear in both.
a = set(train_df['user_id'].tolist())
b = set(test_df['user_id'].tolist())
print('train user count : %d, test user count : %d, both user count : %d' %(len(a), len(b), len(a & b)))

In [None]:
def process_train_test_features(tr_df, te_df, st_df, all_df, proc_func_list):
    """Build the train/test feature frames by left-joining each feature group.

    Starts from ``process_base_feature(tr_df, te_df)`` and, for every function
    in ``proc_func_list``, calls ``proc_func(tr_df, te_df, st_df, all_df)`` and
    left-merges the returned train/test features on the shared columns.
    Progress is printed, with the cell output cleared between steps.

    Parameters
    ----------
    tr_df, te_df : DataFrame
        Raw train and test frames.
    st_df, all_df : DataFrame
        Passed through unchanged to every feature function.
    proc_func_list : iterable of callables
        Feature functions returning a ``(train_features, test_features)`` pair.

    Returns
    -------
    (DataFrame, DataFrame)
        De-duplicated train and test feature frames.
    """
    tr_fs, te_fs = process_base_feature(tr_df, te_df)

    print('begin base: %d %d' % (len(tr_fs), len(te_fs)))
    for proc_func in proc_func_list:
        # `__name__` exists on Python 2 and 3; `func_name` is Python 2 only.
        func_name = proc_func.__name__
        clear_output()
        print('processing ' + func_name + ' ...')
        tr_f, te_f = proc_func(tr_df, te_df, st_df, all_df)
        clear_output()
        print('merging %s: %d %d' % (func_name, len(tr_f), len(te_f)))
        tr_fs = tr_fs.merge(tr_f, how='left')
        te_fs = te_fs.merge(te_f, how='left')
    clear_output()
    # Explicit reassignment: a `map` used only for its side effect is lazy on
    # Python 3 and would silently skip the de-duplication.
    tr_fs = tr_fs.drop_duplicates()
    te_fs = te_fs.drop_duplicates()
    print('done features: %d %d' % (len(tr_fs), len(te_fs)))
    return tr_fs, te_fs

In [None]:
# Fast feature groups, in merge order; each entry follows the
# (train_df, test_df, stat_df, all_df) -> (train_features, test_features) contract.
fast_proc_func_list = [
    process_base_combine_feature,
    process_intersection_id,
    process_context_predict_feature,
    process_context_time_feature,
    process_user_ot_feature,
    process_user_item_ot_feature,
    process_item_ot_feature,
    process_shop_ot_feature,
    process_item_property_feature,
    process_shop_score_qcut_feature,
    process_item_prob_feature,
    process_user_prob_feature,
    process_user_item_prob_feature,
]

tr_fs, te_fs = process_train_test_features(
    train_df, test_df, stat_df, all_df, fast_proc_func_list)

In [None]:
# Slow features: the rolling-window counts are expensive, so they are cached on disk.
# When True the cached CSVs are loaded; when False the features are recomputed
# and the cache files are overwritten.
# NOTE(review): `user_old` looks like a typo for `use_old`; left unchanged in
# case other cells reference the name.
user_old = True

if user_old:
    tr_fs_2 = pd.read_csv(cf.train_data_features_2_file_path, index_col=0)
    te_fs_2 = pd.read_csv(cf.test_data_features_2_file_path, index_col=0)
else:
    slow_proc_func_list = [process_context_time_rolling_feature]
    tr_fs_2, te_fs_2 = process_train_test_features(train_df, test_df, stat_df, all_df, slow_proc_func_list)
    tr_fs_2.to_csv(cf.train_data_features_2_file_path)
    te_fs_2.to_csv(cf.test_data_features_2_file_path)

# Drop a positional range of columns (2..28 for train, 1..27 for test) before
# merging the slow features into the fast ones on the remaining shared columns.
# NOTE(review): presumably these are the base-feature columns already present in
# tr_fs / te_fs, and the one-column offset reflects a column that only the train
# frame has - the ranges depend on the cached CSV layout; verify before changing
# process_base_feature.
train_drop_columns = tr_fs_2.iloc[:,2:29].columns.values
test_drop_columns = te_fs_2.iloc[:,1:28].columns.values
tr_fs = tr_fs.merge(tr_fs_2.drop(columns=train_drop_columns), how='left')
te_fs = te_fs.merge(te_fs_2.drop(columns=test_drop_columns), how='left')

In [None]:
# Inspect the final train feature columns and shape; single-argument print()
# gives the same output on Python 2 and Python 3.
print(tr_fs.columns.values)
print(tr_fs.shape)

In [None]:
# Persist the merged train and test feature frames to the paths configured in `cf`.
tr_fs.to_csv(cf.train_data_features_file_path)
te_fs.to_csv(cf.test_data_features_file_path)