In [None]:
import math
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot
from util import timeit
from joblib import Parallel, delayed


# Apply the jupyterthemes plot style and raise pandas' display limits so that
# long / wide frames are shown without truncation.
jtplot.style()
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 200

# Stretch the notebook container to 96% of the browser width.
# `display` / `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:96% !important; }</style>"))

In [None]:
# Load the round-1 train / test tables (space-separated files located via config).
train_df = pd.read_csv(cf.round1_train_file_path, sep=' ')
test_df = pd.read_csv(cf.round1_test_file_path, sep=' ')

# Map every distinct `item_category_list` string to an integer id.
# Ids are numbered by first appearance in train; lists that occur only in test
# are appended afterwards, so train ids are unchanged and the inner merges
# below cannot silently drop test rows with an unseen category list.
category_df = pd.concat([train_df['item_category_list'], test_df['item_category_list']]).unique()
category_ids = pd.DataFrame({'item_category_list': category_df, 'item_category_id': np.arange(len(category_df))})
train_df = train_df.merge(category_ids, on='item_category_list')
test_df = test_df.merge(category_ids, on='item_category_list')

# Shift the raw timestamps by +8 hours and -365 days (in seconds) before converting.
# NOTE(review): presumably UTC+8 local time plus a one-year shift -- confirm the intent.
time_offset = 8 * 60 * 60 - 365 * 24 * 60 * 60
train_df['context_datetime'] = pd.to_datetime(train_df['context_timestamp'] + time_offset, unit='s')
test_df['context_datetime'] = pd.to_datetime(test_df['context_timestamp'] + time_offset, unit='s')
# Day of month of the shifted datetime; used as the day key by the feature builders.
train_df['context_day'] = train_df['context_datetime'].dt.day
test_df['context_day'] = test_df['context_datetime'].dt.day

# Frame the statistical (trade-rate) features are computed from.
stat_df = train_df

In [None]:
def min_max_normalize(df, name_list):
    """Rescale the listed columns of ``df`` with a +1-smoothed min-max formula.

    Every value ``x`` of a column becomes ``(x - min + 1) / (max - min + 1)``,
    where ``min`` / ``max`` are that column's extremes. Because of the +1 terms
    a constant column maps to 1.0 and the column minimum maps to
    ``1 / (max - min + 1)`` rather than 0.

    Args:
        df: DataFrame to normalize; it is modified in place.
        name_list: iterable of column names (a one-shot iterator is fine, it is
            walked exactly once).

    Returns:
        The same ``df`` object, for call chaining.
    """
    for name in name_list:
        column = df[name]
        lowest = column.min()
        span = float(column.max() - lowest + 1)
        # Vectorized, and the column is replaced rather than written through
        # `.loc[:, name]`, so an integer column can become float without an
        # in-place dtype upcast.
        df[name] = (column - lowest + 1) / span
    return df

def min_max_normalize_log(df, name_list):
    """Log-scale the listed columns of ``df`` relative to each column's maximum.

    Every value ``x`` of a column becomes ``log(x + 1) / log(max + 1)``.
    Despite the name, only the column maximum is used (the minimum never was).
    A column whose maximum is 0 has a zero denominator and therefore yields
    NaN for its zero entries.

    Args:
        df: DataFrame to normalize; it is modified in place.
        name_list: iterable of column names (walked exactly once).

    Returns:
        The same ``df`` object, for call chaining.
    """
    for name in name_list:
        column = df[name]
        # Vectorized; replacing the column lets an integer column become float.
        df[name] = np.log(column + 1) / np.log(column.max() + 1)
    return df

def normalize_log(df, name_list):
    """Replace each listed column of ``df`` with ``log(x + 1)``.

    Args:
        df: DataFrame to transform; it is modified in place.
        name_list: iterable of column names (walked exactly once).

    Returns:
        The same ``df`` object, for call chaining.
    """
    for name in name_list:
        # Vectorized; replacing the column lets an integer column become float.
        df[name] = np.log(df[name] + 1)
    return df

> 建立基础特征数据

In [None]:
def gen_base_feature(df):
    """Return the base-feature columns of ``df`` in a fixed order.

    The ``is_trade`` label comes first when ``df`` has it, followed by the
    identifier / context columns and then the level, score and rate columns.
    """
    label_columns = ['is_trade'] if 'is_trade' in df else []
    key_columns = [
        'instance_id', 'user_id', 'context_id', 'context_timestamp', 'context_day', 'item_property_list',
        'item_id', 'shop_id', 'item_brand_id', 'item_city_id', 'item_category_id',
    ]
    attribute_columns = [
        'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level',
        'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level',
        'context_page_id', 'shop_review_num_level', 'shop_star_level',
        'shop_review_positive_rate', 'shop_score_service', 'shop_score_delivery', 'shop_score_description',
    ]
    return df[label_columns + key_columns + attribute_columns]

def process_base_feature(train_df, test_df, stat_df=None):
    """Build the base-feature frames for train and test.

    ``stat_df`` is accepted for signature parity with the other ``process_*``
    functions and is not used.

    Returns:
        ``(train_features, test_features)`` as produced by ``gen_base_feature``.
    """
    return gen_base_feature(train_df), gen_base_feature(test_df)
    
# train_base_ft, test_base_ft = process_base_feature(train_df, test_df)

> 建立用户特征数据

In [None]:
def gen_user_feature(df, extend_days):
    """Build per-user, per-day trade statistics with the target day held out.

    For every day in ``df['context_day']`` plus ``extend_days``, the rows of all
    *other* days are aggregated per user into:

    * ``u_trade_cnt`` - sum of ``is_trade``
    * ``user_cnt``    - number of rows
    * ``user_rate``   - ``u_trade_cnt * 100 / user_cnt``

    Days that do not occur in ``df`` have nothing held out, so their ``*_cnt``
    columns are scaled by ``(n - 1) / n`` (``n`` = number of distinct days in
    ``df``) to stay comparable with the held-out days.

    Args:
        df: frame with ``user_id``, ``context_day`` and ``is_trade`` columns.
            The statistics are computed from this argument (previously the
            module-level ``stat_df`` was read instead, ignoring ``df``).
        extend_days: iterable of additional days to produce features for.

    Returns:
        DataFrame with columns ``user_id``, ``context_day``, ``u_trade_cnt``,
        ``user_cnt``, ``user_rate``; one row per (user, day) combination.
    """
    stat_user_df = df[['user_id', 'context_day', 'is_trade']]

    feature_frames = []
    stat_days = set(stat_user_df['context_day'].unique())
    extend_days = set(extend_days)
    for day in stat_days | extend_days:
        # Hold out the target day so a row never contributes to its own feature.
        user_df = stat_user_df.loc[stat_user_df['context_day'] != day]

        # Per-user trade count, row count and trade rate.
        user_rate = user_df[['user_id', 'is_trade']].rename(columns={'is_trade': 'u_trade_cnt'})
        user_rate['user_cnt'] = 1
        user_rate = user_rate.groupby('user_id', as_index=False).sum()
        user_rate['user_rate'] = user_rate['u_trade_cnt'] * 100 / user_rate['user_cnt']

        user_features = user_df.drop(columns=['is_trade']).drop_duplicates().merge(user_rate)
        user_features['context_day'] = day
        if day not in stat_days:
            # A real list: a `filter` object is a single-use iterator on Python 3
            # and would be empty by the time it is used on the left-hand side.
            cnt_columns = [name for name in user_features.columns if name.endswith('_cnt')]
            user_features[cnt_columns] = user_features[cnt_columns] * (len(stat_days) - 1) / len(stat_days)

        feature_frames.append(user_features)

    # Overwriting `context_day` produces repeated rows inside each day's frame.
    return pd.concat(feature_frames).drop_duplicates()
    
    
def process_user_feature(train_df, test_df, stat_df):
    """Attach the held-out per-user trade statistics to train and test.

    The statistics are built by ``gen_user_feature`` from ``stat_df`` for every
    day in ``stat_df`` plus the days present in ``test_df``, then joined onto
    the ``user_id`` / ``context_day`` pairs of each frame (inner merge on the
    shared columns).

    Returns:
        ``[train_features, test_features]``. A list is returned (what Python 2
        ``map`` produced) instead of a Python 3 ``map`` object, which is lazy
        and can be consumed only once.
    """
    stat_user_df = gen_user_feature(stat_df, test_df['context_day'].unique())
    base_columns = ['user_id', 'context_day']
    return [frame[base_columns].merge(stat_user_df).drop_duplicates() for frame in (train_df, test_df)]

# train_user_ct2, test_user_ct2 = process_user_feature(train_df, test_df, stat_df)

> 建立用户-商品特征数据

In [None]:
def gen_user_item_feature(stat_df, extend_days):
    """Build per-day user x item / brand / category trade statistics, target day held out.

    For every day in ``stat_df['context_day']`` plus ``extend_days``, the rows
    of all *other* days are aggregated per ``(user_id, key)`` for ``key`` in
    ``item_id``, ``item_brand_id`` and ``item_category_id``. Each aggregation
    yields ``<prefix>_trade_cnt`` (sum of ``is_trade``), ``<prefix>_cnt`` (row
    count) and ``<prefix>_rate`` (``trade_cnt * 100 / cnt``) with prefixes
    ``u_item``, ``u_brand`` and ``u_cate``.

    Days that do not occur in ``stat_df`` have nothing held out, so their
    ``*_cnt`` columns are scaled by ``(n - 1) / n`` (``n`` = number of distinct
    days in ``stat_df``).

    Args:
        stat_df: frame with ``user_id``, ``item_id``, ``item_brand_id``,
            ``item_category_id``, ``context_day`` and ``is_trade`` columns.
        extend_days: iterable of additional days to produce features for.

    Returns:
        DataFrame keyed by user, item, brand, category and ``context_day`` with
        the nine statistic columns described above.
    """
    stat_user_item_df = stat_df[['user_id', 'item_id', 'item_brand_id', 'item_category_id', 'context_day', 'is_trade']]
    # (grouping key next to user_id, column-name prefix)
    rate_specs = [('item_id', 'u_item'), ('item_brand_id', 'u_brand'), ('item_category_id', 'u_cate')]

    def _trade_rate(frame, key, prefix):
        # One row per (user_id, key): trade count, row count, trades per 100 rows.
        trade_col, cnt_col, rate_col = prefix + '_trade_cnt', prefix + '_cnt', prefix + '_rate'
        rate = frame[['user_id', key, 'is_trade']].rename(columns={'is_trade': trade_col})
        rate[cnt_col] = 1
        rate = rate.groupby(['user_id', key], as_index=False).sum()
        rate[rate_col] = rate[trade_col] * 100 / rate[cnt_col]
        return rate

    feature_frames = []
    stat_days = set(stat_user_item_df['context_day'].unique())
    extend_days = set(extend_days)
    for day in stat_days | extend_days:
        # Hold out the target day so a row never contributes to its own feature.
        user_item_df = stat_user_item_df.loc[stat_user_item_df['context_day'] != day]

        user_item_features = user_item_df.drop(columns=['is_trade']).drop_duplicates()
        for key, prefix in rate_specs:
            # Each merge joins on the columns shared with the rate table: user_id + key.
            user_item_features = user_item_features.merge(_trade_rate(user_item_df, key, prefix))
        user_item_features['context_day'] = day

        if day not in stat_days:
            # A real list: a `filter` object is a single-use iterator on Python 3
            # and would be empty by the time it is used on the left-hand side.
            cnt_columns = [name for name in user_item_features.columns if name.endswith('_cnt')]
            user_item_features[cnt_columns] = user_item_features[cnt_columns] * (len(stat_days) - 1) / len(stat_days)

        feature_frames.append(user_item_features)

    # Overwriting `context_day` produces repeated rows inside each day's frame.
    return pd.concat(feature_frames).drop_duplicates()

def process_user_item_feature(train_df, test_df, stat_df):
    """Attach the held-out user x item / brand / category statistics to train and test.

    The statistics are built by ``gen_user_item_feature`` from ``stat_df`` for
    every day in ``stat_df`` plus the days present in ``test_df``, then joined
    onto the key columns of each frame (inner merge on the shared columns).

    Returns:
        ``[train_features, test_features]``. A list is returned (what Python 2
        ``map`` produced) instead of a Python 3 ``map`` object, which is lazy
        and can be consumed only once.
    """
    stat_user_item_df = gen_user_item_feature(stat_df, test_df['context_day'].unique())
    base_columns = ['user_id', 'item_id', 'item_brand_id', 'item_category_id', 'context_day']
    return [frame[base_columns].merge(stat_user_item_df).drop_duplicates() for frame in (train_df, test_df)]

# train_user_ft, test_user_ft = process_user_item_feature(train_df, test_df, stat_df)

> 建立商品特征数据

In [None]:
def gen_item_feature(stat_df, extend_days):
    """Build per-day item / brand / category / city trade statistics, target day held out.

    For every day in ``stat_df['context_day']`` plus ``extend_days``, the rows
    of all *other* days are aggregated per ``item_id``, ``item_brand_id``,
    ``item_category_id`` and ``item_city_id``. Each aggregation yields
    ``<prefix>_trade_cnt`` (sum of ``is_trade``), ``<prefix>_cnt`` (row count)
    and ``<prefix>_rate`` (``trade_cnt * 100 / cnt``) with prefixes ``item``,
    ``brand``, ``cate`` and ``city``.

    Days that do not occur in ``stat_df`` have nothing held out, so their
    ``*_cnt`` columns are first scaled by ``(n - 1) / n`` (``n`` = number of
    distinct days in ``stat_df``). All ``*_cnt`` columns of every day's frame
    are then passed through ``min_max_normalize_log``.

    Args:
        stat_df: frame with ``item_id``, ``item_brand_id``, ``item_category_id``,
            ``item_city_id``, ``context_day`` and ``is_trade`` columns.
        extend_days: iterable of additional days to produce features for.

    Returns:
        DataFrame keyed by item, brand, category, city and ``context_day`` with
        the twelve statistic columns described above.
    """
    stat_item_df = stat_df[['item_id', 'item_brand_id', 'item_category_id', 'item_city_id', 'context_day', 'is_trade']]
    # (grouping key, column-name prefix)
    rate_specs = [('item_id', 'item'), ('item_brand_id', 'brand'),
                  ('item_category_id', 'cate'), ('item_city_id', 'city')]

    def _trade_rate(frame, key, prefix):
        # One row per key value: trade count, row count, trades per 100 rows.
        trade_col, cnt_col, rate_col = prefix + '_trade_cnt', prefix + '_cnt', prefix + '_rate'
        rate = frame[[key, 'is_trade']].rename(columns={'is_trade': trade_col})
        rate[cnt_col] = 1
        rate = rate.groupby(key, as_index=False).sum()
        rate[rate_col] = rate[trade_col] * 100 / rate[cnt_col]
        return rate

    feature_frames = []
    stat_days = set(stat_item_df['context_day'].unique())
    extend_days = set(extend_days)
    for day in stat_days | extend_days:
        # Hold out the target day so a row never contributes to its own feature.
        item_df = stat_item_df.loc[stat_item_df['context_day'] != day]

        item_features = item_df.drop(columns=['is_trade']).drop_duplicates()
        for key, prefix in rate_specs:
            # Each merge joins on the single column shared with the rate table.
            item_features = item_features.merge(_trade_rate(item_df, key, prefix))
        item_features['context_day'] = day

        # A real list: the names are needed by both the scaling step and the
        # normalizer, and a `filter` object can be walked only once on Python 3.
        cnt_columns = [name for name in item_features.columns if name.endswith('_cnt')]
        if day not in stat_days:
            item_features[cnt_columns] = item_features[cnt_columns] * (len(stat_days) - 1) / len(stat_days)
        item_features = min_max_normalize_log(item_features, cnt_columns)

        feature_frames.append(item_features)

    # Overwriting `context_day` produces repeated rows inside each day's frame.
    return pd.concat(feature_frames).drop_duplicates()

def gen_item_property_feature(df):
    """Split the first three ';'-separated item properties into their own columns.

    Works on the distinct ``item_property_list`` values of ``df`` and adds
    ``property_0`` .. ``property_2``; a position the list does not have is
    filled with a single space.
    """
    prop_item_df = df[['item_property_list']].drop_duplicates()

    def _picker(position):
        # Returns a function extracting the `position`-th property of a raw list string.
        def _pick(raw):
            parts = raw.split(";")
            return parts[position] if position < len(parts) else " "
        return _pick

    for position in range(3):
        column = 'property_%d' % position
        prop_item_df.loc[:, column] = prop_item_df.loc[:, 'item_property_list'].apply(_picker(position))
    return prop_item_df

def process_item_feature(train_df, test_df, stat_df):
    """Attach the held-out item / brand / category / city statistics to train and test.

    The statistics are built by ``gen_item_feature`` from ``stat_df`` for every
    day in ``stat_df`` plus the days present in ``test_df``, then joined onto
    the key columns of each frame (inner merge on the shared columns).

    Returns:
        ``[train_features, test_features]``. A list is returned (what Python 2
        ``map`` produced) instead of a Python 3 ``map`` object, which is lazy
        and can be consumed only once.
    """
    stat_item_df = gen_item_feature(stat_df, test_df['context_day'].unique())
    base_columns = ['item_id', 'item_brand_id', 'item_category_id', 'item_city_id', 'context_day']
    return [frame[base_columns].merge(stat_item_df).drop_duplicates() for frame in (train_df, test_df)]

def process_item_property_feature(train_df, test_df, stat_df = None):
    """Build the item-property columns for train and test.

    ``stat_df`` is accepted for signature parity with the other ``process_*``
    functions and is not used.

    Returns:
        ``[train_features, test_features]`` as produced by
        ``gen_item_property_feature``. A list is returned (what Python 2 ``map``
        produced) instead of a lazy, single-use Python 3 ``map`` object.
    """
    return [gen_item_property_feature(frame) for frame in (train_df, test_df)]

# train_item_ft, test_item_ft = process_item_feature(train_df, test_df, stat_df)
# train_item_ft, test_item_ft = process_item_property_feature(train_df, test_df, stat_df)

> 建立上下文特征数据

In [None]:
def predict_category_hit(row):
    """Position at which one of the item's categories shows up in the predicted list.

    ``row['item_category_list']`` and ``row['predict_category_property']`` are
    both split on ';'. Skipping the first (top-level) category, the item's
    categories are tried in order; for the first one that occurs as a substring
    of any prediction entry, the index of the earliest such entry is returned,
    capped at 5 so that rare late positions share one value.

    Returns 0 when nothing matches, which is the same value as a match at
    index 0.
    """
    predictions = row['predict_category_property'].split(';')
    categories = row['item_category_list'].split(';')
    # Start with the second-level category.
    for category in categories[1:]:
        for position, entry in enumerate(predictions):
            if category in entry:
                # Combine the sparse tail into a single bucket.
                return min(position, 5)
    return 0

def gen_context_time_feature(df):
    """Derive time-of-day and query-volume features for every context row.

    Adds to the context columns of ``df``:

    * ``context_hour`` / ``context_week`` / ``context_minute`` taken from
      ``context_datetime``, ``context_tmhour`` (hour plus minute fraction) and
      its sine / cosine over a 24-hour period shifted by 12 hours;
    * per-user and per-item row counts by day and by (day, hour), passed
      through ``min_max_normalize_log``;
    * per-day row counts by hour, passed through ``min_max_normalize``
      separately for each day.

    ``context_datetime`` and ``context_timestamp`` are dropped from the result
    and duplicate rows are removed.
    """
    def _cnt_names(frame):
        # Names of the count columns of `frame`.
        return [col for col in frame.columns.values if col.endswith('_cnt')]

    def _count_rows(frame, keys, cnt_name):
        # Number of rows of `frame` per distinct combination of `keys`.
        counted = frame[keys]
        counted.loc[:, cnt_name] = 1
        return counted.groupby(keys, as_index=False).sum()

    context_df = df[['user_id', 'item_id', 'context_id', 'context_timestamp', 'context_datetime', 'context_page_id', 'context_day']]

    # Clock-based features.
    stamps = context_df.loc[:, 'context_datetime']
    context_df.loc[:, 'context_hour'] = stamps.map(lambda ts: ts.hour)
    context_df.loc[:, 'context_week'] = stamps.map(lambda ts: ts.weekday())
    context_df.loc[:, 'context_minute'] = stamps.map(lambda ts: ts.minute)
    context_df.loc[:, 'context_tmhour'] = context_df.loc[:, 'context_hour'] + context_df.loc[:, 'context_minute'] / 60
    tmhour = context_df.loc[:, 'context_tmhour']
    context_df.loc[:, 'context_tmhour_sin'] = tmhour.map(lambda x: math.sin((x-12)/24*2*math.pi))
    context_df.loc[:, 'context_tmhour_cos'] = tmhour.map(lambda x: math.cos((x-12)/24*2*math.pi))

    # Per-user query volume, by day and by hour of day.
    user_query_day = _count_rows(context_df, ['user_id', 'context_day'], 'u_day_query_cnt')
    user_query_hour = _count_rows(context_df, ['user_id', 'context_day', 'context_hour'], 'u_hour_query_cnt')
    user_query_features = user_query_hour.merge(user_query_day)
    user_query_features = min_max_normalize_log(user_query_features, _cnt_names(user_query_features))

    # Per-item query volume, by day and by hour of day.
    item_query_day = _count_rows(context_df, ['item_id', 'context_day'], 'i_day_query_cnt')
    item_query_hour = _count_rows(context_df, ['item_id', 'context_day', 'context_hour'], 'i_hour_query_cnt')
    item_query_features = item_query_hour.merge(item_query_day)
    item_query_features = min_max_normalize_log(item_query_features, _cnt_names(item_query_features))

    # Overall hourly query volume, normalized within each day.
    daily_frames = []
    for day, day_df in context_df.groupby('context_day', as_index=False):
        hourly = _count_rows(day_df, ['context_hour'], 'hour_query_cnt')
        hourly.loc[:, 'context_day'] = day
        hourly = hourly.drop_duplicates()
        hourly = min_max_normalize(hourly, _cnt_names(hourly))
        daily_frames.append(hourly)
    query_features = pd.concat(daily_frames).drop_duplicates()

    merged = context_df.merge(user_query_features)
    merged = merged.merge(item_query_features)
    merged = merged.merge(query_features)
    merged = merged.drop(columns=['context_datetime', 'context_timestamp'])
    return merged.drop_duplicates()

def gen_context_predict_feature(df):
    """Return ``context_id`` together with its ``category_predict_hit`` value.

    ``predict_category_hit`` is applied row by row to the
    ``item_category_list`` / ``predict_category_property`` columns and the
    result is joined back onto ``context_id`` by index.
    """
    hits = df[['item_category_list', 'predict_category_property']].apply(predict_category_hit, axis=1)
    hits.name = 'category_predict_hit'
    return df[['context_id']].join(hits)

def process_context_time_feature(train_df, test_df, stat_df=None):
    """Build the context time features for train and test.

    ``stat_df`` is accepted for signature parity with the other ``process_*``
    functions and is not used.

    Returns:
        ``[train_features, test_features]`` as produced by
        ``gen_context_time_feature``. A list is returned (what Python 2 ``map``
        produced) instead of a lazy, single-use Python 3 ``map`` object.
    """
    return [gen_context_time_feature(frame) for frame in (train_df, test_df)]

def process_context_predict_feature(train_df, test_df, stat_df=None):
    """Build the predicted-category hit feature for train and test.

    ``stat_df`` is accepted for signature parity with the other ``process_*``
    functions and is not used.

    Returns:
        ``[train_features, test_features]`` as produced by
        ``gen_context_predict_feature``. A list is returned (what Python 2
        ``map`` produced) instead of a lazy, single-use Python 3 ``map`` object.
    """
    return [gen_context_predict_feature(frame) for frame in (train_df, test_df)]

# train_item_ct1, test_item_ct1 = process_context_time_feature(train_df, test_df)
# train_item_ct2, test_item_ct2 = process_context_predict_feature(train_df, test_df)
# TODO: 建立上下文预测属性数据

> 建立店铺特征

In [None]:
def gen_shop_feature(df, extend_days):
    """Build per-shop, per-day trade statistics with the target day held out.

    For every day in ``df['context_day']`` plus ``extend_days``, the rows of all
    *other* days are aggregated per shop into:

    * ``s_trade_cnt`` - sum of ``is_trade``
    * ``shop_cnt``    - number of rows
    * ``shop_rate``   - ``s_trade_cnt * 100 / shop_cnt``

    Days that do not occur in ``df`` have nothing held out, so their ``*_cnt``
    columns are first scaled by ``(n - 1) / n`` (``n`` = number of distinct days
    in ``df``). The ``*_cnt`` columns of every day's frame are then passed
    through ``min_max_normalize_log``.

    Args:
        df: frame with ``shop_id``, ``context_day`` and ``is_trade`` columns.
            The statistics are computed from this argument (previously the
            module-level ``stat_df`` was read instead, ignoring ``df``).
        extend_days: iterable of additional days to produce features for.

    Returns:
        DataFrame with columns ``shop_id``, ``context_day``, ``s_trade_cnt``,
        ``shop_cnt``, ``shop_rate``.
    """
    stat_shop_df = df[['shop_id', 'context_day', 'is_trade']]

    feature_frames = []
    stat_days = set(stat_shop_df['context_day'].unique())
    extend_days = set(extend_days)
    for day in stat_days | extend_days:
        # Hold out the target day so a row never contributes to its own feature.
        shop_df = stat_shop_df.loc[stat_shop_df['context_day'] != day]

        # Per-shop trade count, row count and trade rate.
        shop_rate = shop_df[['shop_id', 'is_trade']].rename(columns={'is_trade': 's_trade_cnt'})
        shop_rate['shop_cnt'] = 1
        shop_rate = shop_rate.groupby('shop_id', as_index=False).sum()
        shop_rate['shop_rate'] = shop_rate['s_trade_cnt'] * 100 / shop_rate['shop_cnt']

        shop_features = shop_df.drop(columns=['is_trade']).drop_duplicates().merge(shop_rate)
        shop_features['context_day'] = day
        # A real list: the names are needed by both the scaling step and the
        # normalizer, and a `filter` object can be walked only once on Python 3.
        cnt_columns = [name for name in shop_features.columns if name.endswith('_cnt')]
        if day not in stat_days:
            shop_features[cnt_columns] = shop_features[cnt_columns] * (len(stat_days) - 1) / len(stat_days)
        shop_features = min_max_normalize_log(shop_features, cnt_columns)

        feature_frames.append(shop_features)

    # Overwriting `context_day` produces repeated rows inside each day's frame.
    return pd.concat(feature_frames).drop_duplicates()
    
    

def process_shop_score_qcut_feature(train_df, test_df, stat_df=None):
    """Quantile-bin the shop score columns consistently across train and test.

    For each score column the bin edges are the quantiles of the combined
    train + test values (24 requested for ``shop_review_positive_rate``, 11 for
    the three ``shop_score_*`` columns; duplicate edges are dropped). Both
    frames are then cut with those shared edges and the integer bin index is
    stored in ``<column>_qcut``.

    The bin index is taken directly from ``pd.cut(..., labels=False)`` so that
    it works for whatever number of bins remains after duplicate edges are
    dropped, and ``include_lowest=True`` keeps the overall minimum inside the
    first bin instead of turning it into NaN.

    ``stat_df`` is accepted for signature parity with the other ``process_*``
    functions and is not used.

    Returns:
        ``(train_features, test_features)``: ``shop_id``, the four raw score
        columns and their four ``*_qcut`` columns, duplicates removed.
    """
    # (score column, number of quantiles requested)
    quantile_plan = [
        ('shop_review_positive_rate', 24),
        ('shop_score_service', 11),
        ('shop_score_delivery', 11),
        ('shop_score_description', 11),
    ]
    base_columns = ['shop_id'] + [column for column, _ in quantile_plan]

    # Copies, so the new columns are not written into slices of the inputs.
    tr_shop_df = train_df[base_columns].copy()
    te_shop_df = test_df[base_columns].copy()
    a_shop_df = pd.concat([tr_shop_df, te_shop_df])

    for column, quantiles in quantile_plan:
        _, bins = pd.qcut(a_shop_df[column], quantiles, retbins=True, duplicates='drop')
        for shop_df in (tr_shop_df, te_shop_df):
            shop_df[column + '_qcut'] = pd.cut(
                shop_df[column], bins=bins, labels=False, include_lowest=True).astype(int)

    return tr_shop_df.drop_duplicates(), te_shop_df.drop_duplicates()

def process_shop_feature(train_df, test_df, stat_df):
    """Attach the held-out per-shop trade statistics to train and test.

    The statistics are built by ``gen_shop_feature`` from ``stat_df`` for every
    day in ``stat_df`` plus the days present in ``test_df``, then joined onto
    the ``shop_id`` / ``context_day`` pairs of each frame (inner merge on the
    shared columns).

    Returns:
        ``[train_features, test_features]``. A list is returned (what Python 2
        ``map`` produced) instead of a Python 3 ``map`` object, which is lazy
        and can be consumed only once.
    """
    stat_shop_df = gen_shop_feature(stat_df, test_df['context_day'].unique())
    base_columns = ['shop_id', 'context_day']
    return [frame[base_columns].merge(stat_shop_df).drop_duplicates() for frame in (train_df, test_df)]

# train_shop_ct1, test_shop_ct1 = process_shop_score_qcut_feature(train_df, test_df)
# train_shop_ct2, test_shop_ct2 = process_shop_feature(train_df, test_df, stat_df)