In [None]:
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot

# Apply the jupyterthemes plotting style (presumably restyles matplotlib
# figures to match the active notebook theme -- confirm against jupyterthemes).
jtplot.style()
# Raise pandas' display limits so long/wide frames are not truncated in output.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 200

# NOTE(review): `display` and `HTML` are also exported by `IPython.display`;
# newer IPython releases may no longer expose `display` here -- confirm.
from IPython.core.display import display, HTML
# Inject CSS so the notebook container uses 96% of the browser width.
display(HTML("<style>.container { width:96% !important; }</style>"))

In [None]:
# Load the train and test tables; the paths come from the project config
# module and both files are read as space-separated text.
train_df = pd.read_csv(cf.round1_train_file_path, sep=' ')
test_df = pd.read_csv(cf.round1_test_file_path, sep=' ')

# Give every distinct `item_category_list` string a dense integer id.
# Train values are enumerated first (in order of first appearance), so they
# receive the same ids a train-only enumeration would give; strings that
# occur only in the test table are numbered after them.
category_df = pd.concat(
    [train_df['item_category_list'], test_df['item_category_list']],
    ignore_index=True,
).unique()
category_ids = pd.DataFrame({
    'item_category_list': category_df,
    'item_category_id': np.arange(len(category_df)),
})

# Left merges keep every input row in its original order.  An inner merge
# against train-only ids would silently drop test rows whose category list
# never occurs in the training table.
train_df = train_df.merge(category_ids, on='item_category_list', how='left')
test_df = test_df.merge(category_ids, on='item_category_list', how='left')

In [None]:
def cast_to(proc_func, train_df, test_df):
    """Run ``proc_func`` independently on each split.

    Args:
        proc_func: Callable taking a single frame and returning its result.
        train_df: Input passed to ``proc_func`` first.
        test_df: Input passed to ``proc_func`` second.

    Returns:
        Tuple ``(proc_func(train_df), proc_func(test_df))``.
    """
    return proc_func(train_df), proc_func(test_df)

def apply_to(proc_func, train_df, test_df, on=None):
    """Compute features on the train split and attach them to both splits.

    ``proc_func`` is evaluated on ``train_df`` only; its result is then merged
    (pandas default merge: inner join on the shared column names) into each
    split, and duplicate rows are removed from both results.

    Args:
        proc_func: Callable mapping ``train_df`` to a feature frame.
        train_df: Frame the features are computed from.
        test_df: Frame that receives the train-derived features.
        on: Optional list of columns; when truthy, each split is reduced to
            these columns before merging.

    Returns:
        Tuple ``(train_features, test_features)`` of de-duplicated frames.
    """
    feature_table = proc_func(train_df)
    # Either merge the key columns only, or the complete frames.
    left_frames = (train_df[on], test_df[on]) if on else (train_df, test_df)
    tr_df, te_df = [left.merge(feature_table) for left in left_frames]
    return tr_df.drop_duplicates(), te_df.drop_duplicates()

> 建立基础特征数据

In [None]:
def gen_base_feature(df):
    """Select the raw columns that are used as-is.

    The ``is_trade`` column is placed first when ``df`` has it; it is followed
    by the identifier columns, the item attribute ids and finally the
    item/user/context/shop level and score columns.

    Args:
        df: Frame containing (at least) the columns listed below.

    Returns:
        ``df`` restricted to the selected columns, in the order given here.
    """
    label_columns = ['is_trade'] if 'is_trade' in df else []
    id_columns = ['instance_id', 'item_id', 'user_id', 'context_id', 'shop_id']
    item_attr_columns = ['item_brand_id', 'item_city_id', 'item_category_id']
    level_columns = [
        'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level',
        'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level',
        'context_page_id', 'shop_review_num_level', 'shop_star_level',
        'shop_review_positive_rate', 'shop_score_service', 'shop_score_delivery',
        'shop_score_description',
    ]
    return df[label_columns + id_columns + item_attr_columns + level_columns]

def process_base_feature(train_df, test_df):
    """Apply ``gen_base_feature`` to the train split, then to the test split.

    Returns:
        Tuple ``(train_base_features, test_base_features)``.
    """
    return gen_base_feature(train_df), gen_base_feature(test_df)
    
# train_base_ft, test_base_ft = process_base_feature(train_df, test_df)

> 建立用户特征数据

In [None]:
def gen_user_item_feature(df):
    """Build per-user trade statistics at four granularities.

    For each of user, (user, item), (user, brand) and (user, category) the
    function sums ``is_trade`` and counts rows; for the three pair-level
    groupings it also derives the ratio of the two.

    Args:
        df: Frame with columns ``user_id``, ``item_id``, ``item_brand_id``,
            ``item_category_id`` and ``is_trade``.

    Returns:
        One row per distinct (user_id, item_id, item_brand_id,
        item_category_id) combination, followed by the columns
        ``u_trade_cnt``, ``user_cnt``, ``u_item_trade_cnt``, ``u_item_cnt``,
        ``u_item_rate``, ``u_brand_trade_cnt``, ``u_brand_cnt``,
        ``u_brand_rate``, ``u_cate_trade_cnt``, ``u_cate_cnt``, ``u_cate_rate``.
    """
    key_columns = ['user_id', 'item_id', 'item_brand_id', 'item_category_id']
    user_item_df = df[key_columns + ['is_trade']]

    def _trade_stats(keys, trade_col, cnt_col, rate_col=None):
        # One row per distinct `keys` combination: summed is_trade, row count
        # and, when `rate_col` is given, their ratio.
        stats = (
            user_item_df[keys + ['is_trade']]
            .rename(columns={'is_trade': trade_col})
            .assign(**{cnt_col: 1})
            .groupby(keys, as_index=False)
            .sum()
        )
        if rate_col is not None:
            stats[rate_col] = stats[trade_col] / stats[cnt_col]
        return stats

    # Per-user totals.
    # NOTE(review): unlike the pair-level tables below, no per-user rate column
    # is derived here -- confirm whether that is intentional.
    user_rate = _trade_stats(['user_id'], 'u_trade_cnt', 'user_cnt')
    # Trade rate per (user, item), (user, brand) and (user, category).
    user_item_rate = _trade_stats(
        ['user_id', 'item_id'], 'u_item_trade_cnt', 'u_item_cnt', 'u_item_rate')
    user_brand_rate = _trade_stats(
        ['user_id', 'item_brand_id'], 'u_brand_trade_cnt', 'u_brand_cnt', 'u_brand_rate')
    user_cate_rate = _trade_stats(
        ['user_id', 'item_category_id'], 'u_cate_trade_cnt', 'u_cate_cnt', 'u_cate_rate')

    # Start from the distinct key combinations and attach each statistics table;
    # every merge joins on whichever key columns the two frames share.
    user_features = user_item_df.drop(columns=['is_trade']).drop_duplicates()
    for stats in (user_rate, user_item_rate, user_brand_rate, user_cate_rate):
        user_features = user_features.merge(stats)
    return user_features

def process_user_item_feature(train_df, test_df):
    """Attach the user/item trade statistics to both splits.

    The statistics come from ``gen_user_item_feature`` and are merged through
    ``apply_to`` using the user, item, brand and category id columns.
    """
    key_columns = ['user_id', 'item_id', 'item_brand_id', 'item_category_id']
    return apply_to(gen_user_item_feature, train_df, test_df, on=key_columns)

# train_user_ft, test_user_ft = process_user_item_feature(train_df, test_df)

> 建立商品特征数据

In [None]:
def gen_item_feature(df):
    """Build trade statistics per item, brand, category and city.

    For every grouping key the function sums ``is_trade``, counts rows and
    divides the two, then attaches the result to the distinct
    (item, brand, category, city) combinations found in ``df``.

    Args:
        df: Frame with columns ``item_id``, ``item_brand_id``,
            ``item_category_id``, ``item_city_id`` and ``is_trade``.

    Returns:
        Frame with the four id columns followed by ``<prefix>_trade_cnt``,
        ``<prefix>_cnt`` and ``<prefix>_rate`` for the prefixes ``item``,
        ``brand``, ``cate`` and ``city`` (in that order).
    """
    item_df = df[['item_id', 'item_brand_id', 'item_category_id', 'item_city_id', 'is_trade']]

    # (grouping key, trade-count column, row-count column, rate column)
    groupings = (
        ('item_id', 'item_trade_cnt', 'item_cnt', 'item_rate'),
        ('item_brand_id', 'brand_trade_cnt', 'brand_cnt', 'brand_rate'),
        ('item_category_id', 'cate_trade_cnt', 'cate_cnt', 'cate_rate'),
        ('item_city_id', 'city_trade_cnt', 'city_cnt', 'city_rate'),
    )

    feature_df = item_df.drop(columns=['is_trade']).drop_duplicates()
    for key, trade_col, cnt_col, rate_col in groupings:
        stats = item_df[[key, 'is_trade']].rename(columns={'is_trade': trade_col})
        stats[cnt_col] = 1  # constant column; its group sum is the row count
        stats = stats.groupby(key, as_index=False).sum()
        stats[rate_col] = stats[trade_col] / stats[cnt_col]
        # The only column shared with feature_df is `key`, so that is the join column.
        feature_df = feature_df.merge(stats)
    return feature_df

def process_item_feature(train_df, test_df):
    """Attach the item trade statistics to both splits.

    The statistics come from ``gen_item_feature`` and are merged through
    ``apply_to`` using the item, brand, category and city id columns.
    """
    key_columns = ['item_id', 'item_brand_id', 'item_category_id', 'item_city_id']
    return apply_to(gen_item_feature, train_df, test_df, on=key_columns)

# Build the item-level trade statistics for both splits.
train_item_ft, test_item_ft = process_item_feature(train_df, test_df)

> 建立上下文特征数据

In [None]:
def predict_category_hit(row):
    """Return the capped position of the first predicted entry matching the item.

    Both ``row['predict_category_property']`` and ``row['item_category_list']``
    are split on ``';'``.  The item's categories are scanned from the second
    one onwards; for the first category that occurs as a substring of any
    predicted entry, the index of the earliest such entry is returned, capped
    at 5 so that all later positions share one value.

    Args:
        row: Mapping (e.g. a DataFrame row) with the string fields
            ``predict_category_property`` and ``item_category_list``.

    Returns:
        int in ``0..5``.  ``0`` is returned both for a match at position 0 and
        when nothing matches.
    """
    predicted_entries = row['predict_category_property'].split(';')
    item_categories = row['item_category_list'].split(';')

    # Skip the first (top-level) category and start with the second level.
    for category in item_categories[1:]:
        for position, entry in enumerate(predicted_entries):
            # NOTE(review): this is a substring test against the whole entry,
            # so it can also match text inside the entry that is not the
            # category id itself -- confirm against the data format.
            if category in entry:
                # Merge sparse high positions into a single bucket.
                return min(position, 5)

    # NOTE(review): "no match" is indistinguishable from "match at position 0";
    # a distinct sentinel may be preferable, but would change the feature values.
    return 0

def gen_context_time_feature(df):
    """Derive the hour of day from the context timestamp.

    Args:
        df: Frame with columns ``context_id``, ``context_timestamp`` (seconds
            since the Unix epoch) and ``context_page_id``.

    Returns:
        New frame with those three columns plus ``context_hour``; ``df``
        itself is left untouched.
    """
    # Work on an explicit copy so the new column is never written through a
    # slice of the caller's frame (avoids pandas' SettingWithCopy ambiguity).
    context_df = df[['context_id', 'context_timestamp', 'context_page_id']].copy()

    context_datetime = pd.to_datetime(context_df['context_timestamp'], unit='s')
    # NOTE(review): the timestamps are converted without a timezone, so the
    # hour is the UTC hour -- shift it if a local hour of day is intended.
    context_df['context_hour'] = context_datetime.dt.hour

    return context_df

def gen_context_predict_feature(df):
    """Compute the category-prediction hit feature for every row.

    ``predict_category_hit`` is applied row-wise to the
    ``item_category_list`` / ``predict_category_property`` pair and the result
    is joined, by index, onto the ``context_id`` column.

    Returns:
        Frame with columns ``context_id`` and ``category_predict_hit``.
    """
    hit_inputs = df[['item_category_list', 'predict_category_property']]
    hits = hit_inputs.apply(predict_category_hit, axis=1)
    # The series name becomes the column name in the join below.
    hits.name = 'category_predict_hit'
    context_ids = df[['context_id']]
    return context_ids.join(hits)

def process_context_time_feature(train_df, test_df):
    """Compute the context time features separately for each split.

    Returns:
        Tuple ``(train_features, test_features)`` as produced by ``cast_to``.
    """
    generator = gen_context_time_feature
    return cast_to(generator, train_df, test_df)

def process_context_predict_feature(train_df, test_df):
    """Compute the category-prediction hit feature separately for each split.

    Returns:
        Tuple ``(train_features, test_features)`` as produced by ``cast_to``.
    """
    generator = gen_context_predict_feature
    return cast_to(generator, train_df, test_df)

# train_item_ct1, test_item_ct1 = process_context_time_feature(train_df, test_df)
# train_item_ct2, test_item_ct2 = process_context_predict_feature(train_df, test_df)
# TODO: 建立上下文预测属性数据

> 建立店铺特征

In [None]:
def gen_shop_feature(df):
    """Build per-shop trade totals.

    Args:
        df: Frame with columns ``shop_id`` and ``is_trade``.

    Returns:
        One row per distinct shop with ``shop_id``, ``s_trade_cnt`` (sum of
        ``is_trade``) and ``shop_cnt`` (number of rows for that shop).
    """
    shop_df = df[['shop_id', 'is_trade']]

    # Summing a constant 1 alongside is_trade gives the per-shop row count.
    shop_rate = (
        shop_df
        .rename(columns={'is_trade': 's_trade_cnt'})
        .assign(shop_cnt=1)
        .groupby('shop_id', as_index=False)
        .sum()
    )

    # Join the totals back onto the rows, then collapse to one row per shop.
    merged = shop_df.merge(shop_rate)
    per_shop = merged.drop(columns=['is_trade'])
    return per_shop.drop_duplicates()
    

def process_shop_score_qcut_feature(train_df, test_df):
    """Discretise the shop score columns into quantile bins shared by both splits.

    Bin edges are the quantiles of the train and test values combined, so a
    given score falls into the same bin in either split.  Duplicate edges are
    collapsed, which means a column may end up with fewer bins than the
    requested number of quantiles.

    Args:
        train_df: Frame with ``shop_id``, ``shop_review_positive_rate``,
            ``shop_score_service``, ``shop_score_delivery`` and
            ``shop_score_description``.
        test_df: Frame with the same columns.

    Returns:
        Tuple ``(train_shop, test_shop)``: the five input columns plus one
        integer ``<column>_qcut`` column per score (0-based bin index), with
        duplicate rows removed.  The input frames are not modified.

    Raises:
        ValueError: if a score column contains missing values, because the
            resulting bin index cannot be converted to ``int``.
    """
    # (score column, requested number of quantiles)
    score_quantiles = (
        ('shop_review_positive_rate', 24),
        ('shop_score_service', 11),
        ('shop_score_delivery', 11),
        ('shop_score_description', 11),
    )
    columns = ['shop_id'] + [column for column, _ in score_quantiles]

    # Explicit copies: the new columns must not be written through slices of
    # the callers' frames.
    tr_shop_df = train_df[columns].copy()
    te_shop_df = test_df[columns].copy()
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    a_shop_df = pd.concat([tr_shop_df, te_shop_df], ignore_index=True)

    for column, n_quantiles in score_quantiles:
        quantile_points = np.linspace(0, 1, n_quantiles + 1)
        # np.unique drops repeated edges (heavily tied data produces them).
        edges = np.unique(a_shop_df[column].dropna().quantile(quantile_points).values)
        qcut_column = column + '_qcut'
        for shop_df in (tr_shop_df, te_shop_df):
            if len(edges) < 2:
                # Constant (or empty) column: there is only one possible bin.
                shop_df[qcut_column] = 0
                continue
            # labels=False yields the bin index directly, whatever the number
            # of surviving edges; include_lowest keeps the minimum value in
            # bin 0 instead of turning it into NaN.
            shop_df[qcut_column] = pd.cut(
                shop_df[column], bins=edges, labels=False, include_lowest=True
            ).astype(int)

    return tr_shop_df.drop_duplicates(), te_shop_df.drop_duplicates()

def process_shop_feature(train_df, test_df):
    """Attach the per-shop trade totals to both splits.

    The totals come from ``gen_shop_feature`` and are merged through
    ``apply_to`` using ``shop_id``.
    """
    key_columns = ['shop_id']
    return apply_to(gen_shop_feature, train_df, test_df, on=key_columns)

# train_shop_ct1, test_shop_ct1 = process_shop_score_qcut_feature(train_df, test_df)
# train_shop_ct2, test_shop_ct2 = process_shop_feature(train_df, test_df)