In [None]:
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot

# Apply the jupyterthemes plotting style to matplotlib figures.
jtplot.style()
# Raise pandas' display limits so long/wide frames are not truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 200

# `IPython.display` is the public module for display/HTML; importing them from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS that widens the notebook container to 96% of the page.
display(HTML("<style>.container { width:96% !important; }</style>"))

In [None]:
# Load the round-1 train/test tables; the files are space-separated.
train_df = pd.read_csv(cf.round1_train_file_path, sep=' ')
test_df = pd.read_csv(cf.round1_test_file_path, sep=' ')

# Give every distinct `item_category_list` string a dense integer id.
# The vocabulary is collected from train first and test second, so categories
# seen in train keep exactly the ids they had when only train was used, while
# categories that occur only in test still receive an id.  With a train-only
# vocabulary the inner merge below silently dropped those test rows.
category_df = pd.concat(
    [train_df['item_category_list'], test_df['item_category_list']]
).unique()
category_ids = pd.DataFrame({
    'item_category_list': category_df,
    'item_category_id': np.arange(len(category_df)),
})
train_df = train_df.merge(category_ids, on='item_category_list')
test_df = test_df.merge(category_ids, on='item_category_list')

> 建立用户特征数据

In [None]:
def _user_trade_stats(frame, keys, trade_col, cnt_col, rate_col):
    """Aggregate `is_trade` per `keys` group into trade count, row count and rate.

    Returns one row per group with the columns `keys + [trade_col, cnt_col,
    rate_col]`, where `rate_col = trade_col / cnt_col`.
    """
    stats = (
        frame[keys + ['is_trade']]
        .rename(columns={'is_trade': trade_col})
        .assign(**{cnt_col: 1})  # a column of ones sums to the group's row count
        .groupby(keys, as_index=False)
        .sum()
    )
    stats[rate_col] = stats[trade_col] / stats[cnt_col]
    return stats


def process_user_item_feature(df):
    """Build per-row user trade statistics at four aggregation levels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `user_id`, `item_id`, `item_brand_id`,
        `item_category_id` and `is_trade`.

    Returns
    -------
    pandas.DataFrame
        The five input columns plus, for each level, the number of trades, the
        number of rows and their ratio:

        * user:            `u_trade_cnt`, `user_cnt`, `u_rate`
        * user x item:     `u_item_trade_cnt`, `u_item_cnt`, `u_item_rate`
        * user x brand:    `u_brand_trade_cnt`, `u_brand_cnt`, `u_brand_rate`
        * user x category: `u_cate_trade_cnt`, `u_cate_cnt`, `u_cate_rate`

        `u_rate` is new: the user-level block previously produced the two
        counts but never the rate that all other levels have.

    Notes
    -----
    The statistics are joined back with inner merges on the grouping keys.
    """
    user_item_df = df[['user_id', 'item_id', 'item_brand_id', 'item_category_id', 'is_trade']]

    # (grouping keys, trade-count column, row-count column, rate column)
    levels = [
        (['user_id'], 'u_trade_cnt', 'user_cnt', 'u_rate'),
        (['user_id', 'item_id'], 'u_item_trade_cnt', 'u_item_cnt', 'u_item_rate'),
        (['user_id', 'item_brand_id'], 'u_brand_trade_cnt', 'u_brand_cnt', 'u_brand_rate'),
        (['user_id', 'item_category_id'], 'u_cate_trade_cnt', 'u_cate_cnt', 'u_cate_rate'),
    ]

    user_features = user_item_df
    for keys, trade_col, cnt_col, rate_col in levels:
        stats = _user_trade_stats(user_item_df, keys, trade_col, cnt_col, rate_col)
        # Name the join keys explicitly instead of relying on "all shared columns".
        user_features = user_features.merge(stats, on=keys)
    return user_features

# User-side trade statistics for every training row.
# NOTE(review): the counts/rates are aggregated from `is_trade` of the same rows
# they are attached to, so each row's features include its own label -- confirm
# this leakage is handled before these columns are used for training.
train_user_ft = process_user_item_feature(train_df)

> 建立商品特征数据

In [None]:
def process_item_feature(df):
    """Attach item-side trade statistics to every row of `df`.

    For each of item, brand, category and city the rows are grouped by the
    corresponding id column; the group's trade count (sum of `is_trade`), row
    count and their ratio are then merged back onto the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `item_id`, `item_brand_id`, `item_category_id`,
        `item_city_id` and `is_trade`.

    Returns
    -------
    pandas.DataFrame
        The five input columns followed by `<level>_trade_cnt`,
        `<level>_cnt` and `<level>_rate` for the levels `item`, `brand`,
        `cate` and `city`, in that order.
    """
    item_df = df[['item_id', 'item_brand_id', 'item_category_id', 'item_city_id', 'is_trade']]

    # (grouping column, trade-count column, row-count column, rate column)
    levels = (
        ('item_id', 'item_trade_cnt', 'item_cnt', 'item_rate'),
        ('item_brand_id', 'brand_trade_cnt', 'brand_cnt', 'brand_rate'),
        ('item_category_id', 'cate_trade_cnt', 'cate_cnt', 'cate_rate'),
        ('item_city_id', 'city_trade_cnt', 'city_cnt', 'city_rate'),
    )

    features = item_df
    for key, trade_col, cnt_col, rate_col in levels:
        stats = item_df[[key, 'is_trade']].rename(columns={'is_trade': trade_col})
        # A column of ones sums to the number of rows in each group.
        stats[cnt_col] = 1
        stats = stats.groupby(key, as_index=False).sum()
        stats[rate_col] = stats[trade_col] / stats[cnt_col]
        # The only column shared with `features` is the grouping key.
        features = features.merge(stats)
    return features

# Item-side trade statistics for every training row.
# NOTE(review): the counts/rates are aggregated from `is_trade` of the same rows
# they are attached to, so each row's features include its own label -- confirm
# this leakage is handled before these columns are used for training.
train_item_ft = process_item_feature(train_df)

> 建立上下文特征数据

In [None]:
def process_context_time_feature(df):
    """Derive calendar features from `context_timestamp`.

    The previous version computed the new columns on a temporary slice and then
    returned the untouched input, so none of the features reached the caller.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `context_timestamp`, interpreted as seconds since the
        Unix epoch.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` (the input is not modified) with three extra columns:
        `context_datetime`, `context_day` (day of month) and
        `context_hour` (hour of day).  All original columns are kept, so code
        that used the old return value keeps working.
    """
    # NOTE(review): the conversion yields naive timestamps with no timezone
    # shift; if the data is recorded in local time (e.g. UTC+8) day/hour are
    # offset accordingly -- confirm against the data description.
    context_datetime = pd.to_datetime(df['context_timestamp'], unit='s')
    return df.assign(
        context_datetime=context_datetime,
        context_day=context_datetime.dt.day,
        context_hour=context_datetime.dt.hour,
    )

def predict_category_hit(row, max_rank=5, no_hit=0):
    """Rank at which one of the item's categories appears among the predictions.

    `row['predict_category_property']` is split on ';' into prediction
    entries and `row['item_category_list']` on ';' into category ids.
    The item's categories are tried in order, skipping the first (top-level)
    one; for the first category that is predicted, the position of its first
    prediction entry is returned, capped at `max_rank`.

    Compared with the previous version, a category now has to equal the
    category part of an entry.  The old substring test against the whole entry
    also fired when the id merely occurred inside another category id or inside
    a property id.

    Parameters
    ----------
    row : mapping
        Needs the string fields `predict_category_property` and
        `item_category_list`.
    max_rank : int, default 5
        Positions above this value are merged into one bucket, so sparsely
        populated high ranks are combined.
    no_hit : int, default 0
        Value returned when no category is predicted.  The default matches the
        historical behaviour and is therefore indistinguishable from a hit at
        position 0; pass e.g. `-1` to tell the two cases apart.

    Returns
    -------
    int
        `min(position, max_rank)` of the first hit, or `no_hit`.
    """
    # Assumes each entry looks like '<category_id>:<properties>' -- TODO confirm
    # against the data description.  An entry without ':' is compared as a whole.
    predicted = [
        entry.split(':', 1)[0]
        for entry in row['predict_category_property'].split(';')
    ]
    # Start with the second-level category; the first level is skipped.
    for category in row['item_category_list'].split(';')[1:]:
        if category in predicted:
            return min(predicted.index(category), max_rank)
    return no_hit

def process_context_predict_feature(df):
    """Score every row with `predict_category_hit`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `context_id`, `item_category_list` and
        `predict_category_property`.

    Returns
    -------
    pandas.DataFrame
        `context_id` together with the row-wise score in
        `category_predict_hit`, joined on the index of `df`.
    """
    hit = df[['item_category_list', 'predict_category_property']].apply(
        predict_category_hit, axis=1
    )
    # The series name becomes the column name in the join below.
    hit.name = 'category_predict_hit'
    return df[['context_id']].join(hit)

# Context features for the training set.  The hit score is computed row by row
# from the same `train_df`, so it carries that frame's index; attach it by
# index instead of merging on `context_id`, which would multiply rows whenever
# a context id occurs more than once.
train_item_ct = process_context_time_feature(train_df)
train_item_ct = train_item_ct.join(
    process_context_predict_feature(train_df)['category_predict_hit']
)
# Distribution of the hit score (displayed as the cell output).
train_item_ct['category_predict_hit'].value_counts()