In [None]:
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot

# Notebook-wide presentation settings: plot theme and pandas display limits.
jtplot.style()
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)

# `IPython.core.display` is a deprecated import path for these helpers;
# `IPython.display` is the public location.
from IPython.display import display, HTML
# Inject CSS that widens the notebook container to 96% of the page.
display(HTML("<style>.container { width:96% !important; }</style>"))

In [None]:
# Load the round-1 train/test splits (space-separated; paths come from `config`).
train_df = pd.read_csv(cf.round1_train_file_path, sep=' ')
test_df = pd.read_csv(cf.round1_test_file_path, sep=' ')

# Give every distinct category path a compact integer id.
# The ids are built from train + test (train first, so the ids of train
# categories are the same as when only train was used). Building them from
# train alone made the merge below drop every test row whose category path
# never occurs in train.
category_df = pd.concat([train_df['item_category_list'], test_df['item_category_list']]).unique()
category_ids = pd.DataFrame({'item_category_list': category_df, 'item_category_id': np.arange(len(category_df))})

# how='left' keeps every row of the left frame, in its original order.
train_df = train_df.merge(category_ids, on='item_category_list', how='left')
test_df = test_df.merge(category_ids, on='item_category_list', how='left')

# Shift the raw epoch seconds by +8 hours and -365 days before converting.
# NOTE(review): presumably UTC -> UTC+8 plus a one-year correction of the
# timestamps -- confirm against the data description.
time_offset = 8 * 60 * 60 - 365 * 24 * 60 * 60
train_df['context_datetime'] = pd.to_datetime(train_df['context_timestamp'] + time_offset, unit='s')
test_df['context_datetime'] = pd.to_datetime(test_df['context_timestamp'] + time_offset, unit='s')

In [None]:
# Number of distinct values in each column
distinct_counts = train_df.nunique()
print(distinct_counts)

# Summary statistics of the single column `is_trade`
trade_summary = train_df['is_trade'].describe()
print(trade_summary)

> 有 27% 的测试集用户在训练集中出现过，可以建立“历史用户”特征，提高这部分用户预测率

In [None]:
# user info
fig = plt.figure(figsize=(18, 6))

# How many users appear in train, in test, and in both
train_users = set(train_df['user_id'].tolist())
test_users = set(test_df['user_id'].tolist())
print('train user count : %d, test user count : %d, both user count : %d' % (
    len(train_users), len(test_users), len(train_users & test_users)))

# (column, bar-chart slot, pie-chart slot) on the 2x5 subplot grid
user_panels = [
    ('user_age_level', 1, 2),       # age level
    ('user_gender_id', 4, 5),       # gender
    ('user_occupation_id', 6, 7),   # occupation
    ('user_star_level', 9, 10),     # star level
]
traded_rows = train_df.loc[train_df['is_trade'] == 1]
for column, bar_slot, pie_slot in user_panels:
    totals = train_df[column].value_counts()
    trades = traded_rows[column].value_counts()
    # bar: trade rate (%) per value; pie: share of samples per value
    (trades / totals * 100).plot.bar(ax=fig.add_subplot(2, 5, bar_slot))
    totals.plot.pie(ax=fig.add_subplot(2, 5, pie_slot))

> 有 96% 的测试集商品在训练集中出现过，可以建立“历史商品”特征
> 属性标签：训练集样本中共有 10000 种属性组合，包含 6000 种不同的属性

In [None]:
# item info
fig = plt.figure(figsize=(18, 6))

# How many items appear in train, in test, and in both
train_items = set(train_df['item_id'].tolist())
test_items = set(test_df['item_id'].tolist())
print('train item count : %d, test item count : %d, both item count : %d' % (
    len(train_items), len(test_items), len(train_items & test_items)))

# `item_category_id` is assigned in the data-loading cell.

# Item category: trade rate (%) per category (bar) and sample share (pie)
category_totals = train_df['item_category_id'].value_counts()
category_trades = train_df.loc[train_df['is_trade'] == 1, 'item_category_id'].value_counts()
(category_trades / category_totals * 100).plot.bar(ax=fig.add_subplot(2, 5, 1))
category_totals.plot.pie(ax=fig.add_subplot(2, 5, 2))

# Item properties: count distinct ';'-joined combinations and distinct
# individual properties.
property_combinations = train_df['item_property_list'].unique()
prop_set = set()
for property_list in property_combinations:
    prop_set.update(property_list.split(';'))
print('property combination count: %d, property item count : %d' % (
    len(property_combinations), len(prop_set)))
# TODO: calculate high trade rate property

# Item city: sample count, trade count and trade rate (%) per city.
# Cities without any trade have NaN in `is_trade` / `rate` after the join.
city_totals = train_df['item_city_id'].value_counts()
city_trades = train_df.loc[train_df['is_trade'] == 1, 'item_city_id'].value_counts()
city_map = city_totals.to_frame(name='count').join(city_trades.to_frame(name='is_trade'))
city_map['rate'] = city_map['is_trade'] / city_map['count'] * 100

print(city_map)

# Item price level: same table as for the city
price_totals = train_df['item_price_level'].value_counts()
price_trades = train_df.loc[train_df['is_trade'] == 1, 'item_price_level'].value_counts()
price_map = price_totals.to_frame(name='count').join(price_trades.to_frame(name='is_trade'))
price_map['rate'] = price_map['is_trade'] / price_map['count'] * 100

# Deep copies of both frames as they are at this point
train_raw_df = train_df.copy(deep=True)
test_raw_df = test_df.copy(deep=True)

# Must be the last expression of the cell to be rendered; it used to sit
# before the copies above, so the price table was computed and thrown away.
price_map.sort_index().transpose()

> 属性标签：训练集样本中共有 10000 种属性组合，包含 6000 种不同的属性

In [None]:
# shop info
fig = plt.figure(figsize=(18, 9))

# How many shops appear in train, in test, and in both
train_shops = set(train_df['shop_id'].tolist())
test_shops = set(test_df['shop_id'].tolist())
# The last label used to read "both user count" (copy-paste from the user cell).
print('train shop count : %d, test shop count : %d, both shop count : %d' % (
    len(train_shops), len(test_shops), len(train_shops & test_shops)))

# (source column, number of quantile bins or None, bar slot, pie slot) on the
# 4x5 subplot grid. Columns with a bin count are first cut into quantile bins
# and stored on train_df as `<column>_qcut`; the others are plotted directly.
shop_panels = [
    ('shop_review_num_level', None, 1, 2),
    ('shop_review_positive_rate', 24, 4, 5),
    ('shop_star_level', None, 6, 7),
    ('shop_score_service', 11, 9, 10),
    ('shop_score_delivery', 11, 11, 12),
    ('shop_score_description', 11, 14, 15),
]
is_traded = train_df['is_trade'] == 1
for source, n_bins, bar_slot, pie_slot in shop_panels:
    column = source
    if n_bins is not None:
        column = source + '_qcut'
        # duplicates='drop' merges bins whose edges coincide, so fewer than
        # n_bins bins may come out.
        train_df[column] = pd.qcut(train_df[source], n_bins, duplicates='drop')
    totals = train_df[column].value_counts()
    trades = train_df.loc[is_traded, column].value_counts()
    # bar: trade rate (%) per value/bin; pie: share of samples per value/bin
    (trades / totals * 100).plot.bar(ax=fig.add_subplot(4, 5, bar_slot))
    totals.plot.pie(ax=fig.add_subplot(4, 5, pie_slot))

In [None]:
# Number of samples in each quantile bin of the shop positive-review rate
positive_rate_bins = pd.qcut(train_df['shop_review_positive_rate'], 24, duplicates='drop')
positive_rate_bins.value_counts()

In [None]:
# context info
fig = plt.figure(figsize=(18, 6))

# (column, bar-chart slot, pie-chart slot) on the 2x5 subplot grid
context_panels = [
    ('context_page_id', 1, 2),    # result page on which the item was shown
    ('item_category_id', 4, 5),   # item category
]
traded_rows = train_df.loc[train_df['is_trade'] == 1]
for column, bar_slot, pie_slot in context_panels:
    totals = train_df[column].value_counts()
    trades = traded_rows[column].value_counts()
    # bar: trade rate (%) per value; pie: share of samples per value
    (trades / totals * 100).plot.bar(ax=fig.add_subplot(2, 5, bar_slot))
    totals.plot.pie(ax=fig.add_subplot(2, 5, pie_slot))

> 构建类目预测相关性数据：约 95% 的样本与上下文预测的根类目匹配，因此从二级类目开始考虑

In [None]:
# `train_df` from the data-loading cell is reused here. Re-reading the CSV at
# this point replaced it with a frame that lacks the derived columns
# (`item_category_id`, `context_datetime`) which the datetime cell reads, and
# made the `'category_predict_hit' not in train_df` guard below always true.
# context info
f = plt.figure(figsize=(18,6))

def predict_category_hit(row):
    """Rank at which the item's category shows up among the predicted categories.

    `row['predict_category_property']` is a ';'-separated list of
    "category:prop,prop,..." entries and `row['item_category_list']` a
    ';'-separated category path. The root category (first path element) is
    skipped; every deeper level is compared with the category part of each
    predicted entry.

    Categories are compared for equality. A substring test against the whole
    entry text (the previous behaviour) also fired when the category id was
    merely contained in another category id or in one of the property ids.

    Returns the smallest index of a matching predicted entry, with every
    index above 5 folded into 5, or 0 when nothing matches.
    NOTE(review): 0 therefore means both "no hit" and "hit at index 0", and a
    hit at index 0 is overwritten by a later hit at a higher index -- confirm
    that this is intended.
    """
    predicted_categories = [entry.split(':')[0]
                            for entry in row['predict_category_property'].split(';')]
    ret = 0
    # start with second level category
    for category in row['item_category_list'].split(';')[1:]:
        for k, predicted in enumerate(predicted_categories):
            if category != predicted:
                continue
            # keep the best (lowest) rank; ranks above 5 share bucket 5 so
            # that the sparse tail is combined
            if ret == 0 or k < ret:
                ret = 5 if k > 5 else k
    return ret

# context category predict info: add the hit rank as a column (once)
if 'category_predict_hit' not in train_df:
    hit_inputs = train_df[['item_category_list', 'predict_category_property']]
    hit_rank = hit_inputs.apply(predict_category_hit, axis=1)
    hit_rank.name = 'category_predict_hit'
    train_df = train_df.join(hit_rank)

# bar: trade rate (%) per hit rank; pie: share of samples per hit rank
hit_totals = train_df['category_predict_hit'].value_counts()
hit_trades = train_df.loc[train_df['is_trade'] == 1, 'category_predict_hit'].value_counts()
(hit_trades / hit_totals * 100).plot.bar(ax=f.add_subplot(2, 5, 1))
hit_totals.plot.pie(ax=f.add_subplot(2, 5, 2))

In [None]:
# context info
# Build the property-hit correlation data: how often each property occurs in
# the items' property lists and how often it occurs among the predicted
# properties.
# `train_df` from the data-loading cell is reused; re-reading the CSV here
# dropped the derived columns (`item_category_id`, `context_datetime`) that
# the datetime cell reads.

# property id -> number of occurrences in `item_property_list`
property_map = {}
property_list = train_df['item_property_list']
for props in property_list:
    for prop in props.split(';'):
        property_map[prop] = property_map.get(prop, 0) + 1
# list(...) so the constructor gets a real list of pairs on Python 3 as well
prop_df = pd.DataFrame(list(property_map.items()), columns=['property_id', 'doc_freq'])

# property id -> number of occurrences in `predict_category_property`
predict_map = {}
predict_list = train_df['predict_category_property']
for cate_props in predict_list:
    for cate_prop in cate_props.split(';'):
        # each entry is "category:prop,prop,..."
        cp_pair = cate_prop.split(':')
        # Skip entries without a property part and the "-1" placeholder.
        # The placeholder must be compared as a string: str.split() never
        # yields the int -1, so `== -1` was always False and "-1" ended up
        # being counted as a property.
        if len(cp_pair) < 2 or cp_pair[1] == '-1':
            continue
        for prop in cp_pair[1].split(','):
            predict_map[prop] = predict_map.get(prop, 0) + 1
pred_df = pd.DataFrame(list(predict_map.items()), columns=['property_id', 'pre_freq'])
# Default (inner) merge on `property_id`: keeps properties present in both tables
prop_df = prop_df.merge(pred_df)

In [None]:
# Display the predicted-property frequency map built in the previous cell.
predict_map

In [None]:
# `train_df` from the earlier cells is reused here. Re-reading the CSV at this
# point replaced it with a frame that lacks the derived columns
# (`item_category_id`, `context_datetime`) which the datetime cell reads, and
# made the `'property_predict_hit' not in train_df` guard below always true.
# context info
f = plt.figure(figsize=(18,6))

def predict_property_hit(row):
    """Bucket how strongly the predicted properties cover the item's properties.

    `row['item_property_list']` is a ';'-separated list of property ids and
    `row['predict_category_property']` a ';'-separated list of
    "category:prop,prop,..." entries. Every (item property, predicted entry)
    pair in which the property is listed in that entry adds
    1 / property_map[property] to the score, so frequent properties weigh
    less. `property_map` is the module-level frequency table built in the
    property-frequency cell.

    Properties are matched for equality against the parsed property ids of
    an entry. A substring test against the entry text (the previous
    behaviour) also fired on the category id and on longer ids that merely
    contain the property id.

    Returns 0 for no hit, 1 for a score <= 0.00001, 2 for a score <= 0.00003,
    otherwise 3.
    Raises KeyError if a matched property is missing from `property_map`.
    """
    # one set of property ids per predicted entry; empty if there is no ':' part
    predicted_props = []
    for entry in row['predict_category_property'].split(';'):
        pair = entry.split(':')
        predicted_props.append(frozenset(pair[1].split(',')) if len(pair) > 1 else frozenset())

    ret = 0
    for prop in row['item_property_list'].split(';'):
        for props in predicted_props:
            if prop in props:
                ret += float(1) / property_map[prop]

    if ret == 0:
        return 0
    elif ret <= 0.00001:
        return 1
    elif ret <= 0.00003:
        return 2
    else:
        return 3

# context property predict info: add the hit bucket as a column (once)
if 'property_predict_hit' not in train_df:
    hit_inputs = train_df[['item_category_list', 'item_property_list', 'predict_category_property']]
    hit_bucket = hit_inputs.apply(predict_property_hit, axis=1)
    hit_bucket.name = 'property_predict_hit'
    train_df = train_df.join(hit_bucket)

# bar: trade rate (%) per hit bucket; pie: share of samples per hit bucket
bucket_totals = train_df['property_predict_hit'].value_counts()
bucket_trades = train_df.loc[train_df['is_trade'] == 1, 'property_predict_hit'].value_counts()
(bucket_trades / bucket_totals * 100).plot.bar(ax=f.add_subplot(2, 5, 4))
bucket_totals.plot.pie(ax=f.add_subplot(2, 5, 5))
print(bucket_totals)
print(bucket_trades)

> 数据只有7天，17号数据只有16点之前的，24号数据只有16点之后的，暂不考虑引入星期等日期特征

In [None]:
# datetime info
fig = plt.figure(figsize=(18, 6))


def _plot_trade_rate(figure, frame, column, bar_slot, pie_slot):
    """Draw trade rate (%) per value of `column` as a bar chart and the sample
    share per value as a pie chart on a 2x5 grid; return the sample counts."""
    totals = frame[column].value_counts()
    trades = frame.loc[frame['is_trade'] == 1, column].value_counts()
    (trades / totals * 100).plot.bar(ax=figure.add_subplot(2, 5, bar_slot))
    totals.plot.pie(ax=figure.add_subplot(2, 5, pie_slot))
    return totals


# Earlier cells re-read train_df from the CSV, which discards the
# `context_datetime` column derived in the data-loading cell; without this
# guard the lines below fail with a KeyError on a fresh Run All.
if 'context_datetime' not in train_df:
    train_df['context_datetime'] = pd.to_datetime(train_df['context_timestamp'] + time_offset, unit='s')

# by day
train_df['context_day'] = train_df['context_datetime'].dt.day
test_df['context_day'] = test_df['context_datetime'].dt.day
# by hours
train_df['context_hour'] = train_df['context_datetime'].dt.hour
test_df['context_hour'] = test_df['context_datetime'].dt.hour

# per day of month, all training rows
print(_plot_trade_rate(fig, train_df, 'context_day', 1, 2))

# per hour of day, all training rows
_plot_trade_rate(fig, train_df, 'context_hour', 4, 5)

# per hour of day, rows from 2017-09-18 up to (not including) 2017-09-23
trainW1_df = train_df.loc[(train_df['context_datetime'] >= '2017-09-18') & (train_df['context_datetime'] < '2017-09-23')]
_plot_trade_rate(fig, trainW1_df, 'context_hour', 6, 7)

# per hour of day, rows from 2017-09-23 onwards
trainW2_df = train_df.loc[train_df['context_datetime'] >= '2017-09-23']
_plot_trade_rate(fig, trainW2_df, 'context_hour', 9, 10)