In [None]:
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf
import import_ipynb
import sys

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot
from IPython.core.display import clear_output
from feature_extract import *

# Notebook presentation setup: plot theme, pandas display limits, page width.
jtplot.style()

# Let pandas print up to 1000 rows / 200 columns before truncating the output.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)

from IPython.core.display import display, HTML

# Inject a CSS rule so the notebook container uses 96% of the browser width.
wide_container_css = "<style>.container { width:96% !important; }</style>"
display(HTML(wide_container_css))

In [None]:
# Load the round-1 train and test tables (space-separated; paths come from `config`).
train_df = pd.read_csv(cf.round1_train_file_path, sep=' ')
test_df = pd.read_csv(cf.round1_test_file_path, sep=' ')

# Map every distinct `item_category_list` value to a dense integer id.
# The vocabulary is collected from train first and then test, so categories seen in
# train keep the ids a train-only vocabulary would give them, while categories that
# appear only in test get fresh ids.  With a train-only vocabulary and the default
# inner merge, test rows with an unseen category would be silently dropped.
category_df = pd.concat(
    [train_df['item_category_list'], test_df['item_category_list']],
    ignore_index=True).unique()
category_ids = pd.DataFrame({'item_category_list': category_df,
                             'item_category_id': np.arange(len(category_df))})
# how='left' guarantees that every input row (and its order) is preserved.
train_df = train_df.merge(category_ids, on='item_category_list', how='left')
test_df = test_df.merge(category_ids, on='item_category_list', how='left')

# Offset in seconds added to the raw timestamps: +8 hours, -365 days.
# NOTE(review): presumably a UTC -> UTC+8 conversion plus a one-year shift of the
# published timestamps; confirm against the data description.
time_offset = 8 * 60 * 60 - 365 * 24 * 60 * 60
for _frame in (train_df, test_df):
    _frame['context_datetime'] = pd.to_datetime(_frame['context_timestamp'] + time_offset, unit='s')
    # Day of month, taken with the vectorised datetime accessor instead of a per-row lambda.
    _frame['context_day'] = _frame['context_datetime'].dt.day

# Frame handed to the feature functions as their `st_df` argument.
# This is an alias of train_df, not a copy.
stat_df = train_df

In [None]:
# Alternative split kept for reference (disabled): it mixes days by sampling on
# `context_timestamp % 3`.
# trainV_df = train_df.loc[((train_df['context_datetime'] >= '2017-09-19') & (train_df['context_datetime'] < '2017-09-25')) | ((train_df['context_datetime'] < '2017-09-19') & (train_df['context_timestamp'] % 3 != 0))]
# testV_df = train_df.loc[(train_df['context_datetime'] >= '2017-09-18') & (train_df['context_datetime'] < '2017-09-19') & (train_df['context_timestamp'] % 3 == 0)]

# Active split: rows from 2017-09-18 up to (not including) 2017-09-24 form the
# validation-training set; rows of 2017-09-24 form the validation-test set.
context_time = train_df['context_datetime']
in_valid_train_window = (context_time >= '2017-09-18') & (context_time < '2017-09-24')
in_valid_test_window = (context_time >= '2017-09-24') & (context_time < '2017-09-25')

trainV_df = train_df.loc[in_valid_train_window]
testV_df = train_df.loc[in_valid_test_window]

In [None]:
# Row counts of the full and the validation splits.
# Single-argument print(...) produces the same text on Python 2 and also runs on Python 3.
print('%d %d' % (len(train_df), len(test_df)))
print('%d %d' % (len(trainV_df), len(testV_df)))

# Distinct users in train vs. test, and how many appear in both.
a = set(train_df['user_id'].tolist())
b = set(test_df['user_id'].tolist())
print('train user count : %d, test user count : %d, both user count : %d' % (len(a), len(b), len(a & b)))

# Same comparison for the validation-training window against the real test set.
# `b` (the test users) is unchanged, so it is reused rather than rebuilt.
a = set(trainV_df['user_id'].tolist())
print('trainV user count : %d, test user count : %d, both user count : %d' % (len(a), len(b), len(a & b)))

> 应用特征处理的结果到验证的训练集和测试集

In [None]:
def process_train_test_features(tr_df, te_df, st_df):
    """Build the feature tables for one train/test pair.

    Starts from the output of ``process_base_feature(tr_df, te_df)`` and then, for
    each feature function in turn, calls it as ``proc_func(tr_df, te_df, st_df)`` and
    left-merges the two returned frames onto the running train/test feature tables.
    The merges pass no ``on=`` argument, so pandas joins on all columns the two
    frames have in common.  Exact duplicate rows are dropped at the end.

    Parameters
    ----------
    tr_df : training rows passed to every feature function.
    te_df : test rows passed to every feature function.
    st_df : frame passed as third argument to every feature function
        (presumably the data statistics are computed on; see ``feature_extract``).

    Returns
    -------
    (tr_fs, te_fs) : the merged, de-duplicated train and test feature tables.
    """
    proc_func_list = [process_context_predict_feature, process_context_time_feature,
                      process_user_feature, process_item_feature, process_item_property_feature,
                      process_user_item_feature,
                      process_shop_feature, process_shop_score_qcut_feature]
    # Alternative, shorter pipeline kept for reference:
    #     proc_func_list = [process_context_predict_feature, process_context_time_feature,
    #                       process_shop_score_qcut_feature]
    tr_fs, te_fs = process_base_feature(tr_df, te_df)

    print('begin base: %d %d' % (len(tr_fs), len(te_fs)))
    for proc_func in proc_func_list:
        # __name__ is available on Python 2 and 3 (func_name is Python 2 only).
        step_name = proc_func.__name__
        # wait=True defers the clearing until the next message is printed, so a
        # status line is replaced by its successor instead of being erased immediately.
        clear_output(wait=True)
        print('processing ' + step_name + ' ...')
        tr_f, te_f = proc_func(tr_df, te_df, st_df)
        clear_output(wait=True)
        print('merging %s: %d %d' % (step_name, len(tr_f), len(te_f)))
        tr_fs = tr_fs.merge(tr_f, how='left')
        te_fs = te_fs.merge(te_f, how='left')
    clear_output(wait=True)
    # Explicit de-duplication: a side-effect-only map() is lazy on Python 3 and
    # would never run there.
    tr_fs = tr_fs.drop_duplicates()
    te_fs = te_fs.drop_duplicates()
    print('done features: %d %d' % (len(tr_fs), len(te_fs)))
    return tr_fs, te_fs

In [None]:
# Features for the validation split (validation-training window vs. the held-out day).
# NOTE(review): stat_df is the full train_df, which also contains the held-out
# validation day; if the feature functions aggregate labels from st_df this may leak
# information into the validation features -- confirm in feature_extract.
trV_fs, teV_fs = process_train_test_features(tr_df=trainV_df, te_df=testV_df, st_df=stat_df)

# Features for the real train/test sets.
tr_fs, te_fs = process_train_test_features(tr_df=train_df, te_df=test_df, st_df=stat_df)

In [None]:
# Quick inspection of the final training feature table: column names, then (rows, columns).
# Single-argument print(...) gives the same output on Python 2 and also runs on Python 3.
print(tr_fs.columns.values)
print(tr_fs.shape)

In [None]:
# Persist the four feature tables as CSV at the paths defined in `config`.
# Written in the order: validation train, validation test, full train, full test.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
feature_outputs = (
    (trV_fs, cf.train_valid_features_file_path),
    (teV_fs, cf.test_valid_features_file_path),
    (tr_fs, cf.train_data_features_file_path),
    (te_fs, cf.test_data_features_file_path),
)
for feature_table, output_path in feature_outputs:
    feature_table.to_csv(output_path)