In [None]:
import numpy as np
import pandas as pd
import statsmodels as sm
import matplotlib.pylab as plt
import config as cf
import import_ipynb
import sys

from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
from chinese_calendar import is_workday, is_holiday
from jupyterthemes import jtplot
from IPython.core.display import clear_output
from feature_extract import *

# Apply the jupyterthemes plotting style to figures drawn in this notebook.
jtplot.style()

# Raise pandas' display limits so wide/long feature frames are not truncated.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)

# Widen the notebook page container to 96% of the browser window.
# NOTE(review): newer IPython releases expose these names from IPython.display — confirm before upgrading.
from IPython.core.display import display, HTML
notebook_width_css = "<style>.container { width:96% !important; }</style>"
display(HTML(notebook_width_css))

In [None]:
# Alternative split kept for reference (mixes a date window with a timestamp-modulo sample):
# trainV_df = train_df.loc[((train_df['context_datetime'] >= '2017-09-19') & (train_df['context_datetime'] < '2017-09-25')) | ((train_df['context_datetime'] < '2017-09-19') & (train_df['context_timestamp'] % 3 != 0))]
# testV_df = train_df.loc[(train_df['context_datetime'] >= '2017-09-18') & (train_df['context_datetime'] < '2017-09-19') & (train_df['context_timestamp'] % 3 == 0)]

# Active split, purely by date:
#   validation-train = rows with 2017-09-18 <= context_datetime < 2017-09-24
#   validation-test  = rows with 2017-09-24 <= context_datetime < 2017-09-25
# NOTE(review): train_df is not defined in the visible cells — presumably provided by
# `from feature_extract import *`; confirm.
context_dt = train_df['context_datetime']
in_valid_train_window = (context_dt >= '2017-09-18') & (context_dt < '2017-09-24')
in_valid_test_window = (context_dt >= '2017-09-24') & (context_dt < '2017-09-25')

trainV_df = train_df.loc[in_valid_train_window]
testV_df = train_df.loc[in_valid_test_window]

In [None]:
# Row counts: full train/test first, then the validation train/test split.
print('%d %d' % (len(train_df), len(test_df)))
print('%d %d' % (len(trainV_df), len(testV_df)))

# Distinct users in the full training frame vs. the real test frame.
train_users = set(train_df['user_id'].tolist())
test_users = set(test_df['user_id'].tolist())
print('train user count : %d, test user count : %d, both user count : %d' % (
    len(train_users), len(test_users), len(train_users & test_users)))

# Same comparison for the validation-train frame; note the overlap is still
# measured against test_df, not testV_df.
trainV_users = set(trainV_df['user_id'].tolist())
test_users = set(test_df['user_id'].tolist())
print('trainV user count : %d, test user count : %d, both user count : %d' % (
    len(trainV_users), len(test_users), len(trainV_users & test_users)))

> 应用特征处理的结果到验证的训练集和测试集

In [None]:
def process_train_test_features(tr_df, te_df, st_df, all_df, proc_func_list):
    """Build the feature frames for one train/test pair.

    The result starts from ``process_base_feature(tr_df, te_df)``; the output
    of every function in ``proc_func_list`` is then left-merged onto it, and
    finally duplicate rows are removed.

    Args:
        tr_df: frame holding the training rows.
        te_df: frame holding the test rows.
        st_df: passed unchanged to every processing function.
        all_df: passed unchanged to every processing function.
        proc_func_list: iterable of callables invoked as
            ``f(tr_df, te_df, st_df, all_df)`` and returning a
            ``(train_features, test_features)`` pair of DataFrames.

    Returns:
        tuple: ``(tr_fs, te_fs)`` — the merged, de-duplicated train and test
        feature frames.
    """
    tr_fs, te_fs = process_base_feature(tr_df, te_df)

    print('begin base: %d %d' % (len(tr_fs), len(te_fs)))
    for proc_func in proc_func_list:
        # `__name__` works on Python 2 and 3 (`func_name` is Python 2 only);
        # fall back to repr() for callables that carry no name at all.
        func_label = getattr(proc_func, '__name__', repr(proc_func))
        clear_output()
        print('processing ' + func_label + ' ...')
        tr_f, te_f = proc_func(tr_df, te_df, st_df, all_df)
        clear_output()
        print('merging %s: %d %d' % (func_label, len(tr_f), len(te_f)))
        # No `on=` is given, so pandas joins on every column name the two
        # frames have in common.
        tr_fs = tr_fs.merge(tr_f, how='left')
        te_fs = te_fs.merge(te_f, how='left')
    clear_output()
    # De-duplicate explicitly: a side-effecting map(lambda ...) is lazy on
    # Python 3 and would silently skip this step.
    tr_fs = tr_fs.drop_duplicates()
    te_fs = te_fs.drop_duplicates()
    print('done features: %d %d' % (len(tr_fs), len(te_fs)))
    return tr_fs, te_fs

In [None]:
# fast features
# Processing functions applied, in this order, to both the validation split
# and the full train/test data.
fast_proc_func_list = [
    process_base_combine_feature,
    process_intersection_id,
    process_context_predict_feature,
    process_context_time_feature,
    process_user_ot_feature,
    process_user_item_ot_feature,
    process_item_ot_feature,
    process_shop_ot_feature,
    process_item_property_feature,
    process_shop_score_qcut_feature,
    process_item_prob_feature,
    process_user_prob_feature,
    process_user_item_prob_feature,
]

# Validation pair first, then the full train/test pair.
trV_fs, teV_fs = process_train_test_features(
    tr_df=trainV_df, te_df=testV_df, st_df=stat_df, all_df=all_df,
    proc_func_list=fast_proc_func_list)
tr_fs, te_fs = process_train_test_features(
    tr_df=train_df, te_df=test_df, st_df=stat_df, all_df=all_df,
    proc_func_list=fast_proc_func_list)

In [None]:
# slow features
# When True, reload the slow-feature CSVs written by an earlier run instead of
# recomputing them; when False, recompute and overwrite those CSVs.
user_old = True

# File locations, ordered as: validation-train, validation-test, full train, full test.
slow_feature_paths = (
    cf.train_valid_features_2_file_path,
    cf.test_valid_features_2_file_path,
    cf.train_data_features_2_file_path,
    cf.test_data_features_2_file_path,
)

if not user_old:
    slow_proc_func_list = [process_context_time_rolling_feature]
    trV_fs_2, teV_fs_2 = process_train_test_features(trainV_df, testV_df, stat_df, all_df, slow_proc_func_list)
    tr_fs_2, te_fs_2 = process_train_test_features(train_df, test_df, stat_df, all_df, slow_proc_func_list)
    # Persist the freshly computed frames so later runs can take the reload branch.
    for slow_frame, slow_path in zip((trV_fs_2, teV_fs_2, tr_fs_2, te_fs_2), slow_feature_paths):
        slow_frame.to_csv(slow_path)
else:
    # index_col=0 turns the index column written by to_csv back into the index.
    trV_fs_2, teV_fs_2, tr_fs_2, te_fs_2 = [
        pd.read_csv(slow_path, index_col=0) for slow_path in slow_feature_paths
    ]

# Columns removed from the slow frames before merging, selected by position.
# NOTE(review): the 2:29 / 1:28 windows presumably cover base columns already present in the
# fast-feature frames (train shifted by one extra leading column) — confirm against the CSV layout.
train_drop_columns = tr_fs_2.columns.values[2:29]
test_drop_columns = te_fs_2.columns.values[1:28]

# Left-merge the remaining slow columns onto the fast-feature frames; with no
# `on=` pandas joins on every column name the two frames share.
trV_slow_only = trV_fs_2.drop(columns=train_drop_columns)
teV_slow_only = teV_fs_2.drop(columns=test_drop_columns)
tr_slow_only = tr_fs_2.drop(columns=train_drop_columns)
te_slow_only = te_fs_2.drop(columns=test_drop_columns)

trV_fs = trV_fs.merge(trV_slow_only, how='left')
teV_fs = teV_fs.merge(teV_slow_only, how='left')
tr_fs = tr_fs.merge(tr_slow_only, how='left')
te_fs = te_fs.merge(te_slow_only, how='left')

In [None]:
# Show the column names and the (rows, columns) shape of the full training feature frame.
print(tr_fs.columns.values)
print(tr_fs.shape)

In [None]:
# Write the four final feature frames to the CSV locations configured in `cf`:
# the validation pair first, then the full train/test pair.
feature_outputs = (
    (trV_fs, cf.train_valid_features_file_path),
    (teV_fs, cf.test_valid_features_file_path),
    (tr_fs, cf.train_data_features_file_path),
    (te_fs, cf.test_data_features_file_path),
)
for feature_frame, feature_path in feature_outputs:
    feature_frame.to_csv(feature_path)