# 模型拟合 CV部分
采用xgboost模型 logistic regression 用于预测reorder的概率
#重要说明：将eval_metric 从 logloss 改为 auc 
#增大了max_depth（6 -> 10） 明显提高了模型拟合分数
#在特征和模型数据增大时 tree_method会自动切换成prob 然后会出现奇怪的错误 程序终止运行 所以这里增加了tree_method

需要解决的问题 ： 目前评判标准是auc，但是kaggle上面的评分标准是mean-F1, 不知道在trainning的时候是否有必要自定义一个函数f1-score来作为eval-metric

In [1]:
import gc
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import xgboost
from functools import partial
from sklearn.cross_validation import train_test_split

train = pd.read_pickle('train.pkl')
train.drop(['eval_set'], axis=1, inplace=True)
train.loc[:, 'reordered'] = train.reordered.fillna(0)
xgb_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "auc"
    ,"eta"              : 0.1
    ,"max_depth"        : 10
    ,"min_child_weight" : 5
    ,"gamma"            :0.70
    ,"subsample"        :1.0
    ,"colsample_bytree" :0.95
    ,"alpha"            :2e-05
    ,"lambda"           :10
    ,'tree_method'      :'exact'
}




# 原程序自带的记时和用于输出结果的函数（可以忽略）
这里没有做修改

In [2]:

def load_data(path_data):
    '''
    --------------------------------order_product--------------------------------
    * Unique in order_id + product_id
    '''
    priors = pd.read_csv(path_data + 'order_products__prior.csv', 
                     dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    train = pd.read_csv(path_data + 'order_products__train.csv', 
                    dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    '''
    --------------------------------order--------------------------------
    * This file tells us which set (prior, train, test) an order belongs
    * Unique in order_id
    * order_id in train, prior, test has no intersection
    * this is the #order_number order of this user
    '''
    orders = pd.read_csv(path_data + 'orders.csv', 
                         dtype={
                                'order_id': np.int32,
                                'user_id': np.int64,
                                'eval_set': 'category',
                                'order_number': np.int16,
                                'order_dow': np.int8,
                                'order_hour_of_day': np.int8,
                                'days_since_prior_order': np.float32})

    #  order in prior, train, test has no duplicate
    #  order_ids_pri = priors.order_id.unique()
    #  order_ids_trn = train.order_id.unique()
    #  order_ids_tst = orders[orders.eval_set == 'test']['order_id'].unique()
    #  print(set(order_ids_pri).intersection(set(order_ids_trn)))
    #  print(set(order_ids_pri).intersection(set(order_ids_tst)))
    #  print(set(order_ids_trn).intersection(set(order_ids_tst)))

    '''
    --------------------------------product--------------------------------
    * Unique in product_id
    '''
    products = pd.read_csv(path_data + 'products.csv')
    aisles = pd.read_csv(path_data + "aisles.csv")
    departments = pd.read_csv(path_data + "departments.csv")
    sample_submission = pd.read_csv(path_data + "sample_submission.csv")
    
    return priors, train, orders, products, aisles, departments, sample_submission
class tick_tock:
    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
    def __enter__(self):
        if self.verbose:
            print(self.process_name + " begin ......")
            self.begin_time = time.time()
    def __exit__(self, type, value, traceback):
        if self.verbose:
            end_time = time.time()
            print(self.process_name + " end ......")
            print('time lapsing {0} s \n'.format(end_time - self.begin_time))
            

def ka_add_groupby_features_n_vs_1(df, group_columns_list, target_columns_list, methods_list, keep_only_stats=True, verbose=1):
   
    with tick_tock("add stats features", verbose):
        dicts = {"group_columns_list": group_columns_list , "target_columns_list": target_columns_list, "methods_list" :methods_list}

        for k, v in dicts.items():
            try:
                if type(v) == list:
                    pass
                else:
                    raise TypeError(k + "should be a list")
            except TypeError as e:
                print(e)
                raise

        grouped_name = ''.join(group_columns_list)
        target_name = ''.join(target_columns_list)
        combine_name = [[grouped_name] + [method_name] + [target_name] for method_name in methods_list]

        df_new = df.copy()
        grouped = df_new.groupby(group_columns_list)

        the_stats = grouped[target_name].agg(methods_list).reset_index()
        the_stats.columns = [grouped_name] + \
                            ['_%s_%s_by_%s' % (grouped_name, method_name, target_name) \
                             for (grouped_name, method_name, target_name) in combine_name]
        if keep_only_stats:
            return the_stats
        else:
            df_new = pd.merge(left=df_new, right=the_stats, on=group_columns_list, how='left')
        return df_new
path_data = '../input/'
priors, train_detail, orders, products, aisles, departments, sample_submission = load_data(path_data)
        

# CV模块

自行创建 参考https://github.com/happycube/kaggle2017/blob/master/instacart/catboost-0723.ipynb
#用3 fold CV 验证模型的情况 并输出f1-score

In [3]:

train_details = pd.merge(
                left=train_detail,
                 right=orders, 
                 how='left', 
                 on='order_id'
        ).apply(partial(pd.to_numeric, errors='ignore', downcast='integer'))

try:
    df_train_gt = pd.read_csv('train.csv', index_col='order_id')
except:
    train_gtl = []

    for uid, subset in train_details.groupby('user_id'):
        subset1 = subset[subset.reordered == 1]
        oid = subset.order_id.values[0]

        if len(subset1) == 0:
            train_gtl.append((oid, 'None'))
            continue

        ostr = ' '.join([str(int(e)) for e in subset1.product_id.values])
        # .strip is needed because join can have a padding space at the end
        train_gtl.append((oid, ostr.strip()))

    df_train_gt = pd.DataFrame(train_gtl)

    df_train_gt.columns = ['order_id', 'products']
    df_train_gt.set_index('order_id', inplace=True)
    df_train_gt.sort_index(inplace=True)
    df_train_gt.to_csv('train.csv')

### 用于处理CV的xgboost函数 

In [4]:
def xgboost_cv(X_train, y_train, X_val, y_val, features_to_use):
    d_train = xgboost.DMatrix(X_train[features_to_use], y_train)
    
    d_val = xgboost.DMatrix(X_val[features_to_use], y_val)
    
    watchlist = [(d_val, "val"), (d_train, "train")];
    
    bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round = 80, evals=watchlist, verbose_eval=10)
    
    return bst

### 输出F1-score函数

In [5]:
def compare_results(df_gt, df_preds):
    
    df_gt_cut = df_gt.loc[df_preds.index]
    
    f1 = []
    for gt, pred in zip(df_gt_cut.sort_index().products, df_preds.sort_index().products):
        lgt = gt.replace("None", "-1").split(' ')
        lpred = pred.replace("None", "-1").split(' ')

        rr = (np.intersect1d(lgt, lpred))
        precision = np.float(len(rr)) / len(lpred)
        recall = np.float(len(rr)) / len(lgt)

        denom = precision + recall
        f1.append(((2 * precision * recall) / denom) if denom > 0 else 0)

    #print(np.mean(f1))
    return(np.mean(f1))

### CV的主要部分
3折CV 跑的时间很长

In [6]:
df_cvfolds = []
bst = []

for fold in range(3):
    train_subset = train[train.user_id % 3 != fold]
    valid_subset = train[train.user_id % 3 == fold]

    X_train = train_subset.drop('reordered', axis=1)
    y_train = train_subset.reordered

    X_val = valid_subset.drop('reordered', axis=1)
    y_val = valid_subset.reordered

    val_index = X_val[['user_id', 'product_id', 'order_id']]
    
    features_to_use = list(X_train.columns)
    features_to_use.remove('user_id')
    features_to_use.remove('product_id')
    features_to_use.remove('order_id')

    bst.append(xgboost_cv(X_train, y_train, X_val, y_val, features_to_use))
    
    d_test = xgboost.DMatrix(X_val[features_to_use], y_val) 
    
    lim = .203
    val_out = val_index.copy()

    val_out.loc[:,'reordered'] = (bst[-1].predict(d_test) > lim).astype(int)
    val_out.loc[:, 'product_id'] = val_out.product_id.astype(str)
    presubmit = ka_add_groupby_features_n_vs_1(val_out[val_out.reordered == 1], 
                                                   group_columns_list=['order_id'],
                                                   target_columns_list= ['product_id'],
                                                   methods_list=[lambda x: ' '.join(set(x))], keep_only_stats=True)

    presubmit = presubmit.set_index('order_id')
    presubmit.columns = ['products']

    fullfold = pd.DataFrame(index = val_out.order_id.unique())

    fullfold.index.name = 'order_id'
    fullfold['products'] = ['None'] * len(fullfold)

    fullfold.loc[presubmit.index, 'products'] = presubmit.products

    print(fold, compare_results(df_train_gt, fullfold))
    
    df_cvfolds.append(fullfold)

[0]	val-auc:0.825454	train-auc:0.826358
[10]	val-auc:0.830801	train-auc:0.832994
[20]	val-auc:0.832327	train-auc:0.835941
[30]	val-auc:0.833427	train-auc:0.838581
[40]	val-auc:0.834325	train-auc:0.841047
[50]	val-auc:0.834903	train-auc:0.843163
[60]	val-auc:0.835215	train-auc:0.844889
[70]	val-auc:0.835426	train-auc:0.846238
add stats features begin ......
add stats features end ......
time lapsing 0.833000183105 s 

(0, 0.38076062381574383)
[0]	val-auc:0.825384	train-auc:0.826354
[10]	val-auc:0.830839	train-auc:0.832889
[20]	val-auc:0.832485	train-auc:0.835936
[30]	val-auc:0.833593	train-auc:0.838538
[40]	val-auc:0.834494	train-auc:0.840964
[50]	val-auc:0.835051	train-auc:0.84305
[60]	val-auc:0.835401	train-auc:0.844641
[70]	val-auc:0.835621	train-auc:0.84608
add stats features begin ......
add stats features end ......
time lapsing 0.923000097275 s 

(1, 0.38060784899837524)
[0]	val-auc:0.824985	train-auc:0.826803
[10]	val-auc:0.830098	train-auc:0.83317
[20]	val-auc:0.831767	train-au

### CV的最终结果

In [7]:
df_cv = pd.concat(df_cvfolds)
print(compare_results(df_train_gt, df_cv))

0.3799451371


# 全部数据运行
注释中是之前用的xgboost.cv

In [3]:
train.drop(['user_id', 'product_id', 'order_id'], axis=1, inplace=True)
y_train = train.reordered
X_train = train.drop('reordered', axis = 1)
#如果数据量太大无法带动 请用下面的X_train和y_train
# X_train, X_val, y_train, y_val = train_test_split(train.drop('reordered', axis=1), train.reordered,
#                                                      test_size=0.2, random_state=42)

d_train = xgboost.DMatrix(X_train, y_train)

watchlist= [(d_train, "train")]
#### tang
# res = xgboost.cv(xgb_params, d_train, num_boost_round=10, nfold=3, seed=0,stratified=True,show_stdv=True)
# cv_mean = res.iloc[-1, 0]  
# cv_std = res.iloc[-1, 1]  
# print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))
# CV-Mean: 0.830923333333+0.000128857371625
# CV-Mean: 0.830947+0.000159241954271

In [4]:
# xgb_params = {
#     "objective"         : "reg:logistic"
#     ,"eval_metric"      : "auc"
#     ,"eta"              : 0.1
#     ,"max_depth"        : 10
#     ,"min_child_weight" : 5
#     ,"gamma"            :0.70
#     ,"subsample"        :0.95
#     ,"colsample_bytree" :0.95
#     ,"alpha"            :2e-05
#     ,"lambda"           :10
#     ,'tree_method'      :'exact'
# }
bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=100, evals=watchlist, verbose_eval=10)

[0]	train-auc:0.827064
[10]	train-auc:0.832756
[20]	train-auc:0.835223
[30]	train-auc:0.837393
[40]	train-auc:0.839432
[50]	train-auc:0.841168
[60]	train-auc:0.842582
[70]	train-auc:0.843776
[80]	train-auc:0.844869
[90]	train-auc:0.845772
[100]	train-auc:0.846525
[110]	train-auc:0.847078
[120]	train-auc:0.847749
[130]	train-auc:0.84846
[140]	train-auc:0.849009
[150]	train-auc:0.849491
[160]	train-auc:0.849917
[170]	train-auc:0.850464


清理一下内存 不然会报错

In [6]:
del train
gc.collect()

NameError: name 'train' is not defined

# 结果导出

In [8]:
X_test = pd.read_pickle('X_test.pkl')
d_test = xgboost.DMatrix(X_test.drop(['eval_set', 'user_id', 'order_id', 'reordered', 'product_id'], axis=1))
X_test.loc[:,'reordered'] = (bst.predict(d_test) > 0.203).astype(int)
X_test.loc[:, 'product_id'] = X_test.product_id.astype(str)
submit = ka_add_groupby_features_n_vs_1(X_test[X_test.reordered == 1], 
                                               group_columns_list=['order_id'],
                                               target_columns_list= ['product_id'],
                                               methods_list=[lambda x: ' '.join(set(x))], keep_only_stats=True)


submit.columns = sample_submission.columns.tolist()
submit_final = sample_submission[['order_id']].merge(submit, how='left').fillna('None')
submit_final.to_csv("python_test.csv", index=False)

add stats features begin ......
add stats features end ......
time lapsing 1.4889998436 s 



# 结果 0.3832204  RANK 14%

# 画feature_importance图
需要解决怎么画个能看清楚的图

In [None]:
%matplotlib inline
xgboost.plot_importance(bst)