In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc

In [2]:
# Load the pre-computed frames produced by the feature-engineering step.
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only
# load pickles that were produced by a trusted run of this project.
products, priors, users, userXproduct, df_temp, df_train = (
    pd.read_pickle(pickle_name)
    for pickle_name in (
        'products',
        'priors',
        'users',
        'userXproduct',
        'df_temp',
        'df_train',
    )
)


In [3]:
def eval_fun(labels, preds):
    """Compute precision, recall and F1 between two space-separated id strings.

    Parameters
    ----------
    labels : str
        Space-separated ids treated as the ground truth.
    preds : str
        Space-separated ids treated as the prediction.

    Returns
    -------
    tuple of float
        ``(precision, recall, f1)``. ``f1`` is ``0.0`` when the two id lists
        share no element.
    """
    labels = labels.split(' ')
    preds = preds.split(' ')
    # np.intersect1d returns the sorted *unique* common values, so an id that
    # is repeated in either string is only counted once as a hit.
    hits = float(len(np.intersect1d(labels, preds)))
    # str.split(' ') never returns an empty list, so both divisors are >= 1.
    # The builtin float replaces np.float, which was removed in NumPy 1.24.
    precision = hits / len(preds)
    recall = hits / len(labels)
    if precision + recall == 0:
        return (precision, recall, 0.0)
    f1 = 2 * precision * recall / (precision + recall)
    return (precision, recall, f1)

# 交叉验证实验

In [4]:
# Helper functions for cross-validation: the items that belong to the same
# order id have to be merged into a single string.
def get_liststr(df_test):
    """Join the items of an iterable into one space-separated string.

    Parameters
    ----------
    df_test : iterable
        Any iterable of values; each value is converted with ``str``.

    Returns
    -------
    str
        The items separated by single spaces, or ``''`` for an empty
        iterable (the previous loop raised ``UnboundLocalError`` there).
    """
    return ' '.join(str(row) for row in df_test)

def get_liststr1(df_test):
    """Split a string on single spaces and join the pieces back with spaces.

    Parameters
    ----------
    df_test : str
        A space-separated string.

    Returns
    -------
    str
        The pieces of ``df_test`` re-joined with single spaces.
    """
    pieces = df_test.split(' ')
    return ' '.join(pieces)
# Select the predictions that pass the threshold and collect them into one
# string per order: 'product1 product2 product3 ...'
def get_pred_results(df_test,thrshold=0.22):
    """Build one space-separated product string per order from row scores.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Must provide the columns ``order_id``, ``product_id`` and ``pred``.
    thrshold : float, default 0.22
        A row is kept when its ``pred`` is strictly greater than this value.
        The default is a guess and should be tuned with cross-validation.

    Returns
    -------
    pandas.DataFrame
        Columns ``order_id`` and ``products``. Orders with at least one kept
        product come first (in order of first appearance), followed by the
        remaining orders, whose ``products`` value is the string ``'None'``.
        An empty input yields an empty frame with the same two columns.
    """
    # Accumulate ids in lists and join once per order instead of growing a
    # string; an explicit setdefault replaces the former bare ``except:``,
    # which also hid unrelated errors such as a missing column.
    kept = dict()
    for row in df_test.itertuples():
        if row.pred > thrshold:
            kept.setdefault(row.order_id, []).append(str(row.product_id))

    d = dict((order, ' '.join(items)) for order, items in kept.items())

    # Every order must appear in the result, even with nothing selected.
    for order in df_test.order_id:
        if order not in d:
            d[order] = 'None'

    # Building the frame from two explicit columns also works when ``d`` is
    # empty, where from_dict(...) followed by a column rename used to fail.
    sub = pd.DataFrame({'order_id': list(d.keys()),
                        'products': list(d.values())},
                       columns=['order_id', 'products'])
    return sub

In [5]:
# Split the target column off the feature frame.
# The guard makes this cell safe to re-run: once 'labels' has been removed,
# a second execution keeps the existing `labels` array instead of raising
# KeyError. The array keeps the column's own dtype; the previous
# `dtype=pd.Series` is not a real dtype and made NumPy fall back to `object`.
if 'labels' in df_train.columns:
    labels = np.asarray(df_train['labels'])
    df_train = df_train.drop(['labels'], axis=1)

# Feature columns fed to the models.
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last'] # 'dow', 'UP_same_dow_as_last_order'



#lgb.plot_importance(bst, figsize=(9,20))

In [6]:
def fscore(df,bst,alpha):
    """Return the mean per-order F1 of ``bst`` on ``df`` at threshold ``alpha``.

    Side effect: writes the model scores into ``df['pred']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``f_to_use`` columns plus whatever
        ``get_pred_results`` reads (``order_id``, ``product_id``).
    bst : object
        Model whose ``predict`` accepts ``df[f_to_use]`` directly.
    alpha : float
        Threshold forwarded to ``get_pred_results``.

    Returns
    -------
    float
        Mean of the ``f1`` values over all merged orders.
    """
    df['pred'] = bst.predict(df[f_to_use])
    predicted_orders = get_pred_results(df, thrshold=alpha)
    # Join the predicted strings with the reference frame on the order id.
    merged = pd.merge(predicted_orders, df_temp, on=['order_id'])
    # itertuples() puts the index at position 0, so [2] is 'products' and [3]
    # is the first df_temp column after order_id (presumably the true basket
    # string; verify against how df_temp is built).
    # NOTE(review): eval_fun is declared as (labels, preds) but receives the
    # predictions first here; precision and recall are therefore swapped, but
    # F1 is symmetric so the returned mean is the same.
    scores = [eval_fun(row[2], row[3]) for row in merged.itertuples()]
    score_table = pd.DataFrame(np.array(scores),
                               columns=['precision', 'recall', 'f1'])
    return score_table['f1'].mean()

In [7]:
def fscore_xgb(df,bst,alpha):
    """Return the mean per-order F1 of an XGBoost booster at threshold ``alpha``.

    Same procedure as ``fscore``, except that the features are wrapped in an
    ``xgb.DMatrix`` before prediction. Side effect: writes the model scores
    into ``df['pred']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``f_to_use`` columns plus whatever
        ``get_pred_results`` reads (``order_id``, ``product_id``).
    bst : object
        Model whose ``predict`` accepts an ``xgb.DMatrix``.
    alpha : float
        Threshold forwarded to ``get_pred_results``.

    Returns
    -------
    float
        Mean of the ``f1`` values over all merged orders.
    """
    feature_matrix = xgb.DMatrix(df[f_to_use])
    df['pred'] = bst.predict(feature_matrix)
    predicted_orders = get_pred_results(df, thrshold=alpha)
    # Join the predicted strings with the reference frame on the order id.
    merged = pd.merge(predicted_orders, df_temp, on=['order_id'])
    # itertuples() puts the index at position 0, so [2] is 'products' and [3]
    # is the first df_temp column after order_id (presumably the true basket
    # string; verify against how df_temp is built).
    # NOTE(review): eval_fun is declared as (labels, preds) but receives the
    # predictions first here; F1 is symmetric so the mean is unaffected.
    scores = [eval_fun(row[2], row[3]) for row in merged.itertuples()]
    score_table = pd.DataFrame(np.array(scores),
                               columns=['precision', 'recall', 'f1'])
    return score_table['f1'].mean()

In [8]:
import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Start with the simplest scheme: plain k-fold.
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 98
kf=KFold(n_splits=3)    # number of folds
list_f1=[]              # per-fold F1 on the training part
list_f2=[]              # per-fold F1 on the held-out part
num=1
#clf=LGBMClassifier(objective='binary', boosting_type='gbdt')
# Cross-validation is done by hand: the F1 score needs the per-order merge
# with df_temp, which the stock sklearn scoring helpers cannot express.
import timeit
start=timeit.default_timer()

for train_index,test_index in kf.split(df_train, labels):

    # KFold yields *positional* indices. Select both parts with .iloc so the
    # feature rows always line up with labels[train_index]/labels[test_index];
    # the previous df_train.drop(test_index) treated the positions as index
    # labels, which is only correct when df_train has a default RangeIndex.
    X_train=df_train.iloc[train_index]
    X_test=df_train.iloc[test_index]
    y_train,y_test=labels[train_index],labels[test_index]
    d_train = lgb.Dataset(X_train[f_to_use],
                      label=y_train,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'
    bst = lgb.train(params, d_train, ROUNDS)
    a=fscore(X_train,bst,0.22)
    b=fscore(X_test,bst,0.22)
    list_f1.append(a)
    list_f2.append(b)
    print('* {}: train:{}, test:{}'.format(num,a,b))
    num+=1

print('ALL:train:{} test:{}'.format(np.mean(list_f1),np.mean(list_f2)))
end = timeit.default_timer()
print('cost time:'+str(end-start))

* 1: train:0.3864333011521617, test:0.3852574104797039
* 2: train:0.3879747943099971, test:0.38371769833014324
* 3: train:0.3883149779286648, test:0.38347921830055154
ALL: test ALL： 0.387574357797
cost time:165.48224068095442


In [27]:
import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Start with the simplest scheme: plain k-fold.
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
params_xgb ={
  "objective"  : "reg:logistic",
  "eval_metric"   : "logloss",
  "eta"    :0.1,
  "max_depth"   : 9,
  "min_child_weight"  : 9,
  "gamma"             : 0.70,
  "subsample"          : 0.78,
  "colsample_bytree"    : 0.95,
  "alpha"             : 2e-05,
  "lambda"            : 10
        }
kf=KFold(n_splits=3)    # number of folds
list_f1=[]              # per-fold F1 on the training part
list_f2=[]              # per-fold F1 on the held-out part
num=1

#clf=LGBMClassifier(objective='binary', boosting_type='gbdt')
# Cross-validation is done by hand: the F1 score needs the per-order merge
# with df_temp, which the stock sklearn scoring helpers cannot express.
import timeit
start=timeit.default_timer()
print('start cv :-) long time```')
for train_index,test_index in kf.split(df_train, labels):

    # KFold yields *positional* indices. Select both parts with .iloc so the
    # feature rows always line up with labels[train_index]/labels[test_index];
    # the previous df_train.drop(test_index) treated the positions as index
    # labels, which is only correct when df_train has a default RangeIndex.
    X_train=df_train.iloc[train_index]
    X_test=df_train.iloc[test_index]
    y_train,y_test=labels[train_index],labels[test_index]
    d_train = xgb.DMatrix(X_train[f_to_use],
                      label=y_train)

    # ROUNDS is the boosting-round count defined in the LightGBM CV cell.
    bst = xgb.train(params_xgb, d_train, ROUNDS)
    a=fscore_xgb(X_train,bst,0.22)
    b=fscore_xgb(X_test,bst,0.22)
    list_f1.append(a)
    list_f2.append(b)
    print('* {}: train:{}, test:{}'.format(num,a,b))
    num+=1

print('ALL:train:{} test:{}'.format(np.mean(list_f1),np.mean(list_f2)))
end = timeit.default_timer()
print('cost time:'+str(end-start))

start cv :-) long time```
* 1: train:0.3898881358325812, test:0.3859023167431283
* 2: train:0.39209670668587865, test:0.3845590343301159
* 3: train:0.39179431983115826, test:0.38377544466027075
ALL:train:0.39125972078320603 test:0.3847455985778383
cost time:802.6681804358959


In [33]:
# Drop the per-fold objects left over from cross-validation, then ask the
# garbage collector to reclaim them (the cell displays the collect() count).
del d_train, X_test, X_train, y_train, y_test
gc.collect()

529

In [28]:
### build candidates list for test ###

# The test-set feature frame was prepared earlier and pickled.
# NOTE(review): only unpickle files produced by a trusted run.
df_test = pd.read_pickle('df_test')
#df_test, _ = features(test_orders)

#clf.fit(df_train[f_to_use],labels)
# Wrap the test features and the full training frame for XGBoost.
d_d = xgb.DMatrix(df_test[f_to_use])
d_train = xgb.DMatrix(df_train[f_to_use], label=labels)

# Refit on all training rows with the CV settings, then score the test rows.
bst = xgb.train(params_xgb, d_train, ROUNDS)
df_test['pred'] = bst.predict(d_d)

# One row per order with the products whose score passes the threshold.
sub = get_pred_results(df_test, 0.22)
#sub.to_csv('sub.csv', index=False)
from datetime import datetime
now = datetime.now()
# File name carries the run date plus the (unpadded) hour and minute.
out_name = 'xgb_results_{}.{}.{}.csv'.format(
    str(now.date()), str(now.hour), str(now.minute))
sub.to_csv(out_name, index = False)