In [1]:
import numpy as np
np.random.seed(1337)
import pandas as pd
import lightgbm as lgb
import gc

In [2]:
products = pd.read_pickle('products')
priors = pd.read_pickle('priors')
users = pd.read_pickle('users')
userXproduct = pd.read_pickle('userXproduct')
df_temp = pd.read_pickle('df_temp')
df_train = pd.read_pickle('df_train')

In [3]:
def eval_fun(labels, preds):
    labels = labels.split(' ')
    preds = preds.split(' ')
    rr = (np.intersect1d(labels, preds))
    precision = np.float(len(rr)) / len(preds)
    recall = np.float(len(rr)) / len(labels)
    try:
        f1 = 2 * precision * recall / (precision + recall)
    except ZeroDivisionError:
        return (precision, recall, 0.0)
    return (precision, recall, f1)

# 交叉验证实验

In [4]:
#一些辅助函数 为了交叉验证 需要把同一订单号的 物品 合为一个
def get_liststr(df_test):
    n=1
    for row in df_test:
        if n==1:
            temp=str(row)
            n=0
             
        else:
                temp += ' ' + str(row)
    return  temp

def get_liststr1(df_test):
    n=1
    for row in df_test.split(' '):
        if n==1:
            temp=row
            n=0
             
        else:
                temp += ' ' + row
    return  temp
#把预测结果 通过阈值 挑选出来 放入一个字符串中 ‘product1 2 3 4···’
def get_pred_results(df_test,thrshold=0.22):
    TRESHOLD = thrshold  # guess, should be tuned with crossval on a subset of train data

    d = dict()
    for row in df_test.itertuples():
        if row.pred > TRESHOLD:
            try:
                d[row.order_id] += ' ' + str(row.product_id)
            except:
                d[row.order_id] = str(row.product_id)

    for order in df_test.order_id:
        if order not in d:
            d[order] = 'None'

    sub = pd.DataFrame.from_dict(d, orient='index')
    sub.reset_index(inplace=True)
    sub.columns = ['order_id', 'products']
    return sub

In [5]:
labels = np.array(df_train['labels'],dtype=pd.Series)
df_train.drop(['labels'],axis=1,inplace=True)

f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', #'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last'] # 'dow', 'UP_same_dow_as_last_order'



#lgb.plot_importance(bst, figsize=(9,20))

In [None]:
def fscore(df,bst,alpha):
    df['pred'] = bst.predict(df[f_to_use])
    train_pred=get_pred_results(df,thrshold=alpha)
    #合表
    train_pred1=pd.merge(train_pred,df_temp,on=['order_id'])
    #求F1结果表
    res = list()
    for entry in train_pred1.itertuples():
        res.append(eval_fun(entry[2], entry[3]))
    res = pd.DataFrame(np.array(res), columns=['precision', 'recall', 'f1'])
    return res['f1'].mean()

In [None]:
def fscore_xgb(df,bst,alpha):
    d_d=xgb.DMatrix(df[f_to_use])
    df['pred'] = bst.predict(d_d)
    train_pred=get_pred_results(df,thrshold=alpha)
    #合表
    train_pred1=pd.merge(train_pred,df_temp,on=['order_id'])
    #求F1结果表
    res = list()
    for entry in train_pred1.itertuples():
        res.append(eval_fun(entry[2], entry[3]))
    res = pd.DataFrame(np.array(res), columns=['precision', 'recall', 'f1'])
    return res['f1'].mean()

In [6]:
def fscore_nn(data, pred, model, alpha):
    data['pred'] = pred
    data_pred = get_pred_results(data, thrshold=alpha)
    # 合表
    data_pred1 = pd.merge(data_pred, df_temp, on=['order_id'])
    # 求F1结果表
    res = list()
    for entry in data_pred1.itertuples():
        res.append(eval_fun(entry[2], entry[3]))
    res = pd.DataFrame(np.array(res), columns=['precision', 'recall', 'f1'])
    return res["precision"].mean(), res['recall'].mean(), res['f1'].mean()

In [None]:
import warnings
warnings.filterwarnings('ignore')
#先使用最简单的k-Fold
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 98
kf=KFold(n_splits=3)    # 定义分成几个组
list_f1=[]
list_f2=[]
num=1
#clf=LGBMClassifier(objective='binary', boosting_type='gbdt')
#决定采用手动cv  因为需要了利用合表才能得到F1 传统方法不可以 数组合表 太可怕···
#把数组变成切边的形式  即可
import timeit
start=timeit.default_timer()

for train_index,test_index in kf.split(df_train, labels):
    
    #train_max=train_index.max()
    #train_min=train_index.min()
    test_max=test_index.max()+1
    test_min=test_index.min()
    X_test=df_train[test_min:test_max]
    X_train=df_train.drop(test_index)
    #X_train,X_test=data_train[train_index],data_train[test_index]
    y_train,y_test=labels[train_index],labels[test_index]   
    d_train = lgb.Dataset(X_train[f_to_use],
                      label=y_train,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'
    
    
    bst = lgb.train(params, d_train, ROUNDS, early_stopping_rounds=50)
    a=fscore(X_train,bst,0.22)
    b=fscore(X_test,bst,0.22)
    list_f1.append(a)
    list_f2.append(b)
    print('* {}: train:{}, test:{}'.format(num,a,b))
    num+=1
    
print('ALL:train:{} test:{}'.format(np.mean(list_f1),np.mean(list_f2)))
end = timeit.default_timer()
print('cost time:'+str(end-start))

In [None]:
import warnings
warnings.filterwarnings('ignore')
#先使用最简单的k-Fold
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
params_xgb ={
  "objective"  : "reg:logistic",
  "eval_metric"   : "logloss",
  "eta"    :0.1,
  "max_depth"   : 7,
  "min_child_weight"  : 3,
  "gamma"             : 0.70,
  "subsample"          : 0.78,
  "colsample_bytree"    : 0.95,
  "alpha"             : 2e-05,
  "lambda"            : 10
        }
kf=KFold(n_splits=3)    # 定义分成几个组
list_f1=[]
list_f2=[]
num=1

#clf=LGBMClassifier(objective='binary', boosting_type='gbdt')
#决定采用手动cv  因为需要了利用合表才能得到F1 传统方法不可以 数组合表 太可怕···
#把数组变成切边的形式  即可
import timeit
start=timeit.default_timer()
print('start cv :-) long time```')
count = 0

for train_index,test_index in kf.split(df_train, labels):    
    test_max=test_index.max()+1
    test_min=test_index.min()
    X_test=df_train[test_min:test_max]
    X_train=df_train.drop(test_index)
    y_train,y_test=labels[train_index],labels[test_index]   
    d_train = xgb.DMatrix(X_train[f_to_use],
                      label=y_train)
    
    ROUNDS = 98
    bst = xgb.train(params_xgb, d_train, ROUNDS)
    a=fscore_xgb(X_train,bst,0.22)
    b=fscore_xgb(X_test,bst,0.22)
    list_f1.append(a)
    list_f2.append(b)
    print('* {}: train:{}, test:{}'.format(num,a,b))
    num+=1
    count += 1
    if count >= 1:
        break
                                       
    
print('ALL:train:{} test:{}'.format(np.mean(list_f1),np.mean(list_f2)))
end = timeit.default_timer()
print('cost time:'+str(end-start))

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn import cross_validation
from sklearn import preprocessing

train, test, train_labels, test_labels = cross_validation.train_test_split(df_train, labels, test_size=0.3, random_state=0)

scalerX = preprocessing.MinMaxScaler(feature_range=(0, 1))
train_x = scalerX.fit_transform(train[f_to_use])
test_x = scalerX.transform(test[f_to_use])

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX TITAN X (CNMeM is enabled with initial size: 20.0% of memory, cuDNN 5005)


In [8]:
def nn_model():
    model = Sequential()
    model.add(Dense(128, input_dim=18, activation='relu'))
    model.add(Dense(256, input_dim=128, activation='relu'))
    model.add(Dense(64, input_dim=128, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1, activation='sigmoid'))

    from keras import backend as K
    def weight_crossentropy(y_true, y_pred):
        return K.mean(-(y_true * K.log(y_pred)*1.8 + (1.0 - y_true) * K.log(1.0 - y_pred)*0.2), axis=-1)

    model.compile(loss=weight_crossentropy, optimizer="rmsprop", metrics=["accuracy"])
    return model

In [9]:
model = nn_model()
model.fit(train_x, train_labels, epochs=15, verbose=1, validation_split=0.1, batch_size=5000, shuffle=True)
pred_prob = model.predict_proba(test_x, batch_size=5000)

Train on 5339035 samples, validate on 593227 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

In [10]:
print(fscore_nn(test, pred_prob, model, 0.5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(0.2093073525457366, 0.25989632542949426, 0.2074950323576962)


In [11]:
del train
del test
del train_x
del test_x
del train_labels
del test_labels
gc.collect()

107

In [None]:
### build candidates list for test ###

#前面搞好了 
df_test = pd.read_pickle('df_test')
#df_test, _ = features(test_orders)

#clf.fit(df_train[f_to_use],labels)
d_train = xgb.DMatrix(df_train[f_to_use],label=labels)
d_d=xgb.DMatrix(df_test[f_to_use])   

bst = xgb.train(params_xgb, d_train, ROUNDS)
df_test['pred'] = bst.predict(d_d)

sub=get_pred_results(df_test,0.22)
#sub.to_csv('sub.csv', index=False)
from datetime import datetime
now = datetime.now()
sub.to_csv('xgb_results_{}.{}.{}.csv'.format(
    str(now.date()),
    str(now.hour),
    str(now.minute)
), index = False)

In [9]:
df_test = pd.read_pickle('df_test')
#df_test, _ = features(test_orders)

train_x = scalerX.transform(df_train[f_to_use])
test_x = scalerX.transform(df_test[f_to_use])

model = nn_model()
model.fit(train_x, labels, epochs=20, verbose=1, batch_size=5000, shuffle=True)

df_test['pred'] = model.predict_proba(test_x,  batch_size=5000)
sub=get_pred_results(df_test,0.5)

#sub.to_csv('sub.csv', index=False)
from datetime import datetime
now = datetime.now()
sub.to_csv('nn_results_{}.{}.{}.csv'.format(
    str(now.date()),
    str(now.hour),
    str(now.minute)
), index = False)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20