In [1]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import time
from datetime import datetime
from datetime import timedelta
import pandas as pd
import pickle
import os
import math
import numpy as np

action_1_path = "./data/JData_Action_201602.csv"
action_2_path = "./data/JData_Action_201603.csv"
action_3_path = "./data/JData_Action_201604.csv"
comment_path = "./data/JData_Comment.csv"
product_path = "./data/JData_Product.csv"
user_path = "./data/JData_User.csv"

comment_date = ["2016-02-01", "2016-02-08", "2016-02-15", "2016-02-22", "2016-02-29", "2016-03-07", "2016-03-14",
                "2016-03-21", "2016-03-28",
                "2016-04-04", "2016-04-11", "2016-04-15"]


def convert_age(age_str):
    if age_str == u'-1':
        return 0
    elif age_str == u'15岁以下':
        return 1
    elif age_str == u'16-25岁':
        return 2
    elif age_str == u'26-35岁':
        return 3
    elif age_str == u'36-45岁':
        return 4
    elif age_str == u'46-55岁':
        return 5
    elif age_str == u'56岁以上':
        return 6
    else:
        return -1
#用户的基本信息
def get_basic_user_feat():
    dump_path = './data/cache/basic_user.csv'
    if os.path.exists(dump_path):
        user =pd.read_csv(dump_path)
    else:
        user = pd.read_csv(user_path, encoding='gbk')
        user['age'] = user['age'].map(convert_age)
        age_df = pd.get_dummies(user["age"], prefix="age")
        sex_df = pd.get_dummies(user["sex"], prefix="sex")
        user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
        user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
        user.to_csv(dump_path,index=False)
    return user

#商品的基本信息
def get_basic_product_feat():
    dump_path = './data/cache/basic_product.csv'
    if os.path.exists(dump_path):
        product = pd.read_csv(dump_path)
    else:
        product = pd.read_csv(product_path)
        attr1_df = pd.get_dummies(product["a1"], prefix="a1")
        attr2_df = pd.get_dummies(product["a2"], prefix="a2")
        attr3_df = pd.get_dummies(product["a3"], prefix="a3")
        product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1)
        product.to_csv(dump_path,index=False)
    return product


def get_actions_1():
    action = pd.read_csv(action_1_path)
    return action

def get_actions_2():
    action2 = pd.read_csv(action_2_path)
    return action2

def get_actions_3():
    action3 = pd.read_csv(action_3_path)
    return action3

#行为数据
def get_actions(start_date, end_date):
    """

    :param start_date:
    :param end_date:
    :return: actions: pd.Dataframe
    """
    dump_path = './data/cache/all_action_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        action_1 = get_actions_1()
        action_1 = action_1[(action_1.time >= start_date) & (action_1.time < end_date)]
        action_2 = get_actions_2()
        action_2 = action_2[(action_2.time >= start_date) & (action_2.time < end_date)]
        actions=pd.concat([action_1,action_2])
        action_3 = get_actions_3()
        action_3 = action_3[(action_3.time >= start_date) & (action_3.time < end_date)]
        actions = pd.concat([actions, action_3]) # type: pd.DataFrame
        actions=actions[(actions.time>=start_date)&(actions.time<end_date)]
        actions.to_csv(dump_path,index=False)
    return actions


#所有行为的总和
def get_action_feat(start_date, end_date):
    dump_path = './data/cache/action_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions =pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[['user_id', 'sku_id', 'type']]
        df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        del actions['type']
        actions.to_csv(dump_path,index=False)
    return actions

#行为按时间衰减
def get_accumulate_action_feat(start_date, end_date):
    dump_path = './data/cache/action_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame
        actions.columns = ['user_id', 'sku_id', 'time', 'model_id', 'type',
                           'cate', 'brand','action_1', 'action_2', 'action_3',
                           'action_4', 'action_5', 'action_6']
        #近期行为按时间衰减
        actions['weights'] = actions['time'].map(lambda x: datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        #actions['weights'] = time.strptime(end_date, '%Y-%m-%d') - actions['datetime']
        actions['weights'] = actions['weights'].map(lambda x: math.exp(-x.days))
#         print actions
        actions['action_1'] = actions['action_1'] * actions['weights']
        actions['action_2'] = actions['action_2'] * actions['weights']
        actions['action_3'] = actions['action_3'] * actions['weights']
        actions['action_4'] = actions['action_4'] * actions['weights']
        actions['action_5'] = actions['action_5'] * actions['weights']
        actions['action_6'] = actions['action_6'] * actions['weights']
        del actions['model_id']
        del actions['type']
        del actions['time']
        del actions['weights']
        actions = actions.groupby(['user_id', 'sku_id', 'cate', 'brand'], as_index=False).sum()
        actions.to_csv(dump_path,index=False)
    return actions
#print get_accumulate_action_feat('2016-03-11','2016-04-11')
#评论数据
def get_comments_product_feat(start_date, end_date):
    dump_path = './data/cache/comments_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        comments =pd.read_csv(dump_path)
    else:
        comments = pd.read_csv(comment_path)
        comment_date_end = end_date
        comment_date_begin = comment_date[0]
        for date in reversed(comment_date):
            if date < comment_date_end:
                comment_date_begin = date
                break
        comments = comments[(comments.dt >= comment_date_begin) & (comments.dt < comment_date_end)]
        df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
        comments = pd.concat([comments, df], axis=1) # type: pd.DataFrame
        #del comments['dt']
        #del comments['comment_num']
        comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate', 'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]
        comments.to_csv(dump_path,index=False)
    return comments

#用户的行为转化率
def get_accumulate_user_feat(start_date, end_date):
    feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio',
               'user_action_5_ratio', 'user_action_6_ratio']
    dump_path = './data/cache/user_feat_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['user_id'], df], axis=1)
        actions = actions.groupby(['user_id'], as_index=False).sum()
        actions['user_action_1_ratio'] = actions['action_4'] / actions['action_1']
        actions['user_action_2_ratio'] = actions['action_4'] / actions['action_2']
        actions['user_action_3_ratio'] = actions['action_4'] / actions['action_3']
        actions['user_action_5_ratio'] = actions['action_4'] / actions['action_5']
        actions['user_action_6_ratio'] = actions['action_4'] / actions['action_6']
        actions = actions[feature]
        actions.to_csv(dump_path,index=False)
    return actions
#用户平均访问间隔
def get_action_user_feat6(start_date,end_date):
    dump_path = './data/cache/user_feat6_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:

        df=get_actions(start_date,end_date)[['user_id','time']]
        df['user_id']=df['user_id'].astype('int')
        df['time']=df['time'].map(lambda x:x.split(' ')[0])
        df=df.drop_duplicates(['user_id','time'],keep='first')
        df['time']=df['time'].map(lambda x:datetime.strptime(x ,'%Y-%m-%d'))
        actions=df.groupby('user_id',as_index=False).agg(lambda x:x['time'].diff().mean())
        actions['avg_visit']=actions['time'].dt.days
        del actions['time']
        actions.to_csv(dump_path,index=False)
    return actions
#用户购买频率
def get_action_user_feat7(start_date, end_date):
    dump_path = './data/cache/user_feat7_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions =pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)[['user_id', 'sku_id', 'type']]
        actions = actions[actions['type']==4]
        del actions['type']
        actions = actions.groupby('user_id', as_index=False).count()
        actions['user_id']=actions['user_id'].astype('int')
        actions.columns=['user_id','buy_rate']
        actions['buy_rate']=actions['buy_rate'].map(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
        actions.to_csv(dump_path,index=False)
    return actions
#print get_action_user_feat7('2016-02-01', '2016-04-11')
#产品的行为转化率
def get_accumulate_product_feat(start_date, end_date):
    feature = ['sku_id', 'product_action_1_ratio', 'product_action_2_ratio', 'product_action_3_ratio',
               'product_action_5_ratio', 'product_action_6_ratio']
    dump_path = './data/cache/product_feat_accumulate_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['sku_id'], df], axis=1)
        actions = actions.groupby(['sku_id'], as_index=False).sum()
        actions['product_action_1_ratio'] = actions['action_4'] / actions['action_1']
        actions['product_action_2_ratio'] = actions['action_4'] / actions['action_2']
        actions['product_action_3_ratio'] = actions['action_4'] / actions['action_3']
        actions['product_action_5_ratio'] = actions['action_4'] / actions['action_5']
        actions['product_action_6_ratio'] = actions['action_4'] / actions['action_6']
        actions = actions[feature]
        actions.to_csv(dump_path,index=False)
    return actions
#商品平均访问间隔
def get_action_product_feat7(start_date,end_date):
    dump_path = './data/cache/product_feat7_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        df=get_actions(start_date,end_date)[['sku_id','time']]
        df['time'] = df['time'].map(lambda x: x.split(' ')[0])
        df=df.drop_duplicates(['sku_id','time'],keep='first')
        df['time']=df['time'].map(lambda x:datetime.strptime(x,'%Y-%m-%d'))
        actions=df.groupby('sku_id',as_index=False).agg(lambda x: x['time'].diff().mean())
        actions['avg_visit']=actions['time'].dt.days
        del actions['time']
    return actions
#商品的重复购买率
def get_action_product_feat8(start_date,end_date):
    dump_path = './data/cache/product_feat7_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        df=get_actions(start_date,end_date)[['user_id','sku_id','type']]
        df=df[df['type']==4]#购买的行为
        df=df.groupby(['user_id','sku_id'],as_index=False).count()
        df.columns=['user_id','sku_id','count1']
        df['count1']=df['count1'].map(lambda x: 1 if x>1 else 0)
        grouped=df.groupby(['sku_id'],as_index=False)
        actions=grouped.count()[['sku_id','count1']]
        actions.columns=['sku_id','count']
        re_count=grouped.sum()[['sku_id','count1']]
        re_count.columns=['sku_id','re_count']
        actions=pd.merge(actions,re_count,on='sku_id',how='left')
        re_buy_rate=actions['re_count']/actions['count']
        actions=pd.concat([actions['sku_id'],re_buy_rate],axis=1)
        actions.columns=['sku_id','re_buy_rate']
        actions.to_csv(dump_path,index=False)
    return actions
#print get_action_product_feat8('2016-02-01', '2016-04-11')

#最近K天
def get_action_product_feat91(start_date, end_date):
    dump_path = './data/cache/product_feat9(1)_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions =pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[['user_id', 'sku_id', 'type']]
        df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        actions = actions.groupby('sku_id', as_index=False).sum()
        del actions['type']
        del actions['user_id']
        sku_id=actions['sku_id']
        del actions['sku_id']
        actions=actions.applymap(lambda x: 1 if x>0 else 0)
        actions=pd.concat([sku_id,actions],axis=1)
    return actions

def get_action_product_feat9(start_date,end_date):
    dump_path = './data/cache/product_feat9_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:

        for i in (1,2,3,4,5,7,14,28,56):
            start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            actions = get_action_feat(start_days, end_date)

            if actions is None:
                actions = get_action_feat(start_days, end_date)
            else:
                actions = pd.merge(actions, get_action_feat(start_days, end_date), how='left',
                                   on=['user_id', 'sku_id'])

        pass
    pass


#标签
def get_labels(start_date, end_date):
    dump_path = './data/cache/labels_%s_%s.csv' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_csv(dump_path)
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[actions['type'] == 4]
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        actions['label'] = 1
        actions = actions[['user_id', 'sku_id', 'label']]
        actions.to_csv(dump_path,index=False)
    return actions

#测试集
def make_test_set(train_start_date, train_end_date):
    dump_path = './data/cache/test_set_%s_%s.pkl' % (train_start_date, train_end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path))
    else:
        start_days = "2016-02-01"
        user = get_basic_user_feat()
        product = get_basic_product_feat()
        user_acc = get_accumulate_user_feat(start_days, train_end_date)
        user_feat6=get_action_user_feat6(start_days,train_end_date)
        user_feat7 = get_action_user_feat7(start_days, train_end_date)
        product_acc = get_accumulate_product_feat(start_days, train_end_date)
        product_feat7=get_action_product_feat7(start_days,train_end_date)
        product_feat8 = get_action_product_feat8(start_days, train_end_date)
        comment_acc = get_comments_product_feat(train_start_date, train_end_date)
        #labels = get_labels(test_start_date, test_end_date)


        actions = None
        for i in (1, 2, 3, 5, 7, 10, 15, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_action_feat(start_days, train_end_date)
            else:
                actions = pd.merge(actions, get_action_feat(start_days, train_end_date), how='left',
                                   on=['user_id', 'sku_id'])

        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        actions = pd.merge(actions, user_feat6, how='left', on='user_id')
        actions = pd.merge(actions, user_feat7, how='left', on='user_id')

        actions = pd.merge(actions, product, how='left', on='sku_id')
        actions = pd.merge(actions, product_feat7, how='left', on='sku_id')
        actions = pd.merge(actions, product_feat8, how='left', on='sku_id')
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')

        action = get_accumulate_action_feat(train_start_date, train_end_date)
        actions=pd.merge(actions,action,how='left',on=['user_id','sku_id'])
        #actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])
        actions = actions.fillna(0)
        actions = actions[actions['cate_y'] == 8]

    users = actions[['user_id', 'sku_id']].copy()
    del actions['user_id']
    del actions['sku_id']
    return users, actions
#训练集
def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date):
    dump_path = './data/cache/train_set_%s_%s_%s_%s.pkl' % (train_start_date, train_end_date, test_start_date, test_end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path))
    else:
        start_days = "2016-02-01"
        user = get_basic_user_feat()
        product = get_basic_product_feat()
        user_acc = get_accumulate_user_feat(start_days, train_end_date)
        user_feat6 = get_action_user_feat6(start_days, train_end_date)
        user_feat7= get_action_user_feat7(start_days, train_end_date)

        product_acc = get_accumulate_product_feat(start_days, train_end_date)
        product_feat7 = get_action_product_feat7(start_days, train_end_date)
        product_feat8=get_action_product_feat8(start_days,train_end_date)
        comment_acc = get_comments_product_feat(train_start_date, train_end_date)
        labels = get_labels(test_start_date, test_end_date)


        actions = None
        for i in (1, 2, 3, 5, 7, 10, 15, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_action_feat(start_days, train_end_date)
            else:
                actions = pd.merge(actions, get_action_feat(start_days, train_end_date), how='left',
                                   on=['user_id', 'sku_id'])
        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        actions = pd.merge(actions, user_feat6, how='left', on='user_id')
        actions=pd.merge(actions,user_feat7,how='left',on='user_id')

        actions = pd.merge(actions, product, how='left', on='sku_id')
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, product_feat7, how='left', on='sku_id')
        actions=pd.merge(actions,product_feat8,how='left',on='sku_id')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
        actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])
        action = get_accumulate_action_feat(train_start_date, train_end_date)
        actions=pd.merge(actions,action,how='left',on=['user_id', 'sku_id'])
        actions = actions.fillna(0)

    users = actions[['user_id', 'sku_id']].copy()
    labels = actions['label'].copy()
    del actions['user_id']
    del actions['sku_id']
    del actions['label']

    return users, actions, labels


def report(pred, label):

    actions = label
    result = pred

    # 所有用户商品对
    all_user_item_pair = actions['user_id'].map(str) + '-' + actions['sku_id'].map(str)
    all_user_item_pair = np.array(all_user_item_pair)
    # 所有购买用户
    all_user_set = actions['user_id'].unique()

    # 所有品类中预测购买的用户
    all_user_test_set = result['user_id'].unique()
    all_user_test_item_pair = result['user_id'].map(str) + '-' + result['sku_id'].map(str)
    all_user_test_item_pair = np.array(all_user_test_item_pair)

    # 计算所有用户购买评价指标
    pos, neg = 0,0
    for user_id in all_user_test_set:
        if user_id in all_user_set:
            pos += 1
        else:
            neg += 1
    all_user_acc = 1.0 * pos / ( pos + neg)
    all_user_recall = 1.0 * pos / len(all_user_set)
    print('所有用户中预测购买用户的准确率为 ' + str(all_user_acc))
    print('所有用户中预测购买用户的召回率' + str(all_user_recall))

    pos, neg = 0, 0
    for user_item_pair in all_user_test_item_pair:
        if user_item_pair in all_user_item_pair:
            pos += 1
        else:
            neg += 1
    all_item_acc = 1.0 * pos / ( pos + neg)
    all_item_recall = 1.0 * pos / len(all_user_item_pair)
    print( '所有用户中预测购买商品的准确率为 ' + str(all_item_acc))
    print ('所有用户中预测购买商品的召回率' + str(all_item_recall))
    F11 = 6.0 * all_user_recall * all_user_acc / (5.0 * all_user_recall + all_user_acc)
    F12 = 5.0 * all_item_acc * all_item_recall / (2.0 * all_item_recall + 3 * all_item_acc)
    score = 0.4 * F11 + 0.6 * F12
    print('F11=' + str(F11))
    print ('F12=' + str(F12))
    print ('score=' + str(score))


# train_start_date = '2016-03-10'
# train_end_date = '2016-04-11'
# test_start_date = '2016-04-11'
# test_end_date = '2016-04-16'
# user, action, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
# print( user.head(10))
# print (action.head(10))








In [None]:
train_start_date = '2016-03-10'
train_end_date = '2016-04-11'
test_start_date = '2016-04-11'
test_end_date = '2016-04-16'

sub_start_date = '2016-03-15'
sub_end_date = '2016-04-16'

user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
# #     dtrain=xgb.DMatrix(X_train, label=y_train)
# #     dtest=xgb.DMatrix(X_test, label=y_test)

# #     param = {'learning_rate' : 0.05, 'n_estimators': 1000, 'max_depth': 3,'min_child_weight': 5, 'gamma': 0,
# #               'subsample': 1.0, 'colsample_bytree': 0.8, 'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
# #     num_round = 300
# #     param['nthread'] = 4
#     #param['eval_metric'] = "auc"
#      list(X2_X5_MAPPINGS.keys())
#     plst = param.items()
#     plst =list(param.keys())
# #     d = dict1.copy()
# # d.update(dict2)
#     plst += [('eval_metric', 'logloss')]

#     evallist = [(dtest, 'eval'), (dtrain, 'train')]

#     bst=xgb.train(plst, dtrain, num_round, evallist)
#     sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)
#     sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
#     y = bst.predict(sub_trainning_data)

xgb_1=xgb.XGBClassifier(learning_rate=0.05, n_estimators=1000, max_depth=3,min_child_weight= 5,
                        gamma=0,subsample=1.0, colsample_bytree= 0.8,
                        scale_pos_weight= 1, silent=True,objective= 'binary:logistic',nthread=10)

xgb_1.fit(X_train, y_train,eval_set=[(X_test,y_test)],eval_metric="error",early_stopping_rounds =200,verbose=True)

sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)


y= xgb_1.predict(sub_trainning_data.values)

#     sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)

sub_user_index['label'] = y
pred = sub_user_index[sub_user_index['label'] >= 0.03]
pred = pred[['user_id', 'sku_id']]
pred = pred.groupby('user_id').first().reset_index()
pred['user_id'] = pred['user_id'].astype(int)
pred.to_csv('./data/sub/submission4.csv', index=False, index_label=False)



In [59]:
print(y_test)

[ 0.  0.  0. ...,  0.  0.  0.]


In [48]:
pred.to_csv('./data/sub/submission4.csv', index=False, index_label=False)

In [49]:


from sklearn.model_selection import train_test_split
import xgboost as xgb
 
def xgboost_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
# #     dtrain=xgb.DMatrix(X_train, label=y_train)
# #     dtest=xgb.DMatrix(X_test, label=y_test)

# #     param = {'learning_rate' : 0.05, 'n_estimators': 1000, 'max_depth': 3,'min_child_weight': 5, 'gamma': 0,
# #               'subsample': 1.0, 'colsample_bytree': 0.8, 'scale_pos_weight': 1, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
# #     num_round = 300
# #     param['nthread'] = 4
#     #param['eval_metric'] = "auc"
#      list(X2_X5_MAPPINGS.keys())
#     plst = param.items()
#     plst =list(param.keys())
# #     d = dict1.copy()
# # d.update(dict2)
#     plst += [('eval_metric', 'logloss')]

#     evallist = [(dtest, 'eval'), (dtrain, 'train')]

#     bst=xgb.train(plst, dtrain, num_round, evallist)
#     sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)
#     sub_trainning_data = xgb.DMatrix(sub_trainning_data.values)
#     y = bst.predict(sub_trainning_data)


#     xgb_1=xgb.XGBClassifier(learning_rate=0.05, n_estimators=100, max_depth=3,min_child_weight= 5,
#                             gamma=0,subsample=1.0, colsample_bytree= 0.8,
#                             scale_pos_weight= 1, silent=1,objective= 'binary:logistic',nthread=10)
    xgb_1=xgb.XGBClassifier(learning_rate=0.1, n_estimators=100, max_depth=3,min_child_weight= 5,
                            gamma=0,subsample=1.0, colsample_bytree= 0.8,
                            scale_pos_weight= 1, silent=1,objective= 'binary:logistic',nthread=10)
    
    xgb_1.fit(X_train, y_train)
    
    sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)

    
    y= xgb_1.predict(sub_trainning_data.values)
    
#     sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)
    
    sub_user_index['label'] = y
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission4.csv', index=False, index_label=False)



def xgboost_cv():
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-02-05'
    sub_end_date = '2016-03-05'
    sub_test_start_date = '2016-03-05'
    sub_test_end_date = '2016-03-10'

    user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)

    dtrain=xgb.DMatrix(X_train, label=y_train)
    dtest=xgb.DMatrix(X_test, label=y_test)

    param = {'max_depth': 10, 'eta': 0.05, 'silent': 1, 'objective': 'binary:logistic'}
    num_round = 400
    param['nthread'] = 4
    param['eval_metric'] = "auc"
    plst = param.dict_items
    plst += [('eval_metric', 'logloss')]

    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst=xgb.train( plst, dtrain, num_round, evallist)

    sub_user_index, sub_trainning_data, sub_label = make_train_set(sub_start_date, sub_end_date,
                                                                   sub_test_start_date, sub_test_end_date)

    test = xgb.DMatrix(sub_trainning_data.values)
    y = bst.predict(test)
    pred = sub_user_index.copy()
    y_true = sub_user_index.copy()
    pred['label'] = y
    y_true['label'] = label
    report(pred, y_true)


if __name__ == '__main__':
    #xgboost_cv()
    xgboost_make_submission()


FileNotFoundError: [Errno 2] No such file or directory: './sub/submission4.csv'

In [41]:
train_start_date = '2016-03-10'
train_end_date = '2016-04-11'
test_start_date = '2016-04-11'
test_end_date = '2016-04-16'

sub_start_date = '2016-03-15'
sub_end_date = '2016-04-16'

user_index, training_data, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values, test_size=0.2, random_state=0)
print(X_train)
print(y_train)

[[  1.           0.           0.         ...,   0.           0.           5.        ]
 [  4.           0.           0.         ...,   0.           0.           6.        ]
 [  2.           0.           0.         ...,   0.           0.
    1.00638317]
 ..., 
 [  2.           0.           0.         ...,   0.           0.
    3.01276451]
 [ 14.           0.           0.         ...,   0.           0.
   23.85868555]
 [  2.           0.           0.         ...,   0.           0.           3.        ]]
[ 0.  0.  0. ...,  0.  0.  0.]


In [42]:
sub_user_index, sub_trainning_data = make_test_set(sub_start_date, sub_end_date,)
print(sub_trainning_data.values)
print(sub_trainning_data.values)

         user_id  sku_id
1       200006.0  151327
2       200009.0   36307
3       200009.0   52343
4       200009.0   69209
5       200012.0   39253
6       200015.0    6533
8       200015.0   52343
9       200015.0   64974
14      200025.0   58568
15      200025.0   81462
17      200025.0  123578
18      200025.0  169819
20      200028.0   40336
21      200028.0   59175
22      200028.0  103132
23      200029.0   44854
25      200033.0   21147
26      200033.0   44854
27      200034.0   21457
30      200038.0   47193
33      200040.0    5825
34      200040.0   18031
35      200040.0   24065
36      200040.0   39425
37      200040.0   63006
38      200040.0   79636
39      200040.0  140687
40      200040.0  166707
43      200054.0   27069
46      200054.0   37995
...          ...     ...
103624  305269.0   94944
103625  305269.0   95850
103626  305269.0  133436
103627  305269.0  134084
103628  305269.0  149854
103629  305269.0  160208
103630  305269.0  170870
103636  305287.0    4723


In [46]:
print(X_train)

[[  1.           0.           0.         ...,   0.           0.           5.        ]
 [  4.           0.           0.         ...,   0.           0.           6.        ]
 [  2.           0.           0.         ...,   0.           0.
    1.00638317]
 ..., 
 [  2.           0.           0.         ...,   0.           0.
    3.01276451]
 [ 14.           0.           0.         ...,   0.           0.
   23.85868555]
 [  2.           0.           0.         ...,   0.           0.           3.        ]]


In [45]:

print(sub_trainning_data.values)
print("1111")
print(sub_trainning_data.values)

[[  1.           0.           0.         ...,   0.           0.           0.        ]
 [  1.           0.           0.         ...,   0.           0.           0.        ]
 [  1.           0.           0.         ...,   0.           0.           0.        ]
 ..., 
 [  3.           0.           0.         ...,   0.           1.
   16.00000678]
 [  1.           0.           0.         ...,   0.           0.           3.        ]
 [  1.           0.           0.         ...,   0.           0.           1.        ]]
1111
[[  1.           0.           0.         ...,   0.           0.           0.        ]
 [  1.           0.           0.         ...,   0.           0.           0.        ]
 [  1.           0.           0.         ...,   0.           0.           0.        ]
 ..., 
 [  3.           0.           0.         ...,   0.           1.
   16.00000678]
 [  1.           0.           0.         ...,   0.           0.           3.        ]
 [  1.           0.           0.         ..., 