In [7]:
# 用户-商户相关特征，9个，从训练样本date,date_received提取特征： 
#         user_merchant_buy_total # 用户在商户消费次数
#         user_merchant_received # 用户领取商户优惠券次数
#         user_merchant_buy_use_coupon # 用户在商户使用优惠券消费次数
#         user_merchant_any # 用户在商户的所有消费次数
#         user_merchant_buy_common # 用户在商户普通消费次数
#         user_merchant_coupon_transfer_rate # 用户对商户的优惠券转化率
#         user_merchant_coupon_buy_rate # 用户对商户使用优惠券消费占总消费比例
#         user_merchant_rate # 用户对商户消费占总交互比例
#         user_merchant_common_buy_rate # 用户对商户普通消费占总消费比例

In [8]:
import pandas as pd

off_train = pd.read_csv('data/ccf_offline_stage1_train.csv',parse_dates=["Date"],header=0)
off_train.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received','date']

off_test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv',header=0)
off_test.columns = ['user_id','merchant_id','coupon_id','discount_rate','distance','date_received']

on_train = pd.read_csv('data/ccf_online_stage1_train.csv',parse_dates=["Date"],header=0)
on_train.columns = ['user_id','merchant_id','action','coupon_id','discount_rate','date_received','date']

# 【dataset3】20160701~20160731 (113640)   【feature3】20160315~20160630  （测试集）
dataset3 = off_test.dropna(how='any')
feature3 = off_train[((off_train.date >= '20160315') & (off_train.date <= '20160630'))
                   | ((off_train.date == 'null') & (off_train.date_received >= '20160315') 
                      &  (off_train.date_received <= '20160630'))
                   ].dropna(how='any')
# 【dataset2】20160515~20160615 (258446)   【feature2】20160201~20160514  （训练集2）
dataset2 = off_train[((off_train.date_received >= '20160515') & (off_train.date_received <= '20160615'))].dropna(how='any')
feature2 = off_train[((off_train.date >= '20160201') & (off_train.date <= '20160514'))
                   | ((off_train.date == 'null') & (off_train.date_received >= '20160201') 
                      &  (off_train.date_received <= '20160514'))
                   ].dropna(how='any')
# 【dataset1】20160414~20160514 (138303)   【feature1】20160101~20160413  （训练集1）
dataset1 = off_train[((off_train.date_received >= '20160414') & (off_train.date_received <= '20160514'))].dropna(how='any')
feature1 = off_train[((off_train.date >= '20160101') & (off_train.date <= '20160413'))
                   | ((off_train.date == 'null') & (off_train.date_received >= '20160101') 
                      &  (off_train.date_received <= '20160413'))
                   ].dropna(how='any')

In [9]:
from datetime import date
import numpy as np

In [10]:
def userMerchantRelatedFeature(feature,file_name):
    all_user_merchant = feature[['user_id', 'merchant_id']]
    all_user_merchant.drop_duplicates(inplace=True)

    # user_merchant_buy_total # 用户在商户消费次数
    t = feature[['user_id', 'merchant_id','date']]
    t = t[t.date != 'null'][['user_id','merchant_id']]
    t['user_merchant_buy_total'] = 1
    t = t.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
    t.drop_duplicates(inplace=True)
    t.ix[:5]

    #  user_merchant_received # 用户领取商户优惠券次数
    t1 = feature[['user_id', 'merchant_id','coupon_id']]
    t1 = t1[t1.coupon_id != 'null'][['user_id', 'merchant_id']]
    t1['user_merchant_received'] = 1
    t1 = t1.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
    t1.drop_duplicates(inplace=True)
    t1.ix[:5]

    # user_merchant_buy_use_coupon # 用户在商户使用优惠券消费次数
    t2 = feature[['user_id', 'merchant_id', 'date', 'date_received']]
    t2 = t2[(t2.date != 'null') & (t2.date_received != 'null')][['user_id', 'merchant_id']]
    t2['user_merchant_buy_use_coupon'] = 1
    t2 = t2.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
    t2.drop_duplicates(inplace=True)
    t2.ix[:5]

    # user_merchant_any # 用户在商户的所有消费次数
    t3 = feature[['user_id', 'merchant_id']]
    t3['user_merchant_any'] = 1
    t3 = t3.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
    t3.drop_duplicates(inplace=True)
    t3.ix[:5]

    # user_merchant_buy_common # 用户在商户普通消费次数
    t4 = feature[['user_id', 'merchant_id', 'date', 'coupon_id']]
    t4 = t4[(t4.date != 'null') & (t4.coupon_id == 'null')][['user_id', 'merchant_id']]
    t4['user_merchant_buy_common'] = 1
    t4 = t4.groupby(['user_id', 'merchant_id']).agg('sum').reset_index()
    t4.drop_duplicates(inplace=True)
    t4.ix[:5]   

    # 合并数据
    user_merchant3 = pd.merge(all_user_merchant, t, on=['user_id', 'merchant_id'], how='left')
    user_merchant3 = pd.merge(user_merchant3, t1, on=['user_id', 'merchant_id'], how='left')
    user_merchant3 = pd.merge(user_merchant3, t2, on=['user_id', 'merchant_id'], how='left')
    user_merchant3 = pd.merge(user_merchant3, t3, on=['user_id', 'merchant_id'], how='left')
    user_merchant3 = pd.merge(user_merchant3, t4, on=['user_id', 'merchant_id'], how='left')

    user_merchant3.user_merchant_buy_use_coupon = user_merchant3.user_merchant_buy_use_coupon.replace(np.nan, 0)
    user_merchant3.user_merchant_buy_common = user_merchant3.user_merchant_buy_common.replace(np.nan, 0)
    user_merchant3.ix[:5]

    # user_merchant_coupon_transfer_rate # 用户对商户的优惠券转化率 
    user_merchant3['user_merchant_coupon_transfer_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype(
            'float') / user_merchant3.user_merchant_received.astype('float')
    user_merchant3.ix[:5]

    # user_merchant_coupon_buy_rate # 用户对商户使用优惠券消费占总消费比例
    user_merchant3['user_merchant_coupon_buy_rate'] = user_merchant3.user_merchant_buy_use_coupon.astype(
            'float') / user_merchant3.user_merchant_buy_total.astype('float')
    user_merchant3.ix[:5]

    # user_merchant_rate # 用户对商户消费占总交互比例
    user_merchant3['user_merchant_rate'] = user_merchant3.user_merchant_buy_total.astype(
            'float') / user_merchant3.user_merchant_any.astype('float')
    user_merchant3.ix[:5]

    # user_merchant_common_buy_rate # 用户对商户普通消费占总消费比例
    user_merchant3['user_merchant_common_buy_rate'] = user_merchant3.user_merchant_buy_common.astype(
            'float') / user_merchant3.user_merchant_buy_total.astype('float')
    user_merchant3.ix[:5]

    user_merchant3 = user_merchant3.replace(np.nan, 0)
    user_merchant3.ix[:5]
    
    user_merchant3.to_csv(file_name, index=None)
    return user_merchant3

In [11]:
user_merchant3 = userMerchantRelatedFeature(feature3,'data/user_merchant3.csv')
user_merchant3.ix[:5]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,user_id,merchant_id,user_merchant_buy_total,user_merchant_received,user_merchant_buy_use_coupon,user_merchant_any,user_merchant_buy_common,user_merchant_coupon_transfer_rate,user_merchant_coupon_buy_rate,user_merchant_rate,user_merchant_common_buy_rate
0,1439408,4663,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
1,1439408,2632,2.0,3.0,1.0,4,1.0,0.333333,0.5,0.5,0.5
2,1832624,3381,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,2029232,450,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
4,2029232,6459,2.0,1.0,0.0,3,2.0,0.0,0.0,0.666667,1.0
5,2747744,6901,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0


In [12]:
user_merchant2 = userMerchantRelatedFeature(feature2,'data/user_merchant2.csv')
user_merchant2.ix[:5]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,user_id,merchant_id,user_merchant_buy_total,user_merchant_received,user_merchant_buy_use_coupon,user_merchant_any,user_merchant_buy_common,user_merchant_coupon_transfer_rate,user_merchant_coupon_buy_rate,user_merchant_rate,user_merchant_common_buy_rate
0,1439408,2632,1.0,2.0,0.0,3,1.0,0.0,0.0,0.333333,1.0
1,1832624,3381,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
2,73611,2099,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,163606,1569,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
4,94107,3381,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
5,253750,8390,1.0,1.0,0.0,2,1.0,0.0,0.0,0.5,1.0


In [13]:
user_merchant1 = userMerchantRelatedFeature(feature1,'data/user_merchant1.csv')
user_merchant1.ix[:5]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return func(*args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,user_id,merchant_id,user_merchant_buy_total,user_merchant_received,user_merchant_buy_use_coupon,user_merchant_any,user_merchant_buy_common,user_merchant_coupon_transfer_rate,user_merchant_coupon_buy_rate,user_merchant_rate,user_merchant_common_buy_rate
0,1439408,2632,1.0,2.0,0.0,3,1.0,0.0,0.0,0.333333,1.0
1,2029232,3381,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
2,2223968,3381,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
3,73611,2099,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
4,3273056,4833,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
5,94107,3381,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0
