In [1]:
import pandas as pd

# Load the three raw CSV files in order: the offline training file, the online
# training file and the revised offline test file.
train_off, train_on, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/ccf_offline_stage1_train.csv",
        "data/ccf_online_stage1_train.csv",
        "data/ccf_offline_stage1_test_revised.csv",
    )
)

In [216]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from datetime import datetime

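# Columns of the offline data:
# 1. User_id: user id
# 2. Merchant_id: merchant id
# 3. Coupon_id: coupon id
# 4. Discount_rate: coupon discount, either a rate in [0,1] or "x:y" (spend x, get y off)
# 5. Distance: distance from the user's usual activity area to the nearest store (in units of 0.5 km)
# 6. Date_received: date the coupon was received, yyyymmdd
# 7. Date: date the coupon was used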


# Work on deep copies so that the in-place preprocessing below never modifies
# the frames that were loaded from disk.
train_offline, train_online, testCopy = (
    frame.copy(deep=True) for frame in (train_off, train_on, test)
)


## 数据预处理

In [217]:
# Each training row falls into one of three cases:
#   1. coupon not used:  Date is null     and Coupon_id is not null
#   2. plain purchase:   Date is not null and Coupon_id is null
#      (unrelated to coupons, so these rows are removed)
#   3. coupon used:      Date is not null and Coupon_id is not null
#
# The training set has no explicit label, so one is derived from those cases:
# isUsed = 1 when the coupon was used, isUsed = 0 when it was not.
has_purchase_date = train_offline["Date"].notnull()
has_coupon = train_offline["Coupon_id"].notnull()
plain_purchase = has_purchase_date & ~has_coupon
train_offline.drop(train_offline.loc[plain_purchase].index, inplace=True)

# Recompute the masks on the reduced frame before labelling.
coupon_received = train_offline["Coupon_id"].notnull()
coupon_redeemed = train_offline["Date"].notnull()
train_offline.loc[coupon_received & ~coupon_redeemed, "isUsed"] = 0
train_offline.loc[coupon_received & coupon_redeemed, "isUsed"] = 1


In [218]:
# Preprocess the training data and the test data together: both frames are
# collected in one list so the same feature-engineering loop can run on each.
combine = [train_offline,testCopy]

In [219]:

def moneyOff2Rate(mf):
    """Convert one raw ``Discount_rate`` value to a numeric rate.

    Parameters
    ----------
    mf : str, float or None
        Either a threshold discount written as ``"x:y"``, a plain rate such
        as ``"0.95"`` (string or already numeric), or a missing value
        (``None`` / NaN).

    Returns
    -------
    float
        ``1 - y / x`` for an ``"x:y"`` value, the value itself parsed as a
        float for a plain rate, and NaN for a missing value.

    Raises
    ------
    ValueError
        If a string value cannot be parsed as a number.
    ZeroDivisionError
        If the threshold ``x`` of an ``"x:y"`` value is zero.
    """
    # Missing values are floats (NaN) or None rather than strings, and the
    # `":" in mf` membership test below raises TypeError on them. Pass them
    # through as NaN so the caller can decide how to fill them.
    if mf is None:
        return float("nan")
    if not isinstance(mf, str):
        return float(mf)

    if ":" in mf:
        discount = mf.split(":")
        return 1 - float(discount[1])/float(discount[0])
    return float(mf)

def dateToMonth(date):
    """Return the month number of a ``yyyymmdd`` date value.

    ``date`` is passed through ``int()`` first, so a float such as
    ``20160712.0`` is accepted as well as an int or a digit string. The value
    is then parsed with ``datetime.strptime``, which raises ``ValueError`` for
    anything that is not a valid calendar date in that format.
    """
    compact = str(int(date))
    return datetime.strptime(compact, '%Y%m%d').month

for dataset in combine:
    # (1) Discount_rate: bring every value onto one rate scale.
    #     - "x:y" threshold discounts become (x - y) / x
    #     - values already written as a rate are kept
    #     - missing values are replaced by the column maximum
    # na_action="ignore" leaves missing entries as NaN instead of handing them
    # to moneyOff2Rate, so the fill on the next line can deal with them.
    dataset["Discount_rate"] = dataset["Discount_rate"].map(moneyOff2Rate, na_action="ignore")
    dataset["Discount_rate"] = dataset["Discount_rate"].fillna(dataset["Discount_rate"].max())

    # (2) Distance: missing -> 0.
    dataset["Distance"] = dataset["Distance"].fillna(0)

    # (3) Quantity: how many rows (received coupons) each user has in this
    #     dataset, coded into five classes, since the number of coupons a user
    #     receives may be related to whether the coupon gets used.
    #     value_counts() is mapped directly; no intermediate dict is needed.
    coupons_per_user = dataset["User_id"].map(dataset["User_id"].value_counts())
    maxQuantity = coupons_per_user.max()
    # NOTE(review): the thresholds come from this dataset's own maximum, so
    # train and test are bucketed on different scales. Confirm this is intended.
    #
    # Every mask is evaluated on the raw counts and the codes are written to a
    # separate Series. Recoding the column in place would compare already
    # assigned codes (0-4) against the thresholds, which re-buckets rows when
    # the maximum count is small (with a maximum of 3, code 3 turns into 4).
    quantity = coupons_per_user.copy()
    quantity[coupons_per_user == 1] = 0
    quantity[(coupons_per_user > 1) & (coupons_per_user <= maxQuantity/4)] = 1
    quantity[(coupons_per_user > maxQuantity/4) & (coupons_per_user <= maxQuantity/2)] = 2
    quantity[(coupons_per_user > maxQuantity/2) & (coupons_per_user <= maxQuantity/4 * 3)] = 3
    quantity[(coupons_per_user > maxQuantity/4 * 3) & (coupons_per_user <= maxQuantity)] = 4
    dataset["Quantity"] = quantity

    # (4) Date_received: the date a coupon was received may be related to
    #     whether it is used; only the month is kept in place of the full date.
    dataset["Date_received"] = dataset["Date_received"].apply(dateToMonth)
    
# Drop the columns that are not used as model inputs: the id columns in both
# frames, plus the usage date and the label in the training frame.
X_train = train_offline.drop(columns=["User_id","Merchant_id","Coupon_id","Date","isUsed"])
y_train = train_offline["isUsed"]
X_test = testCopy.drop(columns=["User_id","Merchant_id","Coupon_id"])

In [245]:
# X_train.head()
# Quick look at the test data. A notebook cell only renders its last
# expression, so the X_test.shape line produces no visible output here; the
# rendered table is the raw (unprocessed) `test` frame.
X_test.shape
test.head(20)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,4129537,450,9983,30:5,1.0,20160712
1,6949378,1300,3429,30:5,,20160706
2,2166529,7113,6928,200:20,5.0,20160727
3,2166529,7113,1808,100:10,5.0,20160727
4,6172162,7605,6500,30:1,2.0,20160708
5,4005121,450,9983,30:5,0.0,20160706
6,4347394,450,9983,30:5,0.0,20160716
7,3094273,760,13602,30:5,1.0,20160727
8,5139970,450,9983,30:5,10.0,20160729
9,3237121,760,13602,30:5,1.0,20160703


## 图表

In [173]:
# g = sns.FacetGrid(train_offline,col="isUsed")
# g.map(plt.hist, 'Distance', alpha=.5, bins=20)
# train_offline[train_offline.Distance == 0].head(15)

## 训练

In [222]:
# Fit a logistic regression with scikit-learn's default settings, then score
# the test features; predict_proba returns one probability column per class.
lr = LogisticRegression().fit(X_train, y_train)
pred = lr.predict_proba(X_test)



In [241]:
# pred[:,1].mean()
# Preview the preprocessed test frame: Discount_rate as a numeric rate,
# Date_received reduced to the month, and the added Quantity column.
testCopy.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Quantity
0,4129537,450,9983,0.833333,1.0,7,0
1,6949378,1300,3429,0.833333,0.0,7,0
2,2166529,7113,6928,0.9,5.0,7,1
3,2166529,7113,1808,0.9,5.0,7,1
4,6172162,7605,6500,0.966667,2.0,7,0


## 提交

In [276]:
# Assemble the submission from the untouched `test` frame, so Date_received
# keeps its original yyyymmdd form, plus the second predict_proba column
# (the class isUsed == 1).
submission = test[["User_id","Coupon_id","Date_received"]].copy()
# Attach the scores by position, rounded to two decimals. Concatenating a
# freshly built Series along axis=1 would align on index labels instead, which
# is only correct while `test` has a default 0..n-1 index; a positional
# assignment raises on a length mismatch rather than silently producing NaN.
# The column label 0 is kept.
submission[0] = pred[:,1].round(2)

In [279]:
# Write the submission as a bare CSV: no index column and no header row.
submission.to_csv("result/submission.csv", index=False, header=False)

In [277]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,User_id,Coupon_id,Date_received,0
0,4129537,9983,20160712,0.06
1,6949378,3429,20160706,0.08
2,2166529,6928,20160727,0.07
3,2166529,1808,20160727,0.07
4,6172162,6500,20160708,0.05
