In [1]:
# import libraries necessary for this project
import os, sys, pickle

import numpy as np
import pandas as pd

from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## 数据集导入

In [3]:
#导入数据集
dfoff = pd.read_csv("C:/Users/Administrator/Desktop/o2odatasets/ccf_offline_stage1_train.csv")
dfon = pd.read_csv('C:/Users/Administrator/Desktop/o2odatasets/ccf_online_stage1_train.csv')
dftest = pd.read_csv('C:/Users/Administrator/Desktop/o2odatasets/ccf_offline_stage1_test_revised.csv')

#print(dfoff.head(5))
#print(dftest.head(5))
#dfoff[dfoff['Date_received'].isnull()].shape[0]

In [34]:
#做个简单统计，看一看究竟用户是否使用优惠券消费的情况。
print('有优惠卷，购买商品：%d' % dfoff[(dfoff['Date_received'] .notnull()) & (dfoff['Date'] .notnull())].shape[0])
print('有优惠卷，未购商品：%d' % dfoff[(dfoff['Date_received'] .notnull()) & (dfoff['Date'] .isnull())].shape[0])
print('无优惠卷，购买商品：%d' % dfoff[(dfoff['Date_received'] .isnull()) & (dfoff['Date'] .notnull())].shape[0])
print('无优惠卷，未购商品：%d' % dfoff[(dfoff['Date_received'] .isnull()) & (dfoff['Date'] .isnull())].shape[0])

有优惠卷，购买商品：75382
有优惠卷，未购商品：977900
无优惠卷，购买商品：701602
无优惠卷，未购商品：0


### 1.打折率（Discount_rate）
首先，第一个想到的特征应该是优惠卷的打折率。因为很显然，一般情况下优惠得越多，用户就越有可能使用优惠券。那么，我们就来看一下训练集中优惠卷有哪些类型。

In [7]:
print('Discount_rate 类型：\n',dfoff['Discount_rate'].unique())

Discount_rate 类型：
 [nan '150:20' '20:1' '200:20' '30:5' '50:10' '10:5' '100:10' '200:30'
 '20:5' '30:10' '50:5' '150:10' '100:30' '200:50' '100:50' '300:30'
 '50:20' '0.9' '10:1' '30:1' '0.95' '100:5' '5:1' '100:20' '0.8' '50:1'
 '200:10' '300:20' '100:1' '150:30' '300:50' '20:10' '0.85' '0.6' '150:50'
 '0.75' '0.5' '200:5' '0.7' '30:20' '300:10' '0.2' '50:30' '200:100'
 '150:5']


根据打印的结果来看，打折率分为 3 种情况：
- ‘null’ 表示没有打折
- （0,1） 表示折扣率
- x:y 表示满 x 减 y
那我们的处理方式可以构建 4 个函数，分别提取 4 种特征，分别是：
打折类型：getDiscountType()    
折扣率：convertRate()
满多少：getDiscountMan()
减多少：getDiscountJian()

In [65]:
dfoff['Discount_rate'].head(5)

0       NaN
1    150:20
2      20:1
3      20:1
4      20:1
Name: Discount_rate, dtype: object

In [4]:
# Convert Discount_rate and Distance
#注意，dataframe中的列Discount_rate包含的元素为对象数据类型（Name: Discount_rate, dtype: object）
#判断“:”是否在其中时需要先全列转换为字符 row=str(row)
def getDiscountType(row):           #打折类型
   row=str(row)
   if row=='nan' :
       return 'nan'
   elif ":" in row:
       return 1
   else:
       return 0
def convertRate(row):              #折扣率
   """Convert discount to rate"""
   row=str(row)
   if row == 'nan':
       return 1.0
   elif ':' in str(row):
       rows = row.split(':')
       return 1.0 - float(rows[1])/float(rows[0])
   else:
       return float(row)
def getDiscountMan(row):           #满多少
   row=str(row)
   if ':' in row:
       rows = row.split(':')
       return int(rows[0])
   else:
       return 0
def getDiscountJian(row):         #减多少
   row=str(row)
   if ':' in row:
       rows = row.split(':')
       return int(rows[1])
   else:
       return 0
def processData(df):              #包装函数
   # convert discount_rate
   df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
   df['discount_rate'] = df['Discount_rate'].apply(convertRate)
   df['discount_man']  = df['Discount_rate'].apply(getDiscountMan)
   df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)

   print(df['discount_rate'].unique())

   return df

In [5]:
#对训练集和测试集分别进行进行 processData（）函数的处理：
dfoff = processData(dfoff)
dftest = processData(dftest)


[1.         0.86666667 0.95       0.9        0.83333333 0.8
 0.5        0.85       0.75       0.66666667 0.93333333 0.7
 0.6        0.96666667 0.98       0.99       0.975      0.33333333
 0.2        0.4       ]
[0.83333333 0.9        0.96666667 0.8        0.95       0.75
 0.98       0.5        0.86666667 0.6        0.66666667 0.7
 0.85       0.33333333 0.94       0.93333333 0.975      0.99      ]


In [87]:
dfoff.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,discount_jian
0,1439408,2632,,,0.0,,20160217.0,,,0,0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,,1.0,0.866667,150,20
2,1439408,2632,8591.0,20:1,0.0,20160217.0,,1.0,0.95,20,1
3,1439408,2632,1078.0,20:1,0.0,20160319.0,,1.0,0.95,20,1
4,1439408,2632,8591.0,20:1,0.0,20160613.0,,1.0,0.95,20,1


### 2.距离（Distance）
距离字段表示用户与商店的地理距离，显然，距离的远近也会影响到优惠券的使用与否。那么，我们就可以把距离也作为一个特征。首先看一下距离有哪些特征值：

In [88]:
print('Distance 类型：',dfoff['Distance'].unique())

Distance 类型： [ 0.  1. nan  2. 10.  4.  7.  9.  3.  5.  6.  8.]


In [6]:
# convert distance
dfoff['distance'] = dfoff['Distance'].fillna(-1).astype(int)
print(dfoff['distance'].unique())
dftest['distance'] = dftest['Distance'].fillna(-1).astype(int)
print(dftest['distance'].unique())

[ 0  1 -1  2 10  4  7  9  3  5  6  8]
[ 1 -1  5  2  0 10  3  6  7  4  9  8]


3.领劵日期（Date_received）
是还有一点很重要的是领券日期，因为一般而言，周末领取优惠券去消费的可能性更大一些。因此，我们可以构建关于领券日期的一些特征：
weekday : {null, 1, 2, 3, 4, 5, 6, 7}
weekday_type : {1, 0}（周六和周日为1，其他为0）
Weekday_1 : {1, 0, 0, 0, 0, 0, 0}
Weekday_2 : {0, 1, 0, 0, 0, 0, 0}
Weekday_3 : {0, 0, 1, 0, 0, 0, 0}
Weekday_4 : {0, 0, 0, 1, 0, 0, 0}
Weekday_5 : {0, 0, 0, 0, 1, 0, 0}
Weekday_6 : {0, 0, 0, 0, 0, 1, 0}
Weekday_7 : {0, 0, 0, 0, 0, 0, 1}

In [7]:
#其中用到了独热编码（One-Hot Encoding），让特征更加丰富。相应的这 9 个特征的提取函数为：
def getWeekday(row):
   row=str(row)
   if row == 'nan':
       return row
   else:               #date(year,month,day) , weekday(),0=周一,1=周二,6=星期日)
       return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1

dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

# weekday_type :  周六和周日为1，其他为0
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6,7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6,7] else 0)

# change weekday to one-hot encoding 
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
#print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

In [96]:
dfoff.head(5)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,distance,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,,,0.0,,20160217.0,,,0,...,0,,0,0,0,0,0,0,0,0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,,1.0,0.866667,150,...,1,6.0,1,0,0,0,0,0,1,0
2,1439408,2632,8591.0,20:1,0.0,20160217.0,,1.0,0.95,20,...,0,3.0,0,0,0,1,0,0,0,0
3,1439408,2632,1078.0,20:1,0.0,20160319.0,,1.0,0.95,20,...,0,6.0,1,0,0,0,0,0,1,0
4,1439408,2632,8591.0,20:1,0.0,20160613.0,,1.0,0.95,20,...,0,1.0,0,1,0,0,0,0,0,0


### 标注标签 Label
有了特征之后，我们还需要对训练样本进行 label 标注，即确定哪些是正样本（y = 1），哪些是负样本（y = 0）。我们要预测的是用户在领取优惠券之后 15 之内的消费情况。所以，总共有三种情况：
- Date_received == ‘null’：
表示没有领到优惠券，无需考虑，y = -1
- (Date_received != ‘null’) & (Date != ‘null’) & (Date – Date_received <= 15)：
表示领取优惠券且在15天内使用，即正样本，y = 1
- (Date_received != ‘null’) & ((Date == ‘null’) | (Date – Date_received > 15))：
表示领取优惠券未在在15天内使用，即负样本，y = 0

In [52]:
def label(row):
   if str(row['Date_received']) == 'nan':
       return -1

   elif  str(row['Date']) != 'nan':
       td = pd.to_datetime(row['Date'], format='%Y%m%d') - pd.to_datetime(row['Date_received'], format='%Y%m%d')
       if td <= pd.Timedelta(15, 'D'):
          return 1 
   return 0
dfoff['label'] = dfoff.apply(label, axis=1)


In [53]:
#使用这个函数对训练集进行标注，看一下正负样本究竟有多少：
print(dfoff['label'].value_counts())
pd.isnull(dfoff['label']).any()
#正样本共有 64395 例，负样本共有 988887 例。显然，正负样本数量差别很大。这也是为什么会使用 AUC 作为模型性能评估标准的原因。

 0    988887
-1    701602
 1     64395
Name: label, dtype: int64


False

### 建立模型

#### 1.划分训练集和验证集
为了验证模型的性能，需要划分验证集进行模型验证，划分方式是按照领券日期，即训练集：20160101-20160515，验证集：20160516-20160615。

In [54]:
# data split
df = dfoff[dfoff['label'] != -1].copy()
train = df[(pd.to_datetime(df['Date_received'], format='%Y%m%d') < pd.to_datetime('20160516', format='%Y%m%d'))].copy()
valid = df[(pd.to_datetime(df['Date_received'], format='%Y%m%d') >=pd.to_datetime('20160516', format='%Y%m%d') ) & (pd.to_datetime(df['Date_received'], format='%Y%m%d') <=pd.to_datetime('20160615', format='%Y%m%d') )].copy()
print('Train Set: \n', train['label'].value_counts())
print('Valid Set: \n', valid['label'].value_counts())

Train Set: 
 0    759172
1     41524
Name: label, dtype: int64
Valid Set: 
 0    229715
1     22871
Name: label, dtype: int64


In [55]:
#选择的特征是上面提取的 14 个特征
# feature
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print('共有特征：',len(original_feature),'个')
print(original_feature)

共有特征： 14 个
['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [56]:
#pd.isnull(train[original_feature]).any()
train[pd.isnull(train['label'])]

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,label


#### 2.构建模型
模型采用的是 SGDClassifier，使用了 Python 中的 Pipeline 管道机制，可以使参数集在新数据集（比如测试集）上的重复使用，管道机制实现了对全部步骤的流式化封装和管理。交叉验证采用 StratifiedKFold，其用法类似 Kfold，但是 StratifiedKFold 是分层采样，确保训练集，测试集中各类别样本的比例与原始数据集中相同。

In [59]:
def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log',  # loss function: logistic regression
        penalty='elasticnet', # L1 & L2
        fit_intercept=True,  # 是否存在截距，默认存在
        max_iter=100, 
        shuffle=True,  # Whether or not the training data should be shuffled after each epoch
        n_jobs=1, # The number of processors to use
        class_weight=None) # Weights associated with classes. If not given, all classes are supposed to have weight one.
 
    # 管道机制使得参数集在新数据集（比如测试集）上的重复使用，管道机制实现了对全部步骤的流式化封装和管理。
    model = Pipeline(steps=[
        ('ss', StandardScaler()), # transformer
        ('en', classifier())  # estimator
    ])
 
    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }
 
    # StratifiedKFold用法类似Kfold，但是他是分层采样，确保训练集，测试集中各类别样本的比例与原始数据集中相同。
    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    # Exhaustive search over specified parameter values for an estimator.
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1,  # -1 means using all processors
        verbose=1)
    grid_search = grid_search.fit(data[predictors], data['label'])
    
    return grid_search

#### 3.训练
对训练集进行训练了，整个训练过程大概 1-2 分钟的时间。

In [60]:
predictors = original_feature
model = check_model(train, predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  27 | elapsed:  1.7min remaining:   12.7s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  1.7min finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


#### 4.验证
对验证集中每个优惠券预测的结果计算 AUC，再对所有优惠券的 AUC 求平均。计算 AUC 的时候，如果 label 只有一类，就直接跳过，因为 AUC 无法计算。

In [61]:
# valid predict
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]
valid1.head(5)

  Xt = transform.transform(Xt)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_type,discount_rate,discount_man,...,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,label,pred_prob
1,1439408,4663,11002.0,150:20,1.0,20160528.0,,1,0.866667,150,...,1,0,0,0,0,0,1,0,0,0.019561
4,1439408,2632,8591.0,20:1,0.0,20160613.0,,1,0.95,20,...,0,1,0,0,0,0,0,0,0,0.100927
6,1439408,2632,8591.0,20:1,0.0,20160516.0,20160613.0,1,0.95,20,...,0,1,0,0,0,0,0,0,0,0.100927
9,2029232,450,1532.0,30:5,0.0,20160530.0,,1,0.833333,30,...,0,1,0,0,0,0,0,0,0,0.096666
10,2029232,6459,12737.0,20:1,0.0,20160519.0,,1,0.95,20,...,0,0,0,0,1,0,0,0,0,0.132716


In [64]:
#注意这里得到的结果 pred_prob 是概率值（预测样本属于正类的概率）。
#最后，就可以对验证集计算 AUC。直接调用 sklearn 库自带的计算 AUC 函数即可。
# avgAUC calculation
vg = valid1.groupby(['Coupon_id'])
aucs = []
for i in vg:
   tmpdf = i[1] 
   if len(tmpdf['label'].unique()) != 2:
       continue
   fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
   aucs.append(auc(fpr, tpr))
print('AUC:' ,np.average(aucs))
#值得注意的是，这里得到的结果是概率值，
#最终的 AUC 是提交到官网之后平台计算的。因为测试集真正的 label 我们肯定是不知道的

AUC: 0.5323444694516165


#### 提交结果
在比赛官网上提交我们的预测结果，即这里的 submit.csv 文件。提交完之后，过几个小时就可以看到成绩了。整个比赛的流程就完成了。

#### 优化模型
其实，本文所述的整个比赛思路和算法是比较简单的，得到的结果和成绩也只能算是合格，名次不会很高。我们还可以运用各种手段优化模型，简单来说分为以下三种：
特征工程
机器学习
模型融合