In [6]:
import os, sys, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [7]:
import seaborn as sns
 
from datetime import date
 
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

In [8]:
import xgboost as xgb
import lightgbm as lgb

In [9]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
# Ask the inline backend for the higher-resolution 'retina' figure format.
%config InlineBackend.figure_format = 'retina'

In [45]:
# Load the three raw CSV tables, in order: offline training data, the revised
# offline test data, and online training data (roles taken from the file names).
dfoff, dftest, dfon = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ccf_offline_stage1_train.csv',
        'data/ccf_offline_stage1_test_revised.csv',
        'data/ccf_online_stage1_train.csv',
    )
)

In [11]:
"""
数据分析
"""

'\noffline 数据分析\n'

In [13]:
# Preview the first five rows of the offline training frame (head() defaults to n=5).
dfoff.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,
2,1439408,2632,8591.0,20:1,0.0,20160217.0,
3,1439408,2632,1078.0,20:1,0.0,20160319.0,
4,1439408,2632,8591.0,20:1,0.0,20160613.0,


In [14]:
# Summarize the offline training frame: row count, per-column dtypes and memory usage.
dfoff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id          int64
Merchant_id      int64
Coupon_id        float64
Discount_rate    object
Distance         float64
Date_received    float64
Date             float64
dtypes: float64(4), int64(2), object(1)
memory usage: 93.7+ MB


In [None]:
###数据中大量的购物券没有使用

In [21]:
# Row counts for the four combinations of "has a coupon" x "made a purchase".
# A non-null Date marks a purchase. The first line detects a coupon through
# Coupon_id, the remaining three through Date_received.
print('有优惠券，购买商品条数', len(dfoff[dfoff['Coupon_id'].notnull() & dfoff['Date'].notnull()]))
print('无优惠券，购买商品条数', len(dfoff[dfoff['Date_received'].isnull() & dfoff['Date'].notnull()]))
print('有优惠券，不购买商品条数', len(dfoff[dfoff['Date_received'].notnull() & dfoff['Date'].isnull()]))
print('无优惠券，不购买商品条数', len(dfoff[dfoff['Date_received'].isnull() & dfoff['Date'].isnull()]))

有优惠券，购买商品条数 75382
无优惠券，购买商品条数 701602
有优惠券，不购买商品条数 977900
无优惠券，不购买商品条数 0


In [27]:
###有两个新用户和一个新商铺出现在测试集上，没有出现在训练集中

In [23]:
# Users that appear in the test set but never in the offline training set
# (set difference: test IDs minus training IDs).
print('1. User_id in test set but not in training set', set(dftest['User_id']) - set(dfoff['User_id']))
# Merchants that appear in the test set but never in the offline training set.
print('2. Merchant_id in test set but not in training set', set(dftest['Merchant_id']) - set(dfoff['Merchant_id']))

1. User_id in training set but not in test set {2495873, 1286474}
2. Merchant_id in training set but not in test set {5920}


In [28]:
### 查看 Discount_rate（优惠券折扣）和 Distance（距离）各有哪些取值，用 pandas 的 unique

In [25]:
# List the distinct raw values (NaN included) of the two columns used for features.
for col_name in ('Discount_rate', 'Distance'):
    print(f'{col_name} 类型:', dfoff[col_name].unique())

Discount_rate 类型: [nan '150:20' '20:1' '200:20' '30:5' '50:10' '10:5' '100:10' '200:30'
 '20:5' '30:10' '50:5' '150:10' '100:30' '200:50' '100:50' '300:30'
 '50:20' '0.9' '10:1' '30:1' '0.95' '100:5' '5:1' '100:20' '0.8' '50:1'
 '200:10' '300:20' '100:1' '150:30' '300:50' '20:10' '0.85' '0.6' '150:50'
 '0.75' '0.5' '200:5' '0.7' '30:20' '300:10' '0.2' '50:30' '200:100'
 '150:5']
Distance 类型: [ 0.  1. nan  2. 10.  4.  7.  9.  3.  5.  6.  8.]


In [29]:
### Discount_rate 中有两种形式的折扣：满减（如 50:5）和折扣率（如 0.9），统一换算成 0.9 这种折扣率形式
### 构造 discount 相关特征

In [66]:
def getDiscountType(row):
    """Flag whether a raw Discount_rate value is written in 'x:y' form.

    Args:
        row: a single Discount_rate cell -- a string such as '150:20' or
            '0.9', or a missing value (NaN/None).

    Returns:
        1 if the value contains ':', otherwise 0 (missing values give 0).
    """
    # Missing values short-circuit before the substring test, which needs a string.
    has_colon = pd.notnull(row) and ':' in row
    return 1 if has_colon else 0
def convertRate(row):
    """Convert a raw Discount_rate value to a single float rate.

    Args:
        row: a single Discount_rate cell -- 'x:y', a decimal string such as
            '0.9', or a missing value (NaN/None).

    Returns:
        1.0 for a missing value; ``1 - y/x`` for the 'x:y' form; otherwise
        the value parsed as a float.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in row:
        # Already a plain rate such as '0.9'.
        return float(row)
    parts = row.split(":")
    reduction = float(parts[1])
    threshold = float(parts[0])
    return 1.0 - reduction / threshold
    
def getDiscountMan(row):
    """Extract the first number of an 'x:y' Discount_rate value.

    Presumably x is the spend threshold of a "spend x, save y" coupon.

    Args:
        row: a single Discount_rate cell (string or missing value).

    Returns:
        ``int(x)`` for the 'x:y' form; 0 for missing values and plain rates.
    """
    if pd.notnull(row) and ':' in row:
        return int(row.split(':')[0])
    return 0

def getDiscountJian(row):
    """Extract the second number of an 'x:y' Discount_rate value.

    Presumably y is the amount taken off in a "spend x, save y" coupon.

    Args:
        row: a single Discount_rate cell (string or missing value).

    Returns:
        ``int(y)`` for the 'x:y' form; 0 for missing values and plain rates.
    """
    if pd.notnull(row) and ':' in row:
        return int(row.split(':')[1])
    return 0
def processData(df):
    """Add the discount and distance feature columns to ``df``.

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame with the raw 'Discount_rate' and 'Distance' columns.

    Returns:
        ``df`` with five added columns: 'discount_rate', 'discount_man',
        'discount_jian', 'discount_type' and 'distance'.
    """
    raw_discount = df['Discount_rate']
    # (new column, per-cell converter) pairs, in the order the columns are added.
    derived_columns = (
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
        ('discount_type', getDiscountType),
    )
    for column, converter in derived_columns:
        df[column] = raw_discount.apply(converter)
    # Missing distances are encoded as -1 so the column can be cast to int.
    df['distance'] = df['Distance'].replace(np.nan, -1).astype(int)
    return df

In [67]:
# processData adds the feature columns to dfoff in place and returns the same
# frame, so re-running this cell simply overwrites those columns.
dfoff = processData(dfoff)

In [68]:
# Check the derived feature columns on the first two rows.
dfoff.head(n=2)

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,discount_rate,discount_man,discount_jian,discount_type,distance
0,1439408,2632,,,0.0,,20160217.0,1.0,0,0,0,0
1,1439408,4663,11002.0,150:20,1.0,20160528.0,,0.866667,150,20,1,1


In [None]:
###处理时间特征

In [74]:
# Sorted list of the distinct coupon-received dates (missing values dropped);
# the first and last entries give the covered date range.
date_received = sorted(dfoff['Date_received'].dropna().unique())
print('优惠券收到日期从', date_received[0], '到', date_received[-1])

优惠券收到日期从 20160101.0 到 20160615.0


In [73]:
# Sorted list of the distinct purchase dates (missing values dropped);
# the first and last entries give the covered date range.
date_buy = sorted(dfoff['Date'].dropna().unique())
print('消费日期从', date_buy[0], '到', date_buy[-1])

消费日期从{}到{} 20160101.0 到 20160630.0


In [None]:
### 查看收到优惠券的数目和 使用优惠券的数目