In [1]:
# 文件根目录
root_path = r'D:\softfiles\workspace\pycharm\O2O-Coupon-Usage-Forecast-master\code\wepon\season one\data'

In [2]:
import pandas as pd
import os, sys, pickle
import numpy as np
import matplotlib.pyplot as plt

from datetime import date
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [3]:
# 读取数据文件
dfoff = pd.read_csv(os.path.join(root_path, 'ccf_offline_stage1_train.csv'))
dftest = pd.read_csv(os.path.join(root_path, 'ccf_offline_stage1_test_revised.csv'))
dfon = pd.read_csv(os.path.join(root_path, 'ccf_online_stage1_train.csv'))

print('data read end.')

data read end.


In [6]:
# 获取折扣类型，满减为1，直接折扣为0，如果为null则设为nan
def getDiscountType(row):
    if pd.isnull(row):
        return np.nan
    elif ':' in row:
        return 1
    else:
        return 0

# 将满减形式改为折扣率，没有折扣则为1.0
def convertRate(row):
    if pd.isnull(row):
        return 1.0
    elif ':' in str(row):
        rows = row.split(':')
        return 1.0 - float(rows[1]) / float(rows[0])
    else:
        return float(row)

# 获取满减形式的满金额
def getDiscontMan(row):
    if ':' in str(row):
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

# 获取满减形式的减金额
def getDiscountJian(row):
    if ':' in str(row):
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

print('tool is ok.')

tool is ok.


In [7]:
# 以pd.apply()函数对折扣数据进行处理
def processData(df):
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].apply(convertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscontMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    
    # 输出所有折扣率
    print(df['discount_rate'].unique())
    
    # 处理distance
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df

In [None]:
# 对线下数据和测试数据进行处理
dfoff = processData(dfoff)
dftest = processData(dftest)

# 线下优惠券获得日期20160101 - 20160615
date_received = dfoff['Date_received'].unique()
# 去除nan，并排序
date_received = sorted(date_received[pd.notnull(date_received)])
 
# 20160101 - 20160630
# date_buy = dfoff['Date'].unique()
# date_buy = sorted(date_buy[pd.notnull(date_buy)])

# 获取核销日期
date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])

# 统计每天发放的优惠券数量
couponbydate = dfoff[dfoff['Date_received'].notnull()][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
couponbydate.columns = ['Date_received','count']

# 统计每天核销的优惠券数量
buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received','count']
 
print("end")

In [11]:
# 将日期转为周格式[1-7]
def getWeekday(row):
    if row == 'nan':
        return np.nan
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1

# 对date_received进行处理
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)
 
# weekday_type :  周六和周日为1，其他为0
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
dftest['weekday_type'] = dftest['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
 
# 把weekday转成one-hot编码
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
tmpdf = pd.get_dummies(dfoff['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

# 把测试数据进行相应编码
tmpdf = pd.get_dummies(dftest['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

# label=-1 没有优惠券
# label=1 :15天内对优惠卷进行核销
# label=0 : 15天内没有核销
def label(row):
    if pd.isnull(row['Date_received']):
        return -1
    if pd.notnull(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0
dfoff['label'] = dfoff.apply(label, axis = 1)
 
print("end")

end


In [12]:
# data split
# 仅训练有优惠券的样本
# 训练数据为Date_received[20160101 ,20160516] 
# val数据为：Date_received[20160516 ,20160615]
print("-----data split------")
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()
print("end")

-----data split------
end


In [13]:
# feature
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print("----train-----")
model = SGDClassifier(#lambda:
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha = 0.01,
    l1_ratio = 0.01,
    n_jobs=1,
    class_weight=None
)
model.fit(train[original_feature], train['label'])

----train-----


SGDClassifier(alpha=0.01, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.01,
       learning_rate='optimal', loss='log', max_iter=100, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [14]:
# #### 预测以及结果评价
print(model.score(valid[original_feature], valid['label']))

0.909452622077


In [15]:
print("---save model---")
with open('1_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('1_model.pkl', 'rb') as f:
    model = pickle.load(f)

---save model---


In [16]:
# test prediction for submission
y_test_pred = model.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('submit1.csv', index=False, header=False)
dftest1.head()

Unnamed: 0,User_id,Coupon_id,Date_received,label
0,4129537,9983,20160712,0.100387
1,6949378,3429,20160706,0.145101
2,2166529,6928,20160727,0.003591
3,2166529,1808,20160727,0.015164
4,6172162,6500,20160708,0.072369
