In [129]:
import pandas as pd
import numpy as np

# load data
dfoff = pd.read_csv('./ccf_offline_stage1_train.csv', parse_dates=[5, 6])
dfTest = pd.read_csv('./ccf_offline_stage1_test_revised.csv')

In [130]:
# 删除 Coupon_id 为空的样本
dfoff = dfoff.dropna(subset=['Coupon_id'])

In [131]:
# define function

def getDiscountType(row):
    if pd.isnull(row):
        return np.nan
    elif ':' in row:
        return 1
    else:
        return 0

def covertRate(row):
    if pd.isnull(row):
        return 1.0
    elif ':' in str(row):
        rows = row.split(':')
        return 1.0 - float(rows[1]) / float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in str(row):
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0
    
def getDiscountJian(row):
    if ':' in str(row):
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

In [132]:
def processData(df):
    # coverate discount_rate
    df['discount_rate'] = df['Discount_rate'].apply(covertRate)
    df['discount_man'] = df['Discount_rate'].apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].apply(getDiscountType)
    df['distance'] = df['Distance'].fillna(-1).astype(int)

In [133]:
processData(dfoff)
processData(dfTest)

# 计算 lable
# Date - Date_received <=15 赋值1 正样本；其他为0 负样本
dfoff['lable'] = np.where((dfoff['Date'] - dfoff['Date_received']).dt.days <= 15, 1, 0)

In [134]:
# 处理 空值
#dfoff['Coupon_id'] = dfoff['Coupon_id'].fillna(0).astype(int)
dfoff['discount_type'] = dfoff['discount_type'].fillna(-1).astype(int)

In [135]:
dfoff.info(null_counts=True)
dfTest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1053282 entries, 1 to 1754883
Data columns (total 13 columns):
User_id          1053282 non-null int64
Merchant_id      1053282 non-null int64
Coupon_id        1053282 non-null float64
Discount_rate    1053282 non-null object
Distance         947279 non-null float64
Date_received    1053282 non-null datetime64[ns]
Date             75382 non-null datetime64[ns]
discount_rate    1053282 non-null float64
discount_man     1053282 non-null int64
discount_jian    1053282 non-null int64
discount_type    1053282 non-null int64
distance         1053282 non-null int64
lable            1053282 non-null int64
dtypes: datetime64[ns](2), float64(3), int64(7), object(1)
memory usage: 112.5+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113640 entries, 0 to 113639
Data columns (total 11 columns):
User_id          113640 non-null int64
Merchant_id      113640 non-null int64
Coupon_id        113640 non-null int64
Discount_rate    113640 non-null o

In [166]:
# 创建样本特征和标签
features = ['User_id', 'Merchant_id', 'Coupon_id', 'discount_rate', 
        'discount_man', 'discount_jian', 'discount_type', 'distance']
X = dfoff[features]
y = dfoff['lable']

In [167]:
# 将样本数据拆分
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [168]:
# 训练模型
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(
    loss='log',
#     penalty='elasticnet',
#     fit_intercept=True,
#     max_iter=100,
#     shuffle=True,
#     alpha = 0.01,
#     l1_ratio = 0.01,
#     n_jobs=1,
#     class_weight=None
)

model.fit(X_train, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [169]:
score = model.score(X_test, y_test)
print(score)

0.8244765894098838


In [170]:
# 预测
proba = model.predict_proba(dfTest[features])
proba

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]])