In [116]:
# load data
import pandas as pd

df_off = pd.read_csv('./ccf_offline_stage1_train.csv')

In [117]:
print(df_off.info(null_counts=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id          1754884 non-null int64
Merchant_id      1754884 non-null int64
Coupon_id        1053282 non-null float64
Discount_rate    1053282 non-null object
Distance         1648881 non-null float64
Date_received    1053282 non-null float64
Date             776984 non-null float64
dtypes: float64(4), int64(2), object(1)
memory usage: 93.7+ MB
None


In [118]:
# del samples when coupon_id is NaN
S = df_off.copy()
S = S.dropna(subset=['Coupon_id'])

print(S.info(null_counts=True))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1053282 entries, 1 to 1754883
Data columns (total 7 columns):
User_id          1053282 non-null int64
Merchant_id      1053282 non-null int64
Coupon_id        1053282 non-null float64
Discount_rate    1053282 non-null object
Distance         947279 non-null float64
Date_received    1053282 non-null float64
Date             75382 non-null float64
dtypes: float64(4), int64(2), object(1)
memory usage: 64.3+ MB
None


In [119]:
# Convert datatype to datetime
S['Date_received'] = pd.to_datetime(S['Date_received'], format='%Y%m%d')
S['Date'] = pd.to_datetime(S['Date'], format='%Y%m%d')

S.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1053282 entries, 1 to 1754883
Data columns (total 7 columns):
User_id          1053282 non-null int64
Merchant_id      1053282 non-null int64
Coupon_id        1053282 non-null float64
Discount_rate    1053282 non-null object
Distance         947279 non-null float64
Date_received    1053282 non-null datetime64[ns]
Date             75382 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(2), int64(2), object(1)
memory usage: 64.3+ MB


In [120]:
S.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
1,1439408,4663,11002.0,150:20,1.0,2016-05-28,NaT
2,1439408,2632,8591.0,20:1,0.0,2016-02-17,NaT
3,1439408,2632,1078.0,20:1,0.0,2016-03-19,NaT
4,1439408,2632,8591.0,20:1,0.0,2016-06-13,NaT
6,1439408,2632,8591.0,20:1,0.0,2016-05-16,2016-06-13


In [121]:
# handle NaT
S['Date_received'] = S['Date_received'].fillna(pd.to_datetime('1970-01-01'))
S['Date'] = S['Date'].fillna(pd.to_datetime('2100-01-01'))
S['Distance'] = S['Distance'].fillna(-1).astype(int)

S.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
1,1439408,4663,11002.0,150:20,1,2016-05-28,2100-01-01
2,1439408,2632,8591.0,20:1,0,2016-02-17,2100-01-01
3,1439408,2632,1078.0,20:1,0,2016-03-19,2100-01-01
4,1439408,2632,8591.0,20:1,0,2016-06-13,2100-01-01
6,1439408,2632,8591.0,20:1,0,2016-05-16,2016-06-13


In [122]:
S.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1053282 entries, 1 to 1754883
Data columns (total 7 columns):
User_id          1053282 non-null int64
Merchant_id      1053282 non-null int64
Coupon_id        1053282 non-null float64
Discount_rate    1053282 non-null object
Distance         1053282 non-null int64
Date_received    1053282 non-null datetime64[ns]
Date             1053282 non-null datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(3), object(1)
memory usage: 64.3+ MB


In [123]:
# Date - Date_received <=15 赋值1 正样本；其他为0 负样本
import numpy as np
S['lable'] = np.where((S['Date'] - S['Date_received']).dt.days <= 15, 1, 0)

In [124]:
S.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,lable
1,1439408,4663,11002.0,150:20,1,2016-05-28,2100-01-01,0
2,1439408,2632,8591.0,20:1,0,2016-02-17,2100-01-01,0
3,1439408,2632,1078.0,20:1,0,2016-03-19,2100-01-01,0
4,1439408,2632,8591.0,20:1,0,2016-06-13,2100-01-01,0
6,1439408,2632,8591.0,20:1,0,2016-05-16,2016-06-13,0


In [125]:
S.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1053282 entries, 1 to 1754883
Data columns (total 8 columns):
User_id          1053282 non-null int64
Merchant_id      1053282 non-null int64
Coupon_id        1053282 non-null float64
Discount_rate    1053282 non-null object
Distance         1053282 non-null int64
Date_received    1053282 non-null datetime64[ns]
Date             1053282 non-null datetime64[ns]
lable            1053282 non-null int64
dtypes: datetime64[ns](2), float64(1), int64(4), object(1)
memory usage: 72.3+ MB


In [154]:
# features = ['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
#        'Date_received', 'Date']
features = ['User_id', 'Merchant_id', 'Coupon_id', 'Distance', 'Discount_rate']
X = S[features]
y = S['lable']

In [155]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [156]:
# 训练模型
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha = 0.01,
    l1_ratio = 0.01,
    n_jobs=1,
    class_weight=None
)

model.fit(X_train, y_train)
#X_train

ValueError: could not convert string to float: '100:10'

In [147]:
# 预测
proba = model.predict_proba(X_test)
proba

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [153]:
float('2018-01-01')

ValueError: could not convert string to float: '2018-01-01'