In [None]:
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [None]:
# Folder holding the offline train/test CSV files.
data_path = 'data/'

df_train = pd.read_csv(data_path + 'train_offline.csv')
df_test = pd.read_csv(data_path + 'test_offline.csv')

# Drop test rows whose Coupon_id is missing and renumber the rest from 0.
df_test = df_test[df_test.Coupon_id.notna()].reset_index(drop=True)

print(df_train.shape, df_test.shape, sep='\n')

df_train.head(20)

In [None]:
df_test.head(20)  # df_test is the test data; it has no Date column (purchase time)

In [None]:
## Create target label

"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    """Return the training target for one record.

    Labelling rule:
      * no coupon received (``Date_received`` missing)            -> -1 (ignored)
      * coupon received and used within 15 days (inclusive)       ->  1
      * coupon received but used after 15 days, or never used     ->  0

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Provides ``Date_received`` and ``Date`` as ``YYYYMMDD`` values
        (int, float or numeric string), or a missing value.

    Returns
    -------
    int
        -1, 0 or 1 as described above.
    """
    def as_timestamp(value):
        # Values may arrive as floats such as 20160101.0 (the column also holds
        # NaN). Going through int hands the parser exactly eight digits, which
        # works regardless of how the installed pandas treats float input.
        return pd.to_datetime(str(int(float(value))), format='%Y%m%d')

    received, used = row['Date_received'], row['Date']
    # pd.isna also recognises None/NaT, unlike np.isnan.
    if pd.isna(received):
        return -1
    if pd.isna(used):
        return 0
    elapsed = as_timestamp(used) - as_timestamp(received)
    return 1 if elapsed <= pd.Timedelta(15, 'D') else 0

# Compute the target for every training row, then show how many rows fall
# into each label value.
df_train["label"] = df_train.apply(label, axis=1)  # add a label column
df_train["label"].value_counts()

In [None]:
# Preview the first 20 training rows.
df_train.head(20)

In [None]:
# Generate features - weekday acquired coupon

def getWeekday(row):
    """Convert a ``YYYYMMDD`` date value to a weekday number.

    Parameters
    ----------
    row : int, float or str
        A single date value such as 20160101 / 20160101.0, a missing value,
        or the placeholder -1.

    Returns
    -------
    int or the input itself
        1 (Monday) through 7 (Sunday). Missing values and -1 are returned
        unchanged.
    """
    # pd.isna also recognises None/NaT, unlike np.isnan.
    if pd.isna(row) or row == -1:
        return row
    # Go through int so a float like 20160101.0 is parsed as the eight digits
    # "20160101" whatever the installed pandas does with float input.
    day = pd.to_datetime(str(int(float(row))), format="%Y%m%d")
    return day.dayofweek + 1  # dayofweek is 0..6 (Monday=0); shift to 1..7

# Add a `weekday` column to both frames, derived from `Date_received`.
df_train['weekday'] = df_train['Date_received'].map(getWeekday)
df_test['weekday'] = df_test['Date_received'].map(getWeekday)

In [None]:
# weekday_type: 1 when `weekday` is 6 or 7 (weekend), otherwise 0
df_train['weekday_type'] = df_train['weekday'].isin([6, 7]).astype('int64')  # training set
df_test['weekday_type'] = df_test['weekday'].isin([6, 7]).astype('int64')  # test set

In [None]:
# Preview the first 20 training rows.
df_train.head(20)

In [None]:
# Preview the first 20 test rows.
df_test.head(20)

In [None]:
weekdaycols = ['weekday_' + str(i) for i in range(1, 8)]
print(weekdaycols)

# One-hot encode `weekday` (1..7) into weekday_1 .. weekday_7.
# Each indicator is built by explicit comparison instead of renaming whatever
# columns pd.get_dummies returns: get_dummies only emits a column for values
# that occur in the data, so a frame lacking some weekday would yield fewer
# than seven columns and the positional rename would fail.
# Rows whose weekday is missing or -1 get 0 in all seven columns.
for frame in (df_train, df_test):
    for day, col in enumerate(weekdaycols, start=1):
        frame[col] = (frame['weekday'] == day).astype('uint8')

In [None]:
# Preview the first 20 training rows.
df_train.head(20)

In [None]:
# Preview the first 20 test rows.
df_test.head(20)

In [None]:
# Generate features - coupon discount and distance

def getDiscountType(row):
    """Classify a raw discount string.

    Returns the string 'null' when `row` is exactly 'null', 1 when `row`
    contains ':' (an "x:y" value such as '200:20'), and 0 for any other
    string.
    """
    if row == 'null':
        return 'null'
    return int(':' in row)

def convertRate(row):  # discount rate
    """Convert a raw discount string to a rate.

    Parameters
    ----------
    row : str
        One of:
          * 'null' or 'nan'  - no discount information,
          * 'x:y'            - "spend x, save y",
          * a plain number   - already a rate, e.g. '0.95'.

    Returns
    -------
    float
        1.0 when there is no discount information, ``1 - y/x`` for an 'x:y'
        value, otherwise the number itself.
    """
    # processData passes values through .astype('str'), which renders a
    # missing value as 'nan'; treat it the same as the literal 'null' so that
    # a missing discount becomes 1.0 instead of NaN.
    if row in ('null', 'nan'):
        return 1.0
    if ':' in row:
        threshold, reduction = row.split(':')[:2]
        return 1.0 - float(reduction) / float(threshold)  # e.g. 1 - (20 / 200) = 0.9
    return float(row)

def getDiscountMan(row):   # spend at least x
    """Return x (as int) from an 'x:y' discount string, or 0 when `row` has no ':'."""
    if ':' not in row:
        return 0
    threshold = row.split(':')[0]
    return int(threshold)

def getDiscountJian(row):  # save y
    """Return y (as int) from an 'x:y' discount string, or 0 when `row` has no ':'."""
    if ':' not in row:
        return 0
    reduction = row.split(':')[1]
    return int(reduction)

def processData(df):
    """Add the discount features to `df` and fill missing distances.

    `df` is modified in place and also returned. Four columns are derived from
    the string form of ``Discount_rate``: ``discount_rate``, ``discount_man``,
    ``discount_jian`` and ``discount_type``. Missing ``Distance`` values are
    replaced with 99.
    """
    # Convert Discount_rate to strings once and reuse it for every feature.
    raw_rate = df['Discount_rate'].astype('str')
    derived_columns = (
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
        ('discount_type', getDiscountType),
    )
    for column, parser in derived_columns:
        df[column] = raw_rate.apply(parser)

    # Fill missing distances with 99.
    missing_distance = df['Distance'].isna()
    df.loc[missing_distance, "Distance"] = 99
    return df

# Add the discount features and fill missing distances in both data sets.
df_train = processData(df_train)
df_test = processData(df_test)

In [None]:
# Preview the first 20 training rows.
df_train.head(20)

In [None]:
# Preview the first 20 test rows.
df_test.head(20)

In [None]:
## Naive model

def split_train_valid(row, date_cut="20160405"):
    """Tell whether a date falls strictly before the train/validation cut-off.

    Parameters
    ----------
    row : int, float or str
        A single ``YYYYMMDD`` date value such as 20160404 or 20160404.0.
    date_cut : str, default "20160405"
        Cut-off date in ``YYYYMMDD`` form. Dates before it belong to the
        training part; the cut-off itself and later dates do not.

    Returns
    -------
    bool
        True when `row` is earlier than `date_cut`; False otherwise, including
        when `row` is missing.
    """
    if pd.isna(row):
        return False
    # Go through int so a float like 20160404.0 is parsed as the eight digits
    # "20160404" whatever the installed pandas does with float input.
    received = pd.to_datetime(str(int(float(row))), format="%Y%m%d")
    cutoff = pd.to_datetime(date_cut, format="%Y%m%d")
    return bool(received < cutoff)
    
# Rows labelled -1 are excluded; work on an independent copy of the rest.
df = df_train.loc[df_train['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)

# Split on the `is_train` flag and renumber each part from 0.
train = df.loc[df["is_train"]].reset_index(drop=True)
valid = df.loc[~df["is_train"]].reset_index(drop=True)

print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

In [None]:
# Preview the first 20 rows of the training part.
train.head(20)

In [None]:
# Preview the first 20 rows of the validation part.
valid.head(20)

In [None]:
# Model inputs: the hand-built columns followed by the weekday indicator columns.
original_feature = [
    'discount_rate',
    'discount_type',
    'discount_man',
    'discount_jian',
    'Distance',
    'weekday',
    'weekday_type',
]
original_feature.extend(weekdaycols)

print(len(original_feature), original_feature)

In [None]:
# `predictors` refers to the same list object as `original_feature`.
predictors = original_feature
print(predictors)

def check_model(data, predictors):
    """Grid-search an elastic-net logistic regression trained with SGD.

    The search covers the regularisation strength (``alpha``) and the L1/L2
    mix (``l1_ratio``) using 4-fold stratified cross-validation; because
    ``refit=True`` the best configuration is refit on all of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named in `predictors` plus a ``label``
        column holding the target.
    predictors : list of str
        Names of the feature columns.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    import re
    import sklearn

    # The logistic loss was renamed from 'log' to 'log_loss' in scikit-learn
    # 1.1 and the old spelling was later removed; use whichever name the
    # installed version accepts. An unparseable version is treated as modern.
    version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    is_modern = version is None or (int(version.group(1)), int(version.group(2))) >= (1, 1)
    logistic_loss = 'log_loss' if is_modern else 'log'

    classifier = SGDClassifier(
        average=False,
        epsilon=0.1,
        eta0=0.0,
        learning_rate='optimal',
        loss=logistic_loss,
        penalty='elasticnet',
        fit_intercept=True,
        max_iter=1000,
        power_t=0.5,
        random_state=None,
        shuffle=True,
        n_jobs=1,
        class_weight=None,
        verbose=0,
        warm_start=False)

    # Setting with_mean=False or with_std=False in the StandardScaler
    # constructor disables centering or scaling, respectively.
    # NOTE(review): with both disabled this step appears to pass the features
    # through unchanged - confirm that leaving them unscaled is intended.
    model = Pipeline(steps=[
        ('ss', StandardScaler(copy=True, with_mean=False, with_std=False)),
        ('en', classifier)
    ])

    # The 'en__' prefix routes each parameter to the pipeline step named 'en'.
    parameters = {
        'en__alpha': [0.0001, 0.001, 0.01, 0.1],
        'en__l1_ratio': [0.0001, 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)

    # The `iid` argument is no longer passed: it was removed from GridSearchCV
    # in scikit-learn 0.24 and supplying it raises a TypeError there.
    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1,
        scoring=None,
        refit=True,
        pre_dispatch='2*n_jobs',
        error_score='raise')

    grid_search = grid_search.fit(data[predictors],
                                  data['label'])

    return grid_search

In [None]:
# Run the grid search on the training part; `model` is the object returned by check_model.
model = check_model(train, predictors)

In [None]:
# Predict on the validation part with the fitted search; y_valid_pred holds
# the predicted probabilities.
y_valid_pred = model.predict_proba(valid[predictors])
print(y_valid_pred)

# Copy of the validation rows with the index-1 probability column attached.
valid1 = valid.assign(pred_prob=y_valid_pred[:, 1])
valid1.head(10)

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score

# AUC is computed from the index-1 probability column; accuracy uses the index
# of the larger probability in each row as the predicted class.
auc_score = roc_auc_score(valid["label"], y_valid_pred[:, 1])
acc = accuracy_score(valid["label"], np.argmax(y_valid_pred, axis=1))

print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

In [None]:
targetset = df_test.copy()

print(targetset.shape)

# Keep only rows that have a Coupon_id and renumber them from 0.
targetset = targetset[targetset.Coupon_id.notna()].reset_index(drop=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])

# Copy of the feature rows with the index-1 probability column attached.
test1 = testset.assign(pred_prob=y_test_pred[:, 1])

print(test1.shape)

In [None]:
# Place the identifying columns next to the predicted probability; both
# frames were renumbered from 0, so the column-wise concat lines up row by row.
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)

print(output.shape)

# Render the three id columns as integer strings (no trailing ".0").
# Plain column assignment replaces each column outright. Writing strings
# through `.loc[:, col]` instead sets values into the existing numeric column,
# which newer pandas treats as an incompatible-dtype assignment.
output["User_id"] = output["User_id"].apply(lambda x: str(int(x)))
output["Coupon_id"] = output["Coupon_id"].apply(lambda x: str(int(x)))
output["Date_received"] = output["Date_received"].apply(lambda x: str(int(x)))

# uid = "<User_id>_<Coupon_id>_<Date_received>", built column-wise rather than
# with a row-by-row apply.
output["uid"] = output["User_id"] + "_" + output["Coupon_id"] + "_" + output["Date_received"]
output.reset_index(drop=True, inplace=True)

In [None]:
### NOTE: YOUR SUBMISSION FILE SHOULD HAVE COLUMN NAME: uid, label

# Average the predicted probability over rows sharing a uid.
# `pred_prob` is selected before calling mean() because the other columns hold
# strings: pandas >= 2.0 no longer drops non-numeric columns silently in
# groupby reductions and raises a TypeError instead.
out = output.groupby("uid", as_index=False)["pred_prob"].mean()
out.columns = ["uid", "label"]
out.to_csv("baseline.csv", header=["uid", "label"], index=False) # submission format
out.head()