In [None]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the offline training and test tables from the competition input folder.
train_df = pd.read_csv("../input/ml100marathon-02-01/train_offline.csv")
test_df = pd.read_csv("../input/ml100marathon-02-01/test_offline.csv")

# Only test rows that carry a Coupon_id are kept; the index is renumbered from 0.
test_df = test_df[test_df["Coupon_id"].notna()].reset_index(drop=True)

# Shape and first rows of each frame, separated by blank lines.
print(f'train_df: {train_df.shape}')
print(train_df.head())
print("\n")
print(f'test_df:  {test_df.shape}')
print(test_df.head())

In [None]:
## Create target label
def label(df):
    """Return the training label for one row of the offline data.

    Labelling rule:
      * -1 -- no coupon was received (``Date_received`` is missing); these rows
        are not of interest.
      *  1 -- the coupon was used within 15 days (inclusive) of being received.
      *  0 -- the coupon was received but never used, or used more than 15 days
        after it was received.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row (despite the name) exposing ``'Date_received'`` and
        ``'Date'`` as YYYYMMDD values, or NaN when missing.

    Returns
    -------
    int
        -1, 0 or 1 as described above.
    """
    received, used = df['Date_received'], df['Date']
    if pd.isna(received):
        return -1
    if pd.isna(used):
        return 0
    # Date columns that contain NaN hold floats (e.g. 20160101.0). Going through
    # int() drops the ".0" so the strict '%Y%m%d' format always matches.
    received_on = pd.to_datetime(str(int(received)), format='%Y%m%d')
    used_on = pd.to_datetime(str(int(used)), format='%Y%m%d')
    return 1 if used_on - received_on <= pd.Timedelta(days=15) else 0

# Label every training row, then show how many rows fall into each class.
train_df = train_df.assign(label=train_df.apply(label, axis=1))
train_df['label'].value_counts()

In [None]:
# Generate features - weekday on which the coupon was received
def getWeekday(df):
    """Map a YYYYMMDD date value to a weekday number in 1..7.

    Parameters
    ----------
    df : int or float
        A single date value (despite the name), NaN, or the sentinel -1.

    Returns
    -------
    int or float
        1 (Monday) through 7 (Sunday). NaN and -1 are returned unchanged.
    """
    if pd.isna(df) or df == -1:
        return df
    # int() strips the ".0" of float-encoded dates so the strict format matches.
    parsed = pd.to_datetime(str(int(df)), format="%Y%m%d")
    return parsed.dayofweek + 1  # dayofweek is 0..6 (Monday = 0); shift to 1..7

# Weekday (1 = Monday ... 7 = Sunday) on which each coupon was received.
train_df['weekday'] = train_df['Date_received'].apply(getWeekday)
test_df['weekday'] = test_df['Date_received'].apply(getWeekday)

# weekday_type: 1 for a weekend day (6 or 7), 0 otherwise (including missing).
# The membership test must run on the numeric weekday values: a string such as
# "6.0" is never equal to the integer 6, so casting to str would always give 0.
train_df['weekday_type'] = train_df['weekday'].isin([6, 7]).astype(int)
test_df['weekday_type'] = test_df['weekday'].isin([6, 7]).astype(int)

In [None]:
# Preview both frames; the "\n" item plus the newline separator reproduces the blank gap.
print(train_df.head(), "\n", test_df.head(), sep="\n")

In [None]:
weekday_cols = ['weekday_' + str(i) for i in range(1, 8)]
print(weekday_cols)


def _add_weekday_dummies(frame, columns):
    """Add one 0/1 indicator column per weekday number (1..7) to ``frame`` in place.

    Each indicator is built by comparing ``frame['weekday']`` with its own weekday
    number, so a column name always matches its content even when some weekday
    never occurs in ``frame``. Rows whose weekday is NaN or -1 get 0 everywhere.
    """
    for day_number, column in enumerate(columns, start=1):
        frame[column] = (frame['weekday'] == day_number).astype(int)


_add_weekday_dummies(train_df, weekday_cols)
_add_weekday_dummies(test_df, weekday_cols)

In [None]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    """Classify a discount string.

    Returns the string 'null' when ``row`` is exactly 'null', 1 when ``row``
    contains ':' (an "x:y" style discount), and 0 for anything else.
    """
    if row == 'null':
        return 'null'
    return int(':' in row)

def convertRate(row):
    """Convert a raw discount string into a discount rate.

    Parameters
    ----------
    row : str
        'null' or 'nan' for a missing discount, "x:y" for a threshold/reduction
        pair, or a plain rate such as "0.9".

    Returns
    -------
    float
        1.0 when the discount is missing, ``1 - y/x`` for "x:y", otherwise
        ``float(row)``.
    """
    # A NaN cast with astype('str') arrives as 'nan', so that spelling is treated
    # like the explicit 'null' marker instead of leaking NaN into the feature.
    if row in ('null', 'nan'):
        return 1.0
    if ':' in row:
        threshold, reduction = row.split(':')[:2]
        return 1.0 - float(reduction) / float(threshold)
    return float(row)

def getDiscountMan(row):
    """Return the part before the first ':' of an "x:y" discount as an int, else 0."""
    if ':' not in row:
        return 0
    threshold = row.split(':')[0]
    return int(threshold)

def getDiscountJian(row):
    """Return the second ':'-separated field of an "x:y" discount as an int, else 0."""
    if ':' not in row:
        return 0
    reduction = row.split(':')[1]
    return int(reduction)

def processData(df):
    """Add the discount features to ``df`` and fill missing distances, in place.

    The ``Discount_rate`` column is cast to str and fed to each converter to
    create ``discount_rate``, ``discount_man``, ``discount_jian`` and
    ``discount_type`` (in that order). Missing ``Distance`` values are replaced
    with 99. The same frame object is returned.
    """
    raw_discount = df['Discount_rate'].astype('str')
    derived_features = (
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
        ('discount_type', getDiscountType),
    )
    for column, converter in derived_features:
        df[column] = raw_discount.apply(converter)

    # 99 stands in for a distance that is not known.
    distance_missing = df['Distance'].isna()
    df.loc[distance_missing, "Distance"] = 99
    return df

# Derive the same discount/distance features on both frames.
train_df, test_df = processData(train_df), processData(test_df)

In [None]:
# Preview both frames after feature processing, separated by blank lines.
print(train_df.head(), "\n", test_df.head(), sep="\n")

In [None]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    """Tell whether a date falls before the train/validation cut-off.

    Parameters
    ----------
    row : int, float or str
        A single YYYYMMDD date value (despite the name).
    date_cut : int, float or str, default "20160416"
        Cut-off date as YYYYMMDD; dates on or after it are not training dates.

    Returns
    -------
    bool
        True when ``row`` is strictly earlier than ``date_cut``; False otherwise,
        including when ``row`` is missing.
    """
    if pd.isna(row):
        return False  # a missing date never compares as earlier than the cut-off
    # int() drops the ".0" of float-encoded dates so the strict format matches.
    received_on = pd.to_datetime(str(int(row)), format="%Y%m%d")
    cut_on = pd.to_datetime(str(int(date_cut)), format="%Y%m%d")
    return bool(received_on < cut_on)
    
# Drop the rows labelled -1, then split the rest by received date.
df = train_df.loc[train_df['label'].ne(-1)].copy()
df["is_train"] = df["Date_received"].map(split_train_valid)

train = df[df["is_train"]].reset_index(drop=True)
valid = df[~df["is_train"]].reset_index(drop=True)

print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

In [None]:
# Model input columns: discount features, distance, and the weekday encodings.
original_feature = [
    'discount_rate', 'discount_type', 'discount_man', 'discount_jian',
    'Distance', 'weekday', 'weekday_type',
    *weekday_cols,
]
print(len(original_feature), original_feature)

In [None]:
# Use the full feature list as model inputs (this binds the same list object, not a copy).
predictors = original_feature
print(predictors)

def check_model(data, predictors, loss='log', random_state=42):
    """Grid-search a standardised elastic-net SGD classifier on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain every column in ``predictors`` plus ``'label'``.
    predictors : list of str
        Names of the feature columns.
    loss : str, default 'log'
        Loss passed to ``SGDClassifier``.
        NOTE(review): newer scikit-learn releases spell the logistic loss
        'log_loss' -- confirm against the installed version and pass it here.
    random_state : int or None, default 42
        Seed shared by the classifier's shuffling and the shuffled CV folds, so
        repeated runs pick the same folds and hyper-parameters. Pass None for
        unseeded behaviour.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    sgd = SGDClassifier(
        loss=loss,
        penalty='elasticnet',
        fit_intercept=True,
        max_iter=100,
        shuffle=True,
        n_jobs=1,
        class_weight=None,
        random_state=random_state)

    # Scale features before the linear model; step names feed the grid keys below.
    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', sgd)
    ])

    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1)
    grid_search = grid_search.fit(data[predictors], data['label'])

    return grid_search

In [None]:
# Fit the grid search on the training split.
model = check_model(data=train, predictors=predictors)

In [None]:
# Class probabilities for the validation rows; column 1 is stored as pred_prob
# on a copy of the validation frame.
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.assign(pred_prob=y_valid_pred[:, 1])

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score
# AUC uses the column-1 probabilities; accuracy uses the index of the larger probability.
auc_score = roc_auc_score(valid["label"], y_valid_pred[:, 1])
acc = accuracy_score(valid["label"], np.argmax(y_valid_pred, axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

In [None]:
# Work on a copy of the test frame, keeping only rows that carry a Coupon_id.
target = test_df.copy()
print(target.shape)
target = target[target["Coupon_id"].notna()].reset_index(drop=True)
test = target[predictors].copy()

# Column 1 of the predicted probabilities is stored as pred_prob on a copy.
y_pred = model.predict_proba(test[predictors])
test1 = test.assign(pred_prob=y_pred[:, 1])
print(test1.shape)

In [None]:
# Put the identifying columns side by side with the predicted probability.
id_cols = ["User_id", "Coupon_id", "Date_received"]
output = pd.concat((target[id_cols], test1["pred_prob"]), axis=1)
print(output.shape)

# Render each id as an integer string (this drops any ".0" from float-typed ids).
# Each column is replaced wholesale rather than written through `.loc[:, col]`,
# because writing strings into a numeric column in place is deprecated in
# recent pandas releases.
for col in id_cols:
    output[col] = output[col].astype("int64").astype(str)

# uid = "<User_id>_<Coupon_id>_<Date_received>", built column-wise.
output["uid"] = output["User_id"].str.cat(
    [output["Coupon_id"], output["Date_received"]], sep="_"
)
output = output.reset_index(drop=True)

In [None]:
### NOTE: the submission file must have the column names: uid, label
# Average the predicted probability over rows sharing a uid. Only pred_prob is
# aggregated: the id columns hold strings, which mean() cannot average.
out = (
    output.groupby("uid", as_index=False)["pred_prob"]
    .mean()
    .rename(columns={"pred_prob": "label"})
)
out.head()

In [None]:
# Write the submission file with header uid,label and no index column.
# Note: to_csv returns None when it is given a path, so `submission` is None.
submission = out.to_csv("submission.csv", header=["uid", "label"], index=False)