In [4]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.linear_model import SGDClassifier, LogisticRegression
from IPython.display import display
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

DATA_ROOT = "data/"
#讀取數據
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
# Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Coupon_id']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0
#從字串轉成浮點數
def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)
#轉成星期
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6-> 1~7
#增加特徵期望值
#1.距離越遠 期望值越低 
#2.折扣越高 期望值越低
def getExpecteValue(row):
    result = 0.0
    temp = 0.0
    if row['Distance'] == 0:
       temp = 0.1
    else:
       temp = row['Distance']
    result = (1/row['Discount_rate'])*(1/temp)
    return result
#產生lable
dfoff["label"] = dfoff.apply(label, axis=1)
#--------------------------------------------------
#label = -1 刪掉數據
train_x = dfoff[dfoff["label"] != -1].drop(["Date","label"], axis = 1).copy()
train_y = dfoff["label"][dfoff["label"] != -1].copy()
#---------------------------
#合併訓練集及測試集
df = pd.concat([train_x, dftest],sort=True)
#----------------------------------------
#資料預處理 補缺失值及特徵轉換
df["Discount_rate"] = df['Discount_rate'].astype('str').apply(convertRate)
df["Distance"][df["Distance"].isnull()] = df["Distance"][~df["Distance"].isnull()].median()
df["Discount_rate"][df["Discount_rate"]< 0.7] = df["Discount_rate"][~df["Discount_rate"].isnull()].median()
df["Discount_rate_square"] = df["Discount_rate"].apply(lambda x:x*x)
df["Discount_rate_third"] = df["Discount_rate_square"].apply(lambda x:x*x)
df["Distance_square"] = df["Distance"].apply(lambda x:x*x)
df["Discount_third"] = df["Distance_square"].apply(lambda x:x*x)
df['weekday'] = df['Date_received'].apply(getWeekday)
df['Expect_Value'] = df.apply(getExpecteValue,axis = 1)
#-------------------------------------------------------
#以下為離散化 及 ONE_HOT_ENCODING 
df["C_W_D"] = pd.cut(df["Distance"], [0,1,2,4,9,10])
df = pd.get_dummies(df, columns = ["C_W_D"], prefix="C_W_D")
df["C_W_DR"] = pd.cut(df["Discount_rate"], [0.3,0.72,0.82,0.85,0.87,0.9,1.0])
df = pd.get_dummies(df, columns = ["C_W_DR"], prefix="C_W_DR")
df['received_month'] = df['Date_received'].apply(lambda x: pd.to_datetime(x, format = "%Y%m%d").month)
df = pd.get_dummies(df, columns = ["received_month"], prefix="received_month")
df['received_day'] = df['Date_received'].apply(lambda x: pd.to_datetime(x, format = "%Y%m%d").day)
df["C_R_D"] = pd.cut(df["received_day"], [0,5,10,15,20,25,30])
df = pd.get_dummies(df, columns = ["C_R_D"], prefix="C_R_D")
#---------------------------
#特徵交叉
Expect_Value_mean = df.groupby(['User_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean.columns = ['User_id', 'Expect_Value_mean']
Discount_rate_Usermean = df.groupby(['User_id'])['Discount_rate'].mean().reset_index()
Discount_rate_Usermean.columns = ['User_id','Discount_rate_Usermean']
Discount_rate_Squaremean = df.groupby(['User_id'])['Discount_rate_square'].mean().reset_index()
Discount_rate_Squaremean.columns = ['User_id','Discount_rate_Squaremean']
Discount_Squaremean = df.groupby(['User_id'])['Distance_square'].mean().reset_index()
Discount_Squaremean.columns = ['User_id','Discount_Squaremean']
Discount_rate_thirdmean = df.groupby(['User_id'])['Discount_rate_third'].mean().reset_index()
Discount_rate_thirdmean.columns = ['User_id','Discount_rate_thirdmean']
Discount_thirdmean = df.groupby(['User_id'])['Discount_third'].mean().reset_index()
Discount_thirdmean.columns = ['User_id','Discount_thirdmean']
temp = pd.merge(Discount_rate_Usermean, Expect_Value_mean, how='left', on=['User_id'])
temp = pd.merge(temp, Discount_rate_Squaremean, how='left', on=['User_id'])
temp = pd.merge(temp, Discount_Squaremean, how='left', on=['User_id'])
temp = pd.merge(temp, Discount_rate_thirdmean, how='left', on=['User_id'])
temp = pd.merge(temp, Discount_thirdmean, how='left', on=['User_id'])
df = pd.merge(df,temp, how='left', on=['User_id'])
Expect_Value_sum = df.groupby(['Date_received','Coupon_id'])['Expect_Value'].sum().reset_index()
Expect_Value_sum.columns = ['Date_received','Coupon_id','Expect_Value_sum']
Expect_Value_mean = df.groupby(['Date_received','Coupon_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean.columns = ['Date_received','Coupon_id','Expect_Value_sum_user']
Expect_Value_mean_user = df.groupby(['User_id','Coupon_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean_user.columns = ['User_id','Coupon_id','Expect_Value_mean_user']
Expect_Value_sum_user = df.groupby(['User_id','Coupon_id'])['Expect_Value'].sum().reset_index()
Expect_Value_sum_user.columns = ['User_id','Coupon_id','Expect_Value_sum']
temp1 = pd.merge(Expect_Value_mean,Expect_Value_sum, how='left', on=['Date_received','Coupon_id'])
temp2 = pd.merge(Expect_Value_mean_user,Expect_Value_sum_user, how='left', on=['User_id','Coupon_id'])
df = pd.merge(df,temp1, how='left', on=['Date_received','Coupon_id'])
df = pd.merge(df,temp2, how='left', on=['User_id','Coupon_id'])
Discount_rate_MerchantSum = df.groupby(['Merchant_id'])['Discount_rate'].sum().reset_index()
Discount_rate_MerchantSum.columns = ['Merchant_id', 'Discount_rate_MerchantSum']
df =pd.merge(df, Discount_rate_MerchantSum, how='left', on=['Merchant_id'])
Discount_rate_MerchantMean = df.groupby(['Merchant_id'])['Discount_rate'].mean().reset_index()
Discount_rate_MerchantMean.columns = ['Merchant_id', 'Discount_rate_MerchantMean']
df =pd.merge(df, Discount_rate_MerchantMean, how='left', on=['Merchant_id'])
#-----------------------------------------------------------------------------
# df['Merchant_id']= LabelEncoder().fit_transform(df['Merchant_id'])
# df['User_id']= LabelEncoder().fit_transform(df['User_id'])
# df['Coupon_id']= LabelEncoder().fit_transform(df['Coupon_id'])
#-----------------------------------------
# print(df.shape)
# 將資料最大最小化
df = MinMaxScaler().fit_transform(df)
train_num = train_y.shape[0]
real_train_x = df[:train_num]
real_test_x = df[train_num:]
#以下為用隨機森林做特徵選取
# 建立模型 (使用 20 顆樹，每棵樹的最大深度為 4)
# clf = RandomForestClassifier(n_estimators=20,max_depth = 4)

# 訓練模型
# clf.fit(real_train_x, train_y)
# threshold = 0.1
# importances = clf.feature_importances_
# print(importances)
# df_select = df[:, importances > threshold]
# print(df_select.shape[1])
# real_train_x= df_select[:train_num]
# real_test_x = df_select[train_num:]

#以下為模型擬合資料+輸出預測結果
lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, 
                            max_features='sqrt', max_depth=6, bootstrap=True)
xgb = XGBClassifier(n_estimators = 100,min_child_weight = 5, max_depth = 5,subsample = 0.7,learning_rate = 0.2)
meta_estimator = XGBClassifier(n_estimators = 50,min_child_weight = 3, max_depth = 3,subsample = 0.8,learning_rate = 0.3)
sclf = StackingClassifier(classifiers=[lr, xgb, rf],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=meta_estimator)
sclf.fit(real_train_x, train_y)
y_test_pred = sclf.predict_proba(real_test_x)
# modelfit = xgb.fit(real_train_x,train_y)
# accuracy = np.mean(cross_val_score(modelfit, real_train_x, train_y, cv=3, scoring='accuracy'))
# print("XGBClassifier accuracy: ", accuracy)
# y_test_pred = modelfit.predict_proba(real_test_x)
test1 = dftest.loc[:,['Discount_rate','Expect_Value']].copy()
test1['pred_prob'] = y_test_pred[:, 1]
output = pd.concat((dftest[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("StackingClassifier_Modify_2.csv", header=["uid", "label"], index=False) # submission format
#-----------------------------------------------------------------------------------
#以下為資料視覺化+數據清洗
# df = pd.merge(df, Expect_Value_mean, how='left', on=['User_id'])
# df.head(10)
# print(df['User_id'].nunique())
# print(df['Merchant_id'].nunique())
# print(df['Coupon_id'].nunique())
# df['Expect_Value_log'] = np.log1p(df["Expect_Value"])
# df['Distance_log'] = np.log1p(df['Distance'])
# df['received_month'].hist()
# plt.show()
# df['received_day'].hist()
# plt.show()
# plt.boxplot(df['Expect_Value_log'],
#             notch=True,  # notch shape
#             sym='bs',     # blue squares for outliers
#             vert=True,   # vertical box aligmnent
#             patch_artist=True)   # fill with color 
# plt.title('Expect_Value_log') 
# plt.show()


#遇到均值編碼時 連id也一起算平均 所以merge不起來

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [None]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.linear_model import SGDClassifier, LogisticRegression
from IPython.display import display
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

DATA_ROOT = "data/"
#讀取數據
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
# Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
#產生label
def label(row):
    if np.isnan(row['Coupon_id']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0
#字串轉成比率
def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6-> 1~7
#增加特徵期望值   
#1.距離越遠 期望值越低 
#2.折扣越高 期望值越低
def getExpecteValue(row):
    result = 0.0
    temp = 0.0
    if row['Distance'] == 0:
       temp = 0.1
    else:
       temp = row['Distance']
    result = (1/row['Discount_rate'])*(1/temp)
    return result  
#產生label
dfoff["label"] = dfoff.apply(label, axis=1)
#label = -1 刪掉數據
train_x = dfoff[dfoff["label"] != -1].drop(["Date","label"], axis = 1).copy()
train_y = dfoff["label"][dfoff["label"] != -1].copy()
#------------------------------------------
#資料預處理 補缺失值及特徵轉換
df = pd.concat([train_x, dftest],sort=True)
df["Discount_rate"] = df['Discount_rate'].astype('str').apply(convertRate)
df["Distance"][df["Distance"].isnull()] = df["Distance"][~df["Distance"].isnull()].median()
df["Discount_rate"][df["Discount_rate"]< 0.7] = df["Discount_rate"][~df["Discount_rate"].isnull()].median()
df['weekday'] = df['Date_received'].apply(getWeekday)
df['Expect_Value'] = df.apply(getExpecteValue,axis = 1)
#-----------------------------------------------------------
#以下為離散化 AND ONE_HOT_ENCODING 
df["C_W_D"] = pd.cut(df["Distance"], [0,1,2,4,9,10])
df = pd.get_dummies(df, columns = ["C_W_D"], prefix="C_W_D")
df["C_W_DR"] = pd.cut(df["Discount_rate"], [0.3,0.72,0.82,0.85,0.87,0.9,1.0])
df = pd.get_dummies(df, columns = ["C_W_DR"], prefix="C_W_DR")
df['received_month'] = df['Date_received'].apply(lambda x: pd.to_datetime(x, format = "%Y%m%d").month)
df = pd.get_dummies(df, columns = ["received_month"], prefix="received_month")
df['received_day'] = df['Date_received'].apply(lambda x: pd.to_datetime(x, format = "%Y%m%d").day)
df["C_R_D"] = pd.cut(df["received_day"], [0,5,10,15,20,25,30])
df = pd.get_dummies(df, columns = ["C_R_D"], prefix="C_R_D")
#---------------------------
#特徵交叉
Expect_Value_mean = df.groupby(['User_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean.columns = ['User_id', 'Expect_Value_mean']
Discount_rate_Usermean = df.groupby(['User_id'])['Discount_rate'].mean().reset_index()
Discount_rate_Usermean.columns = ['User_id','Discount_rate_Usermean']
temp = pd.merge(Discount_rate_Usermean, Expect_Value_mean, how='left', on=['User_id'])
df = pd.merge(df,temp, how='left', on=['User_id'])
Expect_Value_sum = df.groupby(['Date_received','Coupon_id'])['Expect_Value'].sum().reset_index()
Expect_Value_sum.columns = ['Date_received','Coupon_id','Expect_Value_sum']
Expect_Value_mean = df.groupby(['Date_received','Coupon_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean.columns = ['Date_received','Coupon_id','Expect_Value_sum_user']
Expect_Value_mean_user = df.groupby(['User_id','Coupon_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean_user.columns = ['User_id','Coupon_id','Expect_Value_mean_user']
Expect_Value_sum_user = df.groupby(['User_id','Coupon_id'])['Expect_Value'].sum().reset_index()
Expect_Value_sum_user.columns = ['User_id','Coupon_id','Expect_Value_sum']
temp1 = pd.merge(Expect_Value_mean,Expect_Value_sum, how='left', on=['Date_received','Coupon_id'])
temp2 = pd.merge(Expect_Value_mean_user,Expect_Value_sum_user, how='left', on=['User_id','Coupon_id'])
df = pd.merge(df,temp1, how='left', on=['Date_received','Coupon_id'])
df = pd.merge(df,temp2, how='left', on=['User_id','Coupon_id'])
Discount_rate_MerchantSum = df.groupby(['Merchant_id'])['Discount_rate'].sum().reset_index()
Discount_rate_MerchantSum.columns = ['Merchant_id', 'Discount_rate_MerchantSum']
df =pd.merge(df, Discount_rate_MerchantSum, how='left', on=['Merchant_id'])
Discount_rate_MerchantMean = df.groupby(['Merchant_id'])['Discount_rate'].mean().reset_index()
Discount_rate_MerchantMean.columns = ['Merchant_id', 'Discount_rate_MerchantMean']
df =pd.merge(df, Discount_rate_MerchantMean, how='left', on=['Merchant_id'])
df['Merchant_id']= LabelEncoder().fit_transform(df['Merchant_id'])
df['User_id']= LabelEncoder().fit_transform(df['User_id'])
df['Coupon_id']= LabelEncoder().fit_transform(df['Coupon_id'])
#-----------------------------------------
# print(df.shape)
# 將資料最大最小化
df = MinMaxScaler().fit_transform(df)
train_num = train_y.shape[0]
real_train_x = df[:train_num]
real_test_x = df[train_num:]
#以下為用隨機森林做特徵選取
# 建立模型 (使用 20 顆樹，每棵樹的最大深度為 4)
# clf = RandomForestClassifier(n_estimators=20,max_depth = 4)

# 訓練模型
# clf.fit(real_train_x, train_y)
# threshold = 0.1
# importances = clf.feature_importances_
# print(importances)
# df_select = df[:, importances > threshold]
# print(df_select.shape[1])
# real_train_x= df_select[:train_num]
# real_test_x = df_select[train_num:]

# #以下為模型擬合資料+輸出預測結果
lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, 
                            max_features='sqrt', max_depth=6, bootstrap=True)
xgb = XGBClassifier(n_estimators = 100,min_child_weight = 5, max_depth = 5,subsample = 0.7,learning_rate = 0.2)
meta_estimator = XGBClassifier(n_estimators = 50,min_child_weight = 3, max_depth = 3,subsample = 0.8,learning_rate = 0.3)
sclf = StackingClassifier(classifiers=[lr, xgb, rf],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=meta_estimator)
sclf.fit(real_train_x, train_y)
y_test_pred = sclf.predict_proba(real_test_x)
# modelfit = xgb.fit(real_train_x,train_y)
# accuracy = np.mean(cross_val_score(modelfit, real_train_x, train_y, cv=3, scoring='accuracy'))
# print("XGBClassifier accuracy: ", accuracy)
# y_test_pred = modelfit.predict_proba(real_test_x)
test1 = dftest.loc[:,['Discount_rate','Expect_Value']].copy()
test1['pred_prob'] = y_test_pred[:, 1]
output = pd.concat((dftest[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("StackingClassifier.csv", header=["uid", "label"], index=False) # submission format
#-----------------------------------------------------------------------------------
#以下為資料視覺化+數據清洗
# df = pd.merge(df, Expect_Value_mean, how='left', on=['User_id'])
# df.head(10)
# print(df['User_id'].nunique())
# print(df['Merchant_id'].nunique())
# print(df['Coupon_id'].nunique())
# df['Expect_Value_log'] = np.log1p(df["Expect_Value"])
# df['Distance_log'] = np.log1p(df['Distance'])
# df['received_month'].hist()
# plt.show()
# df['received_day'].hist()
# plt.show()
# plt.boxplot(df['Expect_Value_log'],
#             notch=True,  # notch shape
#             sym='bs',     # blue squares for outliers
#             vert=True,   # vertical box aligmnent
#             patch_artist=True)   # fill with color 
# plt.title('Expect_Value_log') 
# plt.show()


#遇到均值編碼時 連id也一起算平均 所以merge不起來

In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
from sklearn.linear_model import SGDClassifier, LogisticRegression
from IPython.display import display
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from mlxtend.classifier import StackingClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

DATA_ROOT = "data/"
#讀取數據
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
# Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Coupon_id']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0
#從字串轉成浮點數
def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6-> 1~7
#增加特徵期望值   
#1.距離越遠 期望值越低 
#2.折扣越高 期望值越低
def getExpecteValue(row):
    result = 0.0
    temp = 0.0
    if row['Distance'] == 0:
       temp = 0.1
    else:
       temp = row['Distance']
    result = (1/row['Discount_rate'])*(1/temp)
    return result  
#產生lable
dfoff["label"] = dfoff.apply(label, axis=1)
#-------------------------------------------
#label = -1 刪掉數據
train_x = dfoff[dfoff["label"] != -1].drop(["Date","label"], axis = 1).copy()
train_y = dfoff["label"][dfoff["label"] != -1].copy()
#-------------------------------------------------
#合併測試集及訓練集
df = pd.concat([train_x, dftest],sort=True)
#--------------------------------------------
#資料預處理 補缺失值及特徵轉換
df["Discount_rate"] = df['Discount_rate'].astype('str').apply(convertRate)
df["Distance"][df["Distance"].isnull()] = df["Distance"][~df["Distance"].isnull()].median()
df["Discount_rate"][df["Discount_rate"]< 0.7] = df["Discount_rate"][~df["Discount_rate"].isnull()].median()
df['weekday'] = df['Date_received'].apply(getWeekday)
df['Expect_Value'] = df.apply(getExpecteValue,axis = 1)
#------------------------------------------------------
#以下為離散化 AND ONE_HOT_ENCODING 
df["C_W_D"] = pd.cut(df["Distance"], [0,1,2,4,9,10])
df = pd.get_dummies(df, columns = ["C_W_D"], prefix="C_W_D")
df["C_W_DR"] = pd.cut(df["Discount_rate"], [0.3,0.72,0.82,0.85,0.87,0.9,1.0])
df = pd.get_dummies(df, columns = ["C_W_DR"], prefix="C_W_DR")
df['received_month'] = df['Date_received'].apply(lambda x: pd.to_datetime(x, format = "%Y%m%d").month)
df = pd.get_dummies(df, columns = ["received_month"], prefix="received_month")
df['received_day'] = df['Date_received'].apply(lambda x: pd.to_datetime(x, format = "%Y%m%d").day)
df["C_R_D"] = pd.cut(df["received_day"], [0,5,10,15,20,25,30])
df = pd.get_dummies(df, columns = ["C_R_D"], prefix="C_R_D")
#---------------------------
#特徵交叉
Expect_Value_mean = df.groupby(['User_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean.columns = ['User_id', 'Expect_Value_mean']
Discount_rate_Usermean = df.groupby(['User_id'])['Discount_rate'].mean().reset_index()
Discount_rate_Usermean.columns = ['User_id','Discount_rate_Usermean']
temp = pd.merge(Discount_rate_Usermean, Expect_Value_mean, how='left', on=['User_id'])
df = pd.merge(df,temp, how='left', on=['User_id'])
Expect_Value_sum = df.groupby(['Date_received','Coupon_id'])['Expect_Value'].sum().reset_index()
Expect_Value_sum.columns = ['Date_received','Coupon_id','Expect_Value_sum']
Expect_Value_mean = df.groupby(['Date_received','Coupon_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean.columns = ['Date_received','Coupon_id','Expect_Value_sum_user']
Expect_Value_mean_user = df.groupby(['User_id','Coupon_id'])['Expect_Value'].mean().reset_index()
Expect_Value_mean_user.columns = ['User_id','Coupon_id','Expect_Value_mean_user']
Expect_Value_sum_user = df.groupby(['User_id','Coupon_id'])['Expect_Value'].sum().reset_index()
Expect_Value_sum_user.columns = ['User_id','Coupon_id','Expect_Value_sum']
temp1 = pd.merge(Expect_Value_mean,Expect_Value_sum, how='left', on=['Date_received','Coupon_id'])
temp2 = pd.merge(Expect_Value_mean_user,Expect_Value_sum_user, how='left', on=['User_id','Coupon_id'])
df = pd.merge(df,temp1, how='left', on=['Date_received','Coupon_id'])
df = pd.merge(df,temp2, how='left', on=['User_id','Coupon_id'])
Discount_rate_MerchantSum = df.groupby(['Merchant_id'])['Discount_rate'].sum().reset_index()
Discount_rate_MerchantSum.columns = ['Merchant_id', 'Discount_rate_MerchantSum']
df =pd.merge(df, Discount_rate_MerchantSum, how='left', on=['Merchant_id'])
Discount_rate_MerchantMean = df.groupby(['Merchant_id'])['Discount_rate'].mean().reset_index()
Discount_rate_MerchantMean.columns = ['Merchant_id', 'Discount_rate_MerchantMean']
df =pd.merge(df, Discount_rate_MerchantMean, how='left', on=['Merchant_id'])
#-------------------------------------------------------------
# 將資料最大最小化
df = MinMaxScaler().fit_transform(df)
train_num = train_y.shape[0]
real_train_x = df[:train_num]
real_test_x = df[train_num:]
#以下為用隨機森林做特徵選取
# 建立模型 (使用 20 顆樹，每棵樹的最大深度為 4)
# clf = RandomForestClassifier(n_estimators=20,max_depth = 4)

# 訓練模型
# clf.fit(real_train_x, train_y)
# threshold = 0.1
# importances = clf.feature_importances_
# print(importances)
# df_select = df[:, importances > threshold]
# print(df_select.shape[1])
# real_train_x= df_select[:train_num]
# real_test_x = df_select[train_num:]
#-------------------------------------------

#以下為模型擬合資料+輸出預測結果
# lr = LogisticRegression(tol=0.001, penalty='l2', fit_intercept=True, C=1.0)
# rf = RandomForestClassifier(n_estimators=100, min_samples_split=2, min_samples_leaf=1, 
#                             max_features='sqrt', max_depth=6, bootstrap=True)
xgb = XGBClassifier()
# meta_estimator = XGBClassifier(n_estimators = 50,min_child_weight = 3, max_depth = 3,subsample = 0.8,learning_rate = 0.3)
# sclf = StackingClassifier(classifiers=[lr, xgb, rf],
#                           use_probas=True,
#                           average_probas=False,
#                           meta_classifier=meta_estimator)
# sclf.fit(real_train_x, train_y)
# y_test_pred = sclf.predict_proba(real_test_x)
modelfit = xgb.fit(real_train_x,train_y)
# accuracy = np.mean(cross_val_score(modelfit, real_train_x, train_y, cv=3, scoring='accuracy'))
# print("XGBClassifier accuracy: ", accuracy)
y_test_pred = modelfit.predict_proba(real_test_x)
test1 = dftest.loc[:,['Discount_rate','Expect_Value']].copy()
test1['pred_prob'] = y_test_pred[:, 1]
output = pd.concat((dftest[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.to_csv("xgb_MinMaxScaler_Modify3.csv", header=["uid", "label"], index=False) # submission format
#-----------------------------------------------------------------------------------
#以下為資料視覺化+數據清洗
# df = pd.merge(df, Expect_Value_mean, how='left', on=['User_id'])
# df.head(10)
# print(df['User_id'].nunique())
# print(df['Merchant_id'].nunique())
# print(df['Coupon_id'].nunique())
# df['Expect_Value_log'] = np.log1p(df["Expect_Value"])
# df['Distance_log'] = np.log1p(df['Distance'])
# df['received_month'].hist()
# plt.show()
# df['received_day'].hist()
# plt.show()
# plt.boxplot(df['Expect_Value_log'],
#             notch=True,  # notch shape
#             sym='bs',     # blue squares for outliers
#             vert=True,   # vertical box aligmnent
#             patch_artist=True)   # fill with color 
# plt.title('Expect_Value_log') 
# plt.show()


#遇到均值編碼時 連id也一起算平均 所以merge不起來

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [3]:
import numpy as np
import pandas as pd

xgb_MinMaxScaler_Modify2 = pd.read_csv('xgb_MinMaxScaler_Modify2.csv')
StackingClassifier_Modify = pd.read_csv('StackingClassifier_Modify.csv')
xgb_StandardScaler_modify = pd.read_csv('xgb_StandardScaler_modify.csv')

blend = xgb_MinMaxScaler_Modify2.copy()
blend['label'] =0.2*xgb_MinMaxScaler_Modify2['label'] + 0.5*StackingClassifier_Modify['label'] + 0.3*xgb_StandardScaler_modify['label']
blend.to_csv("blend.csv", index=False)




