In [17]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "./input/"

In [18]:
dfoff = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
dftest = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
dftest = dftest[~dftest.Coupon_id.isna()]
dftest.reset_index(drop=True, inplace=True)
print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [19]:
## Creat target label 
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

dfoff["label"] = dfoff.apply(label, axis=1)
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [20]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7

dfoff['weekday'] = dfoff['Date_received'].apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
dfoff['weekday_type'] = dfoff['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to trainset
dftest['weekday_type'] = dftest['weekday'].astype('str').apply(lambda x : 1 if x in [6,7] else 0 ) # apply to testset

In [21]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(dfoff['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(dftest['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [22]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)

In [23]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = dfoff[dfoff['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum()))

Train size: 667753, #positive: 32472
Valid size: 79216, #positive: 3832


In [24]:
original_feature = ['discount_rate',
                    'discount_type',
                    'discount_man', 
                    'discount_jian',
                    'Distance', 
                    'weekday', 
                    'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)

14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [25]:
predictors = original_feature
print(predictors)

def check_model(data, predictors):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(data[predictors], 
                                  data['label'])
    
    return grid_search

['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'Distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [26]:
model = check_model(train, predictors)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:  4.0min finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [27]:
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

  Xt = transform.transform(Xt)


In [28]:
from sklearn.metrics import roc_auc_score, accuracy_score
auc_score = roc_auc_score(y_true=valid.label, y_score=y_valid_pred[:,1])
acc = accuracy_score(y_true=valid.label, y_pred=y_valid_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.743, Accuracy: 0.952


In [29]:
targetset = dftest.copy()
print(targetset.shape)
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)
testset = targetset[predictors].copy()

y_test_pred = model.predict_proba(testset[predictors])
test1 = testset.copy()
test1['pred_prob'] = y_test_pred[:, 1]
print(test1.shape)

(306313, 19)
(306313, 15)


  Xt = transform.transform(Xt)


In [30]:
output = pd.concat((targetset[["User_id", "Coupon_id", "Date_received"]], test1["pred_prob"]), axis=1)
print(output.shape)

output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

(306313, 4)


In [31]:
### NOTE: YOUR SUBMITION FILE SHOULD HAVE COLUMN NAME: uid, label
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
# out.to_csv("baseline_example.csv", header=["uid", "label"], index=False) # submission format
out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.114213
1,1000020_8192_20160513,0.088202
2,1000065_1455_20160527,0.067612
3,1000085_8067_20160513,0.070576
4,1000086_2418_20160613,0.061117


In [32]:
out.to_csv('mid.csv', index=False)

---

In [36]:
import numpy as np 
import pandas as pd 
import os
print(os.listdir("./input"))
import copy
import datetime
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from lightgbm import LGBMRegressor
from sklearn.linear_model import LogisticRegression
from mlxtend.regressor import StackingRegressor
import sklearn.metrics
import warnings

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

['column_description.csv', 'column_description_en.csv', 'sample_submission.csv', 'test_offline.csv', 'train_offline.csv']


In [38]:
train_df = pd.read_csv('./input/train_offline.csv')
test_df = pd.read_csv('./input/test_offline.csv')

test_ids = test_df[['User_id','Coupon_id','Date_received']]

print(f' training shape : {train_df.shape} ')
print(f' testing shape : {test_df.shape} ')

 training shape : (1160742, 7) 
 testing shape : (594142, 6) 


In [39]:
def CheckMissingVals(data):
    for col in data.columns:
        if np.sum(data[col].isnull()) != 0:
            print(f' Missing values in {col} : {np.sum(data[col].isnull())}')

print("Count of missing data in training dataset: ")
CheckMissingVals(train_df)
print('\n')
print("Count of missing data in testing dataset: ")
CheckMissingVals(test_df)

Count of missing data in training dataset: 
 Missing values in Coupon_id : 413773
 Missing values in Discount_rate : 413773
 Missing values in Distance : 69826
 Missing values in Date_received : 413773
 Missing values in Date : 704033


Count of missing data in testing dataset: 
 Missing values in Coupon_id : 287829
 Missing values in Discount_rate : 287829
 Missing values in Distance : 36177
 Missing values in Date_received : 287829


In [40]:
total_df = pd.concat([train_df, test_df], axis = 0)

In [41]:
DistanceFilling_UM = total_df.groupby(['User_id','Merchant_id'])['Distance'].mean().reset_index()
DistanceFilling_UM.columns = ['User_id','Merchant_id','DistanceFilling_UM']
DistanceFilling_U = total_df.groupby(['User_id'])['Distance'].mean().reset_index()
DistanceFilling_U.columns = ['User_id','DistanceFilling_U']
DistanceFilling_M = total_df.groupby(['Merchant_id'])['Distance'].mean().reset_index()
DistanceFilling_M.columns = ['Merchant_id','DistanceFilling_M']

total_df = pd.merge(total_df,DistanceFilling_UM,on = ['User_id','Merchant_id'], how = 'left')
total_df = pd.merge(total_df,DistanceFilling_U,on = ['User_id'], how = 'left')
total_df = pd.merge(total_df,DistanceFilling_M,on = ['Merchant_id'], how = 'left')
total_df.head(10)

Unnamed: 0,Coupon_id,Date,Date_received,Discount_rate,Distance,Merchant_id,User_id,DistanceFilling_UM,DistanceFilling_U,DistanceFilling_M
0,,20160217.0,,,0.0,2632,1439408,0.0,0.142857,1.54
1,8591.0,,20160217.0,20:1,0.0,2632,1439408,0.0,0.142857,1.54
2,1078.0,,20160319.0,20:1,0.0,2632,1439408,0.0,0.142857,1.54
3,7610.0,,20160429.0,200:20,0.0,3381,1832624,0.0,0.0,2.713991
4,11951.0,,20160129.0,200:20,1.0,3381,2029232,1.0,0.2,2.713991
5,9776.0,,20160129.0,10:5,2.0,3381,2223968,2.0,2.0,2.713991
6,12034.0,,20160207.0,100:10,,2099,73611,,,1.844211
7,5054.0,,20160421.0,200:30,10.0,1569,163606,10.0,10.0,6.163654
8,7802.0,,20160130.0,200:20,10.0,4833,3273056,10.0,10.0,6.927665
9,7610.0,,20160412.0,200:20,2.0,3381,94107,2.0,2.0,2.713991


In [42]:
def DistanceMissingFill(data):
    if np.isnan(data['Distance']):
        if not np.isnan(data['DistanceFilling_UM']):
            return int(data['DistanceFilling_UM'])
        elif not np.isnan(data['DistanceFilling_U']):
            return int(data['DistanceFilling_U'])
        elif not  np.isnan(data['DistanceFilling_M']):
            return int(data['DistanceFilling_M'])
    return data['Distance']

total_df['Distance'] = total_df.apply(DistanceMissingFill, axis = 1)

In [43]:
total_df = total_df.drop(['DistanceFilling_UM','DistanceFilling_U','DistanceFilling_M'], axis = 1)
train_df = total_df[:len(train_df)]
test_df = total_df[len(train_df):]

print("Count of missing data in training dataset: ")
CheckMissingVals(train_df)
print('\n')
print("Count of missing data in testing dataset: ")
CheckMissingVals(test_df)

Count of missing data in training dataset: 
 Missing values in Coupon_id : 413773
 Missing values in Date : 704033
 Missing values in Date_received : 413773
 Missing values in Discount_rate : 413773
 Missing values in Distance : 7471


Count of missing data in testing dataset: 
 Missing values in Coupon_id : 287829
 Missing values in Date : 594142
 Missing values in Date_received : 287829
 Missing values in Discount_rate : 287829
 Missing values in Distance : 980


In [44]:
# coupon id
train_df = train_df[~train_df.Coupon_id.isnull()]
test_df = test_df[~test_df.Coupon_id.isnull()]

print(f' training shape after dropping unwant rows: {train_df.shape} ')
print(f' testing shape after dropping unwant rows: {test_df.shape} ')

 training shape after dropping unwant rows: (746969, 7) 
 testing shape after dropping unwant rows: (306313, 7) 


In [45]:
test_df = test_df.drop(['Date'], axis = 1)

In [46]:
def fifteenDaysChecking(data):
    if not np.isnan(data['Date']):
        #Means the user had used the coupon
        time_diff = pd.to_datetime(data['Date'], format = "%Y%m%d") - pd.to_datetime(data['Date_received'],
                                                                                   format = "%Y%m%d")
        if time_diff <= pd.Timedelta(15,'D'):
            return 1
    return 0
train_df['label'] = train_df.apply(fifteenDaysChecking, axis = 1)
print(train_df['label'].value_counts())
train_df.head(5)

0    710665
1     36304
Name: label, dtype: int64


Unnamed: 0,Coupon_id,Date,Date_received,Discount_rate,Distance,Merchant_id,User_id,label
1,8591.0,,20160217.0,20:1,0.0,2632,1439408,0
2,1078.0,,20160319.0,20:1,0.0,2632,1439408,0
3,7610.0,,20160429.0,200:20,0.0,3381,1832624,0
4,11951.0,,20160129.0,200:20,1.0,3381,2029232,0
5,9776.0,,20160129.0,10:5,2.0,3381,2223968,0


In [47]:
train_df = train_df.drop(['Date'], axis = 1)
#Check the missing values again
print("Count of missing data in training dataset: ")
CheckMissingVals(train_df)
print('\n')
print("Count of missing data in testing dataset: ")
CheckMissingVals(test_df)

Count of missing data in training dataset: 
 Missing values in Distance : 7471


Count of missing data in testing dataset: 
 Missing values in Distance : 980


In [48]:
#Merge training & testing dataset for processing
train_label = train_df.label
train_df = train_df.drop(['label'], axis = 1)
total_df = pd.concat([train_df,test_df], axis = 0)
total_df.head(10)

Unnamed: 0,Coupon_id,Date_received,Discount_rate,Distance,Merchant_id,User_id
1,8591.0,20160217.0,20:1,0.0,2632,1439408
2,1078.0,20160319.0,20:1,0.0,2632,1439408
3,7610.0,20160429.0,200:20,0.0,3381,1832624
4,11951.0,20160129.0,200:20,1.0,3381,2029232
5,9776.0,20160129.0,10:5,2.0,3381,2223968
6,12034.0,20160207.0,100:10,1.0,2099,73611
7,5054.0,20160421.0,200:30,10.0,1569,163606
8,7802.0,20160130.0,200:20,10.0,4833,3273056
9,7610.0,20160412.0,200:20,2.0,3381,94107
11,7531.0,20160327.0,20:5,0.0,8390,253750


In [49]:
#Do a copy of original data , in case we do something wrong for the original data
temp_total_df = copy.deepcopy(total_df)

In [50]:
#Check the unique counts of each features
total_df.nunique()

Coupon_id          9738
Date_received       167
Discount_rate        45
Distance             11
Merchant_id        5599
User_id          510698
dtype: int64

In [51]:
#Convert Date_received to str type for datetime processing
total_df['Date_received'] = total_df['Date_received'].astype('int').astype('str')
#Convert Date_received to other time information
total_df['Date_received'] = total_df['Date_received'].apply(lambda x : datetime.datetime.strptime(x,"%Y%m%d"))
total_df['Month_received'] = total_df['Date_received'].apply(lambda x : datetime.datetime.strftime(x,"%m")).astype("int64")
total_df['Day_received'] = total_df['Date_received'].apply(lambda x : datetime.datetime.strftime(x,"%d")).astype("int64")

total_df.head(10)

Unnamed: 0,Coupon_id,Date_received,Discount_rate,Distance,Merchant_id,User_id,Month_received,Day_received
1,8591.0,2016-02-17,20:1,0.0,2632,1439408,2,17
2,1078.0,2016-03-19,20:1,0.0,2632,1439408,3,19
3,7610.0,2016-04-29,200:20,0.0,3381,1832624,4,29
4,11951.0,2016-01-29,200:20,1.0,3381,2029232,1,29
5,9776.0,2016-01-29,10:5,2.0,3381,2223968,1,29
6,12034.0,2016-02-07,100:10,1.0,2099,73611,2,7
7,5054.0,2016-04-21,200:30,10.0,1569,163606,4,21
8,7802.0,2016-01-30,200:20,10.0,4833,3273056,1,30
9,7610.0,2016-04-12,200:20,2.0,3381,94107,4,12
11,7531.0,2016-03-27,20:5,0.0,8390,253750,3,27


In [52]:
total_df['Month_Cycle'] = total_df.Day_received.map(lambda x : 1 if x <= 15 else 0)

In [53]:
# Record the total days to June
def set2June(data):
    if data['Month_received'] < 6:
        return (6-data['Month_received'])*30 - data['Day_received']
    return 1
        
total_df['CloseToJune'] = total_df.apply(set2June , axis = 1)


In [54]:
total_df.Discount_rate.value_counts()

30:5       270712
100:10     182554
200:20     111046
20:5        91013
20:1        51705
50:5        47379
100:30      38196
200:30      29327
300:30      28979
50:10       28452
10:5        25925
0.95        20568
10:1        17842
30:1        17654
150:20      17437
100:20      14297
30:10       12692
50:20        8203
0.9          8085
200:50       5585
150:10       5325
100:5        5053
0.8          3441
50:1         3354
5:1          2526
100:50       1774
150:30        654
0.85          649
200:10        575
100:1         537
20:10         514
150:50        306
300:50        206
0.5           186
0.75          121
0.2           110
0.6            58
200:5          57
300:20         56
0.7            54
30:20          24
300:10         23
200:100        12
50:30           9
150:5           7
Name: Discount_rate, dtype: int64

In [55]:
total_df['DiscountType'] = total_df.Discount_rate.map(lambda x: 1 if (':' in x) else 0)
total_df.DiscountType.value_counts()

1    1020010
0      33272
Name: DiscountType, dtype: int64

In [56]:
total_df['DiscountBound'] = total_df.Discount_rate.map(lambda x: int(x.split(':')[0]) if (':' in x) else 0)
total_df['DirectPriceCut'] = total_df.Discount_rate.map(lambda x: int(x.split(':')[1]) if (':' in x) else 0)
total_df['DiscountRatio'] = total_df.Discount_rate.map(lambda x: (1 - float(x.split(':')[1])/float(x.split(':')[0])) if(':' in x) else float(x) )
total_df['MoneyCost'] = total_df['DiscountBound'] - total_df['DirectPriceCut']
total_df = total_df.drop(['Discount_rate'], axis = 1)
total_df.head(10)

Unnamed: 0,Coupon_id,Date_received,Distance,Merchant_id,User_id,Month_received,Day_received,Month_Cycle,CloseToJune,DiscountType,DiscountBound,DirectPriceCut,DiscountRatio,MoneyCost
1,8591.0,2016-02-17,0.0,2632,1439408,2,17,0,103,1,20,1,0.95,19
2,1078.0,2016-03-19,0.0,2632,1439408,3,19,0,71,1,20,1,0.95,19
3,7610.0,2016-04-29,0.0,3381,1832624,4,29,0,31,1,200,20,0.9,180
4,11951.0,2016-01-29,1.0,3381,2029232,1,29,0,121,1,200,20,0.9,180
5,9776.0,2016-01-29,2.0,3381,2223968,1,29,0,121,1,10,5,0.5,5
6,12034.0,2016-02-07,1.0,2099,73611,2,7,1,113,1,100,10,0.9,90
7,5054.0,2016-04-21,10.0,1569,163606,4,21,0,39,1,200,30,0.85,170
8,7802.0,2016-01-30,10.0,4833,3273056,1,30,0,120,1,200,20,0.9,180
9,7610.0,2016-04-12,2.0,3381,94107,4,12,1,48,1,200,20,0.9,180
11,7531.0,2016-03-27,0.0,8390,253750,3,27,0,63,1,20,5,0.75,15


In [None]:
cuttingArr = np.array([-1,80,150,250,total_df.DiscountBound.max()+1])
total_df['DiscountBound_Group'] = pd.cut(total_df.DiscountBound, cuttingArr)

print(total_df.DiscountBound_Group.value_counts())
print('\n')
#encoding intervals into integer
DiscountBound_intervals = total_df.DiscountBound_Group.unique()
total_df.DiscountBound_Group.replace(to_replace = DiscountBound_intervals[0],value = 0, inplace = True)
total_df.DiscountBound_Group.replace(to_replace = DiscountBound_intervals[1],value = 1, inplace = True)
total_df.DiscountBound_Group.replace(to_replace = DiscountBound_intervals[2],value = 2, inplace = True)
total_df.DiscountBound_Group.replace(to_replace = DiscountBound_intervals[3],value = 3, inplace = True)
print(total_df.DiscountBound_Group.value_counts())

(-1, 80]      611276
(80, 150]     266140
(150, 250]    146602
(250, 301]     29264
Name: DiscountBound_Group, dtype: int64




In [None]:
Same_Merchant_User_received = total_df[['User_id','Merchant_id']]
Same_Merchant_User_received['temp'] = 1
Same_Merchant_User_received = Same_Merchant_User_received.groupby(['User_id','Merchant_id']).agg('sum').reset_index()
Same_Merchant_User_received.columns = ['User_id','Merchant_id','Same_Merchant_User_received']

total_df = pd.merge(total_df, Same_Merchant_User_received, on = ['User_id','Merchant_id'], how = 'left')

In [59]:

Total_Coupon_User_received = total_df.groupby(['User_id'])['Coupon_id'].count().reset_index()
Total_Coupon_User_received.columns = ['User_id', 'Total_Coupon_User_received']
total_df = pd.merge(total_df,Total_Coupon_User_received, on = ['User_id'], how = 'left')

In [58]:
Same_Coupon_User_received = total_df[['User_id', 'Coupon_id']]
Same_Coupon_User_received['temp'] = 1
Same_Coupon_User_received = Same_Coupon_User_received.groupby(['User_id','Coupon_id']).agg('sum').reset_index()
Same_Coupon_User_received.columns = ['User_id','Coupon_id','Same_Coupon_User_received']

total_df = pd.merge(total_df,Same_Coupon_User_received , on = ['User_id', 'Coupon_id'], how = 'left')