In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the CSV paths
# read later in this notebook are all located under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 商品回購預測

資料前處理：
由於原始資料是以所有交易商品作為主體,將資料型態整理成一個 dataframe,當中只提取曾經買過衛生紙的消費者的所有訂單交易資訊,並由原始資料新增變數,變數有以下四種:
I. 該商品上次購買至現在的時間長
II. 商品 30 天內的購買頻率
III. 顧客購買商品時為星期幾
IV. 購買的時段(一天以每 4 小時劃分)

模型配適:
由於當中有購買衛生紙的資料和未購買衛生紙的資料比例相差巨大,因此採用半監督式學習(Semi-supervised Learning)的方式,搭配下採樣(undersampling)隨機取出 10000 筆衛生紙交易資訊與 15000 筆非衛生紙交易資訊進行機器學習模型的訓練。由於單一模型訓練不佳,因此我們使用 KNN、Logistic Regression、Random Forest、MLP、LGBM、XGBoost 六個模型預測,並從中挑選適合該資料的模型,只要其中 2 個模型判斷該特徵為會購買衛生紙,我們即判定為會購買衛生紙,並重複抽樣 5 次取得平均。

# 預測購買週期

以預測消費者是否會在 30 天內購買為例

資料前處理：
    切割資料：切割成三份資料
            以1~3月的資料為訓練集，預測消費者4月是否回購
            以5~7月的資料為訓練集，預測消費者8月是否回購
            以9~11月的資料為訓練集，預測消費者12月是否回購
    取RFM資訊：將第一、二、三份切割資料分別探索其各自的 RFM 資訊,並將各自的 R、F、M 資訊分別使用正規化處理、正規化處理後使用Kmeans 分群的分群數,因此共有六個自變數,並分別獲得第一、二、三 RFM 資訊資料。
模型配適:
    將訓練及測試資料比例取 8:2,使用五種機器學習模型,分別為 Random Forest Classifier(RF)、Multilayer Perceptron Classifier(MLP)、eXtreme Gradient Boosting Classifier(XGBoost)、Light Gradient Boosting Machine Classifier(LightGBM)、Logistic Regression Classifier(LR)進行配適。

In [None]:
import pandas as pd
import numpy as np
import sklearn.metrics as metrics
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import svm
from xgboost import XGBClassifier
import lightgbm as lgb

In [None]:
# Load the raw PChome transaction export and normalise the columns used below.
pchome = pd.read_csv('/content/drive/MyDrive/Co-lab/dataset/2021datapilot_PChome.csv')
# The first three columns arrive with pandas' placeholder header names.
pchome = pchome.rename(columns={'Unnamed: 0':'MEM_ID', 'Unnamed: 1':'POSTAL_CD','Unnamed: 2':'ORDER_ID'})
pchome['DATE_CD'] = pd.to_datetime(pchome['DATE_CD'])
pchome['TIME_CD'] = pd.to_datetime(pchome['TIME_CD'])
# regex=False: '$' is the end-of-string anchor when read as a regular
# expression, and the default of `regex` differs between pandas versions.
# Being explicit guarantees that the literal characters are stripped.
pchome['PRICE'] = (
    pchome['PRICE']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype('int')
)
# Encode the PRIME flag: '是' -> 1, '否' -> 0 (other values are left untouched).
pchome['PRIME'] = pchome['PRIME'].replace({'是': 1, '否': 0})

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Read tissue.csv from the mounted Drive folder into `first`.
# NOTE(review): presumably a previously exported per-order feature table - confirm.
# `first` is reassigned by the feature-building loop further down whenever that
# loop finds at least one matching customer, and is consumed by the modelling cells.
first = pd.read_csv('/content/drive/MyDrive/Co-lab/dataset/tissue.csv')

In [None]:
def _goods_mask(sub,pchome):
  """Return a boolean Series: True where ``GOODS`` contains the literal text ``sub``.

  Rows whose ``GOODS`` value is missing are treated as non-matching.  (The
  previous ``str.find(sub) != -1`` test evaluated to True for missing values
  because ``NaN != -1``, so rows without any product text counted as purchases.)
  """
  return pchome['GOODS'].str.contains(sub, regex=False, na=False)

def num(sub,pchome):
  """Count the distinct ``MEM_ID`` values that have at least one row whose
  ``GOODS`` text contains ``sub``.

  The count matches the length of the customer list that ``inform`` indexes,
  so ``range(num(sub, pchome))`` is a valid set of positions for ``inform``.
  """
  customer=np.unique(pchome.loc[_goods_mask(sub,pchome),'MEM_ID'])
  return(len(customer))

def inform(sub,pchome,num):     # fetch every transaction of one customer
  """Return all rows of the ``num``-th customer who bought ``sub``.

  Parameters
  ----------
  sub : str
      Literal substring looked up in the ``GOODS`` column.
  pchome : pandas.DataFrame
      Transaction table with at least ``GOODS`` and ``MEM_ID`` columns.
  num : int
      Position of the customer in the sorted array of matching ``MEM_ID`` values.

  Returns
  -------
  pandas.DataFrame
      A copy of that customer's rows, including the 0/1 column ``milk_tea``
      (1 where ``GOODS`` contains ``sub``).

  Notes
  -----
  As before, the ``milk_tea`` column is also written onto ``pchome`` itself.
  It is now written in a single assignment instead of a chained
  ``frame[col][mask] = 1`` assignment, which pandas does not guarantee to
  write through to the frame.

  Raises
  ------
  IndexError
      If ``num`` is outside the range of matching customers.
  """
  mask=_goods_mask(sub,pchome)
  customer=np.unique(pchome.loc[mask,'MEM_ID'])

  pchome['milk_tea']=mask.astype('int64')
  test=pchome.loc[pchome['MEM_ID']==customer[num]].copy()
  return(test)

In [None]:
def cus(test):    # build the per-order feature frame for one customer
  """Turn one customer's transaction rows into one feature row per order.

  Parameters
  ----------
  test : pandas.DataFrame
      Transaction rows (as returned by ``inform``) with the columns
      ``ORDER_ID``, ``DATE_CD`` and ``TIME_CD`` (datetime) and the 0/1 flag
      ``milk_tea``.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``ORDER_ID`` with the columns, in this order:

      * ``DATE_CD``   - latest date within the order
      * ``diff``      - timedelta since the previous order
      * ``diff_day``  - the same gap in whole days
      * ``milk_tea``  - 1 if any line of the order carries the flag
      * ``Frequency`` - number of earlier orders less than 31 days back whose
        ``milk_tea`` flag is non-zero
      * ``weekday``   - ISO weekday of ``DATE_CD`` (Monday=1 ... Sunday=7)
      * ``TIME_CD``   - latest time within the order
      * ``hour``      - time-of-day slot 1..6 for hours 0-4, 5-8, 9-12, 13-16,
        17-20 and 21-23

      The first order is dropped because it has no previous order.

  Notes
  -----
  Orders are processed in ``groupby`` order, i.e. sorted by ``ORDER_ID``.
  NOTE(review): the gaps are only meaningful if that order is chronological -
  confirm against the data.

  Hour 17 is now assigned to slot 5.  The previous bounds (``> 17`` and
  ``< 21``) matched no slot for it, so the raw value 17 leaked into ``hour``.
  """
  orders=test.groupby(['ORDER_ID']).agg({'DATE_CD':'max','milk_tea':'max','TIME_CD':'max'})
  n_orders=len(orders)

  # Gap to the previous order; the first order has none (NaT / NaN).
  gap=orders['DATE_CD'].diff()
  gap_days=gap.dt.days
  days=gap_days.to_numpy()
  bought=orders['milk_tea'].to_numpy()

  # Row 0 keeps a placeholder; it is removed before returning.
  frequency=[100]*n_orders
  for row in range(1,n_orders):
    span=days[row]      # days between order `row` and order `r-1`
    r=row
    count=0             # earlier orders that are less than 31 days back
    while span<31:
      count+=1
      if r==1:          # reached the first order: nothing further back
        break
      r-=1
      span+=days[r]
    frequency[row]=int(np.count_nonzero(bought[row-count:row]))

  weekday=(orders['DATE_CD'].dt.dayofweek+1).astype('int64')

  # Lower bounds of slots 2..6; hours below 5 fall into slot 1.
  slot_edges=[5,9,13,17,21]
  hour_slot=np.digitize(orders['TIME_CD'].dt.hour.to_numpy(),slot_edges)+1

  final=pd.DataFrame({
    'DATE_CD':orders['DATE_CD'],
    'diff':gap,
    'diff_day':gap_days,
    'milk_tea':orders['milk_tea'],
    'Frequency':frequency,
    'weekday':weekday,
    'TIME_CD':orders['TIME_CD'],
    'hour':hour_slot,
  },index=orders.index)
  return(final.iloc[1:].copy())


In [None]:
# Build the modelling table `first`: one per-order feature frame for every
# customer whose GOODS text contains `sub`, stacked row-wise.
sub='咖啡'
number=num(sub,pchome)
# Collect the per-customer frames and concatenate once at the end; calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
customer_frames=[]
for i in range(number):
  print(i)
  customer_frames.append(cus(inform(sub,pchome,i)))
# With no matching customer, `first` keeps whatever value it had before this cell.
if customer_frames:
  first=pd.concat(customer_frames,axis=0)

## 測試關聯分析

In [None]:
def _goods_mask_cor(sub,pchome):
  """Return a boolean Series: True where ``GOODS`` contains the literal text ``sub``.

  Rows whose ``GOODS`` value is missing are treated as non-matching.  (The
  previous ``str.find(sub) != -1`` test evaluated to True for missing values
  because ``NaN != -1``.)
  """
  return pchome['GOODS'].str.contains(sub, regex=False, na=False)

def num_cor(sub,pchome):
  """Count the distinct ``MEM_ID`` values that have at least one row whose
  ``GOODS`` text contains ``sub``.

  The count matches the length of the customer list indexed by
  ``inform_correlation``.
  """
  customer=np.unique(pchome.loc[_goods_mask_cor(sub,pchome),'MEM_ID'])
  return(len(customer))

def inform_correlation(sub,pchome,num,sub2,sub3,sub4):     # fetch every transaction of one customer
  """Return all rows of the ``num``-th customer who bought ``sub``, flagged for
  four products.

  Parameters
  ----------
  sub : str
      Literal substring that selects the customers; flagged in ``milk_tea``.
  pchome : pandas.DataFrame
      Transaction table with at least ``GOODS`` and ``MEM_ID`` columns.
  num : int
      Position of the customer in the sorted array of matching ``MEM_ID`` values.
  sub2, sub3, sub4 : str
      Further literal substrings, flagged in the columns ``sub2``, ``sub3``
      and ``sub4`` respectively.

  Returns
  -------
  pandas.DataFrame
      A copy of that customer's rows including the four 0/1 flag columns.

  Notes
  -----
  As before, the four flag columns are also written onto ``pchome`` itself.
  Each is now written in a single assignment instead of a chained
  ``frame[col][mask] = 1`` assignment, which pandas does not guarantee to
  write through to the frame.

  Raises
  ------
  IndexError
      If ``num`` is outside the range of matching customers.
  """
  customer=np.unique(pchome.loc[_goods_mask_cor(sub,pchome),'MEM_ID'])

  for column,keyword in (('milk_tea',sub),('sub2',sub2),('sub3',sub3),('sub4',sub4)):
    pchome[column]=_goods_mask_cor(keyword,pchome).astype('int64')

  test=pchome.loc[pchome['MEM_ID']==customer[num]].copy()
  return(test)
def cus_cor(test):    # build the per-order feature frame for one customer
  """Turn one customer's transaction rows into one feature row per order,
  including the flags of the three associated products.

  Parameters
  ----------
  test : pandas.DataFrame
      Transaction rows (as returned by ``inform_correlation``) with the
      columns ``ORDER_ID``, ``DATE_CD`` and ``TIME_CD`` (datetime) and the 0/1
      flags ``milk_tea``, ``sub2``, ``sub3`` and ``sub4``.

  Returns
  -------
  pandas.DataFrame
      Indexed by ``ORDER_ID`` with the columns, in this order:

      * ``DATE_CD``   - latest date within the order
      * ``diff``      - timedelta since the previous order
      * ``diff_day``  - the same gap in whole days
      * ``milk_tea``  - 1 if any line of the order carries the flag
      * ``Frequency`` - number of earlier orders less than 31 days back whose
        ``milk_tea`` flag is non-zero
      * ``weekday``   - ISO weekday of ``DATE_CD`` (Monday=1 ... Sunday=7)
      * ``TIME_CD``   - latest time within the order
      * ``hour``      - time-of-day slot 1..6 for hours 0-4, 5-8, 9-12, 13-16,
        17-20 and 21-23
      * ``sub2``, ``sub3``, ``sub4`` - per-order maximum of each flag

      The first order is dropped because it has no previous order.

  Notes
  -----
  Orders are processed in ``groupby`` order, i.e. sorted by ``ORDER_ID``.
  NOTE(review): the gaps are only meaningful if that order is chronological -
  confirm against the data.

  Hour 17 is now assigned to slot 5.  The previous bounds (``> 17`` and
  ``< 21``) matched no slot for it, so the raw value 17 leaked into ``hour``.
  """
  orders=test.groupby(['ORDER_ID']).agg({
    'DATE_CD':'max','milk_tea':'max','TIME_CD':'max',
    'sub2':'max','sub3':'max','sub4':'max',
  })
  n_orders=len(orders)

  # Gap to the previous order; the first order has none (NaT / NaN).
  gap=orders['DATE_CD'].diff()
  gap_days=gap.dt.days
  days=gap_days.to_numpy()
  bought=orders['milk_tea'].to_numpy()

  # Row 0 keeps a placeholder; it is removed before returning.
  frequency=[100]*n_orders
  for row in range(1,n_orders):
    span=days[row]      # days between order `row` and order `r-1`
    r=row
    count=0             # earlier orders that are less than 31 days back
    while span<31:
      count+=1
      if r==1:          # reached the first order: nothing further back
        break
      r-=1
      span+=days[r]
    frequency[row]=int(np.count_nonzero(bought[row-count:row]))

  weekday=(orders['DATE_CD'].dt.dayofweek+1).astype('int64')

  # Lower bounds of slots 2..6; hours below 5 fall into slot 1.
  slot_edges=[5,9,13,17,21]
  hour_slot=np.digitize(orders['TIME_CD'].dt.hour.to_numpy(),slot_edges)+1

  final=pd.DataFrame({
    'DATE_CD':orders['DATE_CD'],
    'diff':gap,
    'diff_day':gap_days,
    'milk_tea':orders['milk_tea'],
    'Frequency':frequency,
    'weekday':weekday,
    'TIME_CD':orders['TIME_CD'],
    'hour':hour_slot,
    'sub2':orders['sub2'],
    'sub3':orders['sub3'],
    'sub4':orders['sub4'],
  },index=orders.index)
  return(final.iloc[1:].copy())

In [None]:
# Count the customers who have at least one transaction whose GOODS text
# contains '奶茶' (milk tea); only the count is computed in this cell.
sub='奶茶'
number=num_cor(sub,pchome)

## 跑模型

In [None]:
# Chronological split of the feature table: orders before the cut-off date
# form the training pool (f_up), orders on or after it the test set (f_down).
f=first.drop(labels=['diff','TIME_CD'],axis=1)
time='2020-9-10'
f['DATE_CD']=pd.to_datetime(f['DATE_CD'])
cutoff=pd.to_datetime(time)
f_up=f[f['DATE_CD']<cutoff]
# '>=' rather than '>': orders dated exactly on the cut-off day used to be
# dropped from both halves.
f_down=f[f['DATE_CD']>=cutoff]
# The date itself is not a model feature.
f_up=f_up.drop(labels=['DATE_CD'],axis=1)
f_down=f_down.drop(labels=['DATE_CD'],axis=1)

In [None]:
# Separate the label column `milk_tea` from the features, for the
# pre-cutoff frame (train) and the post-cutoff frame (test).
train_y=f_up['milk_tea']
train_x=f_up.drop(columns=['milk_tea'])
test_y=f_down['milk_tea']
test_x=f_down.drop(columns=['milk_tea'])

In [None]:
# Candidate classifiers for the binary target `milk_tea`.
knn = KNeighborsClassifier(n_neighbors=3)
# Unpenalised logistic regression. The string spelling 'none' was deprecated
# in scikit-learn 1.2 and later removed; None is the supported spelling.
LR = LogisticRegression(penalty=None)
RF = RandomForestClassifier(n_estimators=100)
MLP = MLPClassifier(hidden_layer_sizes = (256,128), activation="relu", random_state=1)
# NOTE(review): SVM is defined here but not fitted in the cells that follow.
SVM = svm.SVC(gamma=0.001, C=10., kernel='linear', max_iter=10000, probability=True)
XGB = XGBClassifier(n_estimators=100, learning_rate= 0.3, max_depth=6)
# The target is 0/1, so the objective is 'binary'. The former
# application='multiclass' is an alias of `objective` and conflicts with a
# two-class label. The canonical names boosting_type / colsample_bytree replace
# the aliases boosting / feature_fraction. A non-positive max_depth means
# "no depth limit" in LightGBM.
LGBM = lgb.LGBMClassifier(objective='binary', boosting_type='gbdt', learning_rate=0.1,
                            max_depth=-5, colsample_bytree=0.5, random_state=42)

In [None]:
# Training set: draw 10000 positive and 15000 negative rows from the
# pre-cutoff frame, then shuffle them together.
from sklearn.utils import shuffle

# Index labels of each class inside f_up.
labels_pos=f_up.index[(f_up['milk_tea']==1).to_numpy()]
labels_neg=f_up.index[(f_up['milk_tea']==0).to_numpy()]

# np.random.choice draws with replacement by default, so a label can be picked
# more than once.
index1=np.random.choice(labels_pos,10000)
index0=np.random.choice(labels_neg,15000)
choice1=f_up.loc[index1]
choice0=f_up.loc[index0]

train=shuffle(pd.concat([choice1,choice0], axis=0))
# The test set is the whole post-cutoff frame, without resampling.
test=f_down

In [None]:
# Features and label for the resampled training set and the test set.
y_train=train['milk_tea']
x_train=train.drop(columns=['milk_tea'])
y_test=test['milk_tea']
x_test=test.drop(columns=['milk_tea'])

In [None]:
# Fit the six classifiers on the resampled training set, in a fixed order,
# then predict the test set with each of them in the same order.
for model in (knn, LR, RF, MLP, LGBM, XGB):
  model.fit(x_train, y_train)

(knn_predicted, LR_predicted, RF_predicted,
 MLP_predicted, LGBM_predicted, XGB_predicted) = [
  model.predict(x_test) for model in (knn, LR, RF, MLP, LGBM, XGB)
]

In [None]:
# Element-wise sum of the LightGBM, XGBoost and random-forest predictions,
# displayed as the cell output.
LGBM_predicted+XGB_predicted+RF_predicted

array([0, 0, 1, ..., 0, 1, 0])

In [None]:
# Ensemble vote over the LightGBM, XGBoost and random-forest predictions:
# the final label is 1 only when at least two of the three models predict 1.
# The previous clipping (values > 1 set to 1) left a single vote as 1, so one
# model alone was enough to produce a positive prediction.
ttt=LGBM_predicted+XGB_predicted+RF_predicted
ttt=(ttt>=2).astype(int)