In [1]:
'''
此模型针对新用户8到14天以及84到90天之内是否会活动，输入的特征为新用户在前7天的表现，得出的结果为该用户在8到14天是否有活动（0和1，用概率表示，1表示有活动）
输入特征（参见newuser_needs.csv)：
--输入特征根据前七天新用户的表现得到，以其中第二天为例子，剩下六天与之相同：

1. 第2天在JollyChic平台上停留的时间
2. 第2天浏览页面数
3. 第2天是否注册
4. 第2天加购次数
5. 第2天搜索次数
6. 第2天被埋点track到的次数
7. 第2天访问了多少次详情页（单个产品的详情页多次也算多次）
8. 第2天访问了多少产品，以详情页为准（同一产品的多次访问算一次）
9. 第2天访问的产品，涉及多少个大类
10. 第2天下了多少单
11. 第2天订单总额

--模型的预测标签为：

1. purchased_in_8_14---第8到14天内有下单：1为有下单，0为没有
2. active_in_8_14---第2周如果依旧有任何活动，都为1，否则为0
3. activity_half_reduced_8_14---第二周内被埋点track到的活动减半
4. purchased_in_8_90---第8到90天内有下单：1为有下单，0为没有
5. active_in_84_90---90天内最后一周仍有活动，为1，否则为0

注：搭建好的模型之后由于业务考虑只用来预测active_in_8_14。其他标签不用预测
=================================================================================


模型构建大体遵循：
输入数据--随机抽样--训练特征--调优参数--评估预测结果--输入新数据进行预测

=================================================================================
'''
'''
导入必要的library
'''
import csv
import numpy as np
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, roc_auc_score
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

from sklearn.externals import joblib
import pandas as pd
import sys
import os
sys.path.append('/home/nick/newuser')
from imblearn.over_sampling import SMOTE

In [2]:

'''
指定数据的路径并输入数据
'''
Location=r'C:/intern/new_user_prediction/newuser_model_sampled/0713_data_sampled.csv'
df = pd.read_csv(Location,header=0,low_memory=False,delimiter='\t')
df_platform=df.copy()


In [3]:

'''
新建df_platform并将android设为0，ios设为1
'''
df_platform.loc[df_platform['platform']=='android','platform']=0
df_platform.loc[df_platform['platform']=='ios','platform']=1
'''
新建备份，备份前17列的数据
'''
backup=df.iloc[:,0:17].copy()

'''
训练数据只保留第18列之后的
'''
df1=df.copy()
df1=df1.drop(df.columns[0:17],axis=1)

'''
由于不需要预测activity_half_reduced_8_14，所以直接让它等于active_in_8_14，如果需要预测activity_half_reduced_8_14，注释下面一行
'''
df1['activity_half_reduced_8_14'] = df1['active_in_8_14']

'''
将标签0和1互换，1表示8到14天活动没有减半，0表示8到14天活动减半
'''
df1.loc[df1['activity_half_reduced_8_14']==0, 'activity_half_reduced_8_14']=2
df1.loc[df1['activity_half_reduced_8_14']==1, 'activity_half_reduced_8_14']=0
df1.loc[df1['activity_half_reduced_8_14']==2, 'activity_half_reduced_8_14']=1

'''
插入新建的df_platform，让手机平台也作为其中一个特征
'''
df1.insert(loc=0, column='platform', value=df_platform['platform'])


In [4]:

'''
打印用户在两个手机型号上的数量分布
'''
print('andriod:{}'.format(len(backup[backup['platform']=='android'])))
print('ios:{}'.format(len(backup[backup['platform']=='ios'])))
print()
print('andriod=> 1:{}'.format(len(df1[(df1['platform']==0) & (df1['active_in_8_14']==1)])))
print('andriod=> 0:{}'.format(len(df1[(df1['platform']==0) & (df1['active_in_8_14']==0)])))

print('ios=> 1:{}'.format(len(df1[(df1['platform']==1) & (df1['active_in_8_14']==1)])))
print('ios=> 0:{}'.format(len(df1[(df1['platform']==1) & (df1['active_in_8_14']==0)])))


andriod:2388
ios:2611

andriod=> 1:669
andriod=> 0:1719
ios=> 1:979
ios=> 0:1632


In [5]:

'''
feature_result_split: 此函数用于分离特征和标签（注意标签有三个），data为上面得到的原始数据，df_num表示标签的种类
df_num=1: 表示active_in_8_14(第-4列)
df_num=2: 表示active_in_84_90（第-1列）
df_num=3: 表示activity_half_reduced_8_14（第-3列）
'''
def feature_result_split(data, df_num):
    '''
    把后边5列用作标签的列过滤掉，feature为生成的特征
    '''
    feature=data.iloc[:,:-5]

    '''
    提取三个标签单独成三列
    '''
    if df_num==1:
        result=data.iloc[:,-4]


    if df_num==2:
        result=data.iloc[:,-1]


    if df_num==3:
        result=data.iloc[:,-3]


    print('df:',df_num, '1 vs. 0:',len(result[result==1]), len(result[result==0]))

    '''
    输出生成的特征和标签
    '''
    return feature, result


In [6]:

#feature and result split
'''
调用feature_result_split函数进行标签-特征分离
'''
X_1, Y_1 = feature_result_split(df1, 1)
X_2, Y_2 = feature_result_split(df1, 2)
X_3, Y_3 = feature_result_split(df1, 3)


df: 1 1 vs. 0: 1648 3351
df: 2 1 vs. 0: 41 4958
df: 3 1 vs. 0: 3351 1648


In [7]:

'''
train_test_smote: 对输入的特征和标签做随机采样划分，feature_set为特征，result_set为标签，backup为之前备份的
此函数将输入的数据随机划分为三大类：用于训练的数据，用于调试模型的数据，用于做模型表现评估的数据
'''
def train_test_smote(feature_set, result_set, backup):

    smt=SMOTE(ratio='auto', random_state=10, k=None, k_neighbors=5, m=None, m_neighbors=10, out_step=0.5, kind='regular', svm_estimator=None, n_jobs=-1)

    #do the train test data split randomly

    '''
    对特征和标签做SMOTE采样（Synthetic Minority Over-sampling Technique），目的是让不平衡的数据（标签为0的数据大于标签为1的数据）平衡
    这里采用K nearest neighbors 的方法
    注：SMOTE只能针对训练数据做数据扩充，否则会造成Data Leakage
    这里，60%数据用来训练模型
    20%的数据用来调试模型
    20%的数据用来评估
    '''
    subset=np.floor(len(feature_set)*0.6).astype('int')

    '''
    training_f:训练用的特征
    training_r:训练用的标签
    validate_f:调模型用的特征
    validate_r:调模型用的标签
    test_f:评估用的特征
    test_r:评估用的标签
    '''
    np.random.seed(1)
    index_whole=np.random.choice(feature_set.index,subset,replace=False)

    np.random.seed(1)
    index_validate=np.random.choice(pd.Index(index_whole), np.floor(subset/2).astype('int'), replace=False)

    index_test=pd.Index(index_whole).difference(pd.Index(index_validate))

    index_train=feature_set.index.difference(pd.Index(index_whole))

    training_f=feature_set.loc[index_train,:]
    training_r=result_set.loc[index_train]

    validate_f=feature_set.loc[index_validate,:]
    validate_r=result_set.loc[index_validate]
    backup_validate=backup.loc[index_validate,:]

    test_f=feature_set.loc[index_test,:]
    test_r=result_set.loc[index_test]
    backup_test=backup.loc[index_test,:]


    #SMOTE
    '''
    只针对之前划分出的训练数据做SMOTE，training_f和training_r做完采样后生成training_f_af和training_r_af
    '''
    training_f_af, training_r_af=smt.fit_sample(training_f, training_r)

    '''
    打印做过SMOTE的训练数据和未做过SMOTE的剩余数据
    '''
    print('balanced data ratio in training set:')
    print(len(training_r_af[training_r_af==0]) / len(training_r_af[training_r_af==1]))
    print('balanced data ratio in test set:')
    print(len(validate_r[validate_r==0]) / len(validate_r[validate_r==1]))

    return training_f_af, training_r_af, validate_f, validate_r, backup_validate, test_f, test_r, backup_test



In [8]:

'''
针对三个标签分别三次调用train_test_smote函数进行采样
'''
X1_train, Y1_train, X1_validate, Y1_validate, backup1_validate, X1_test, Y1_test, backup1_test = train_test_smote(X_1, Y_1, backup)
X2_train, Y2_train, X2_validate, Y2_validate, backup2_validate, X2_test, Y2_test, backup2_test = train_test_smote(X_2, Y_2, backup)
X3_train, Y3_train, X3_validate, Y3_validate, backup3_validate, X3_test, Y3_test, backup3_test = train_test_smote(X_3, Y_3, backup)



balanced data ratio in training set:
1.0
balanced data ratio in test set:
2.090721649484536
balanced data ratio in training set:
1.0
balanced data ratio in test set:
123.91666666666667
balanced data ratio in training set:
1.0
balanced data ratio in test set:
0.47830374753451677


In [9]:

'''
train: 用来训练模型的函数 （这里用random forest)
X_train: 训练数据的特征
X_test: 调模型用的数据的特征
Y_train: 训练数据的标签
Y_test: 调模型用的数据的标签
backup_test: 调模型用的数据对应的备份特征
name: 标签的名称
'''

def train(X_train, X_test, Y_train, Y_test, backup_test,name):
    print('Training models...')
    best_model = None
    best_ratio, best_ne, best_md = 0, 0, 0
    #for ne in np.arange(30,500,50):
    #    for md in np.arange(10,200,50):
    for ne in [100]:
        for md in [10]:

            rf = RandomForestClassifier(max_depth=md, n_estimators=ne,n_jobs=-1, random_state = 10)
            rf_fit=rf.fit(X_train, Y_train)
            prediction=rf_fit.predict(X_test)
            prediction_pro=rf_fit.predict_proba(X_test)
            '''
            用ROC_AUC作为指标调参数
            '''
            roc = roc_auc_score(Y_test, prediction_pro[:,1])
            '''
            也可以0.7的概率以上的准确率去调参，这里没有用这个方法
            '''
            #acc = accuracy_score(Y_test,prediction)
            #temp=pd.DataFrame()

            #temp['Y_test']=Y_test
            #temp['Y_predict']=prediction
            #temp['prob']=prediction_pro[:,1]
            #temp=temp.sort_values(by='prob',ascending=False)
            #temp=temp[temp['prob']>=0.7]

            #count=len(temp[(temp['Y_test']==1) & (temp['Y_predict']==1)])
            #ratio=count / len(temp)
            ratio=roc
            print('-----------------------------------------------------------')
            if ratio > best_ratio:
                best_md = md
                best_ne = ne
                best_ratio = ratio
                best_model = rf_fit
            '''
            打印每个模型对应的参数和confusion matrix
            '''
            print('n_estimators:  max_depth:',(ne, md))
            print('confusion matrix\n', confusion_matrix(Y_test,prediction))
            print('ratio:{0}'.format(ratio))
            #print('roc_auc',roc_auc_score(Y_test, prediction_pro[:,1]))
            #print('accuracy',accuracy_score(Y_test,prediction))
            print()
            print()
    '''
    打印最佳模型对应的参数和confusion matrix
    '''
    print('best_n:{0}, best_depth:{1}'.format(best_ne, best_md))
    print('best_ratio using validation set:{0}'.format(best_ratio))

    '''
    输出最佳模型（共三个模型，名称被储存在name变量里）
    '''
    joblib.dump(best_model, 'newuser_best_model_{}_0715.pkl'.format(name))

    print('best_model written to file!')
    return best_model


In [14]:


'''
evaluate: 用来评估模型的函数 （这里用random forest)
X_test: 评估数据的特征
Y_test: 评估数据的标签
backup_test: 评估数据对应的备份特征
'''
def evaluate(model, X_test, Y_test, backup_test):
    print('Evalluating models...')
    prediction_pro=model.predict_proba(X_test)
    prediction=model.predict(X_test)
    '''
    得到ROC_AUC score
    '''
    roc = roc_auc_score(Y_test, prediction_pro[:,1])
    ratio=roc

    #temp=backup_test.copy()
    #temp['Y_test']=Y_test
    #temp['Y_predict']=prediction
    #temp['prob']=prediction_pro[:,1]
    #temp=temp.sort_values(by='prob',ascending=False)
    #temp=temp[temp['prob']>=0.7]
    #count=len(temp[(temp['Y_test']==1) & (temp['Y_predict']==1)])
    #ratio=count / len(temp)
    print('The ratio using evaluation set:{0}'.format(ratio))

    outfile=pd. DataFrame()
    outfile=backup_test.copy()
    outfile['label']=Y_test
    outfile['prob']=prediction_pro[:,1]

    '''
    打印用户在两个不同平台（android和ios)的准确率

    准确率计算方法：
    1. 生成临时dataframe存储当输入用于调试模型的数据后模型给出的结果 （真实标签，模型预测的标签，模型预测的概率）
    2. 把生成的dataframe按照模型预测的概率按从大到小排列
    3. 选取概率0.64作为门槛
    4. 统计概率大于等于0.64的这些数据中模型预测正确的数量 N
    5. M为概率大于等于0.64对应的所有用户，N/M为模型的准确率

    '''

    #print('andriod=> 1:{}'.format(len(outfile[(outfile['platform']=='android') & (outfile['label']==1) & (outfile['prob']>=0.64)])  /  len(outfile[(outfile['platform']=='android')  & (outfile['prob']>=0.64)])))

    #print('ios=> 1:{}'.format(len(outfile[(outfile['platform']=='ios') & (outfile['label']==1) & (outfile['prob']>=0.64)])  /  len(outfile[(outfile['platform']=='ios')  & (outfile['prob']>=0.64)])))


    print('Evaluate Done!')
    return outfile



In [11]:

'''
三次调用train函数训练模型
'''

model1 = train(X1_train, X1_validate, Y1_train, Y1_validate, backup1_validate,'active_in_8_14')
model2 = train(X2_train, X2_validate, Y2_train, Y2_validate, backup2_validate,'active_in_84_90')
model3 = train(X3_train, X3_validate, Y3_train, Y3_validate, backup3_validate,'activity_half_reduced_8_14')


Training models...
-----------------------------------------------------------
n_estimators:  max_depth: (100, 10)
confusion matrix
 [[868 146]
 [196 289]]
ratio:0.7979096768946095


best_n:100, best_depth:10
best_ratio using validation set:0.7979096768946095
best_model written to file!
Training models...
-----------------------------------------------------------
n_estimators:  max_depth: (100, 10)
confusion matrix
 [[1474   13]
 [  12    0]]
ratio:0.6978536202645146


best_n:100, best_depth:10
best_ratio using validation set:0.6978536202645146
best_model written to file!
Training models...
-----------------------------------------------------------
n_estimators:  max_depth: (100, 10)
confusion matrix
 [[289 196]
 [146 868]]
ratio:0.7979096768946095


best_n:100, best_depth:10
best_ratio using validation set:0.7979096768946095
best_model written to file!


In [15]:

'''
三次调用evaluate函数进行模型评估，得到三个dataframe
'''


out_file1 = evaluate(model1, X1_test, Y1_test, backup1_test)
out_file2 = evaluate(model2, X2_test, Y2_test, backup2_test)
out_file3 = evaluate(model3, X3_test, Y3_test, backup3_test)


Evalluating models...
The ratio using evaluation set:0.8255334877531911
Evaluate Done!
Evalluating models...
The ratio using evaluation set:0.6865771812080538
Evaluate Done!
Evalluating models...
The ratio using evaluation set:0.825533487753191
Evaluate Done!


In [16]:

'''
将三个评估得出的dataframe合并成一个dataframe,加上对应的标签名
'''
out_file1=out_file1.rename(columns={'label':'active_in_8_14_label','prob':'active_in_8_14_probability'})
out_file2=out_file2.rename(columns={'label':'active_in_84_90_label','prob':'active_in_84_90_probability'})
out_file3=out_file3.rename(columns={'label':'activity_half_reduced_8_14_label','prob':'activity_half_reduced_8_14_probability'})
out_file=pd.DataFrame()
out_file=pd.merge(out_file1, out_file2[['active_in_84_90_label','active_in_84_90_probability']],left_index=True, right_index=True)
out_file=pd.merge(out_file, out_file3[['activity_half_reduced_8_14_label','activity_half_reduced_8_14_probability']],left_index=True, right_index=True)

'''
输出最终的dataframe
'''
path = 'C:/intern/new_user_prediction/newuser_model_sampled'
out_file.to_csv(os.path.join(path,'0713_data_evaluation.csv'),index=False)
print('Evaluation written to file!')



Evaluation written to file!


In [17]:

'''
predict: 训练好的模型用于预测测试数据
path: 输入原始数据的路径
pklpath: 三个最佳模型PKL文件的路径
model_name: 三个最佳模型PKL的名称
'''
def predict(path, pklpath, model_name):
    print('Starting prediction...')
    from sklearn.externals import joblib
    df = pd.read_csv(path,header=0,low_memory=False,delimiter='\t')


    '''
    重复最早处理原始数据的步骤，得到特征
    '''
    df_platform=df.copy()
    df_platform.loc[df_platform['platform']=='android','platform']=0
    df_platform.loc[df_platform['platform']=='ios','platform']=1


    df1=df.copy()
    df1=df1.drop(df.columns[0:17],axis=1)

    df1.insert(loc=0, column='platform', value=df_platform['platform'])
    df2=df.copy()

    '''
    备份原始数据中的指定列，需要保留这些列作为输出
    '''
    df_new=pd.DataFrame()
    df_new['site_id']=df2['site_id']
    df_new['device_id']=df2['device_id']
    df_new['cookie_id']=df2['cookie_id']
    df_new['data_installed']=df2['data_installed']
    df_new['platform']=df2['platform']
    df_new['country']=df2['country']

    df_new['day1_prod_num']=df2['day1_prod_num']
    df_new['day2_prod_num']=df2['day2_prod_num']
    df_new['day3_prod_num']=df2['day3_prod_num']
    df_new['day4_prod_num']=df2['day4_prod_num']
    df_new['day5_prod_num']=df2['day5_prod_num']
    df_new['day6_prod_num']=df2['day6_prod_num']
    df_new['day7_prod_num']=df2['day7_prod_num']


    feature=df1.iloc[:,:]

    '''
    根据指定PKL路径和名称读取最佳模型（共三个）
    '''
    pkl = open(os.path.join(pklpath,model_name),'rb')
    model = joblib.load(pkl)

    '''
    做模型预测
    '''
    prediction = model.predict(feature)
    prediction_pro = model.predict_proba(feature)

    #feature importance

    '''
    计算测试数据的特征重要性
    '''
    featurename=feature.columns
    importances=model.feature_importances_
    indices = np.argsort(importances)[::-1]
    fi = pd.DataFrame()
    fi['feature_name'] = featurename[indices]
    fi['importance'] = importances[indices]


    print('Prediction complete!')

    '''
    返回模型预测的标签，模型预测的概率，备份的数据，和特征重要性
    '''
    return prediction, prediction_pro[:,1], df_new, fi
   


In [19]:

'''
指定预测文件的路径和名称，调用predict函数做模型预测
注：这里只需要预测active_in_8_14，其余两个标签不需要做预测，因此只调用predict函数一次
'''

path='C:/intern/new_user_prediction/newuser_model_sampled/0715_test_sampled.csv'
prediction1, prediction1_proba, backup, fi1= predict(path,'C:/intern/new_user_prediction/newuser_model_sampled','newuser_best_model_active_in_8_14_0715.pkl')

outfile=pd.DataFrame()
outfile=backup
outfile['active_in_8_14_probability'] = prediction1_proba


Starting prediction...
Prediction complete!


In [20]:

'''
在模型的输出中做A/B TEST，步骤为：
新建两个dataframe, outfile_A和outfile_B
如果cookie_id第八位数字在0到7，则把这部分用户数据放入A组
如果cookie_id的八位不在0到7，则把这部分用户数据放入B组
业务部门将会对A组做例如短信推送，广告，之类的
B组将不做任何处理
对比A B组未来一个星期的活动
'''
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#for A B test
index_A=[]
index_B=[]
for i in range(0,len(outfile)):
    if outfile.iloc[i]['cookie_id'][7] not in ['0','1','2','3','4','5','6','7']:
        index_B.append(i)
    else:
        index_A.append(i)

outfile_A=outfile.loc[index_A,:]
outfile_B=outfile.loc[index_B,:]


outfile_A=outfile_A.sort_values(by='active_in_8_14_probability', ascending=False)
outfile_B=outfile_B.sort_values(by='active_in_8_14_probability', ascending=False)
outfile=outfile.sort_values(by='active_in_8_14_probability', ascending=False)

In [21]:
'''
输出A组预测数据和B组预测数据
'''

print('Writing to file...')
path = 'C:/intern/new_user_prediction/newuser_model_sampled'
outfile_A.to_csv(os.path.join(path, '0715_test_A_prediction.csv'),index=False)
outfile_B.to_csv(os.path.join(path, '0715_test_B_prediction.csv'),index=False)
outfile.to_csv(os.path.join(path, '0715_test_prediction.csv'),index=False)
print('Finish writing to file.')


'''
输出特征重要性
'''
fi = pd.DataFrame()
fi['feature_active_in_8_14'] = fi1['feature_name']
fi['importance_active_in_8_14'] = fi1['importance']


fi.to_csv(os.path.join(path, '0715_test_prediction_feature_importances.csv'),index=False)
print('Finish writing feature importances to file.')
print('All complete!')

'''
注释结束
'''

Writing to file...
Finish writing to file.
Finish writing feature importances to file.
All complete!


'\n注释结束\n'