# 项目概述

**背景**

智能营销工具可以帮助商家预测用户购买的行为，这里我们提供了品牌商家的历史订单数据，请构建一个 预测模型，预估用户人群在规定时间内产生购买行为的概率。该模型可应用于各种电商数据分析，以及 百度电商开放平台 ，不仅可以帮助商家基于平台流量，进行商品售卖、支付，还可以通过MarTech技术更精准地锁定核心用户，对用户的购买行为进行预测。

**任务**

预测下下个月用户是否购买
1. 训练集train.csv
2. 测试集test2.csv

**标准**

提交的结果文件可以是概率，也就是0-1之间，而不是0或者1。

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score

# 数据探索

In [2]:
data_raw = pd.read_csv('./train.csv')
data_raw.head(5)

Unnamed: 0,order_detail_id,order_id,order_total_num,order_amount,order_total_payment,order_total_discount,order_pay_time,order_status,order_count,is_customer_rate,...,customer_gender,member_status,is_member_actived,goods_id,goods_class_id,goods_price,goods_status,goods_has_discount,goods_list_time,goods_delist_time
0,1000000,1000000,1.0,239.9,96.9,0.0,2012-11-01 00:10:56,6,1.0,0.0,...,,,,998,998,54.909289,1.0,0.0,2014-10-25 11:08:07,2014-11-01 11:08:07
1,1001530,1001327,2.0,288.0,96.9,0.0,2013-08-31 23:14:42,6,2.0,0.0,...,,,,1953,1953,45.961352,0.0,1.0,2013-08-28 17:27:50,2013-09-01 00:38:17
2,1001531,1001327,2.0,288.0,96.9,0.0,2013-08-31 23:14:42,6,2.0,0.0,...,,,,1083,1083,53.035439,1.0,0.0,2014-10-29 18:21:05,2014-11-05 18:21:05
3,1001532,1001328,3.0,180.0,89.7,0.0,2013-08-31 22:06:35,6,1.0,0.0,...,,,,1013,1013,46.046917,1.0,1.0,2014-10-25 11:00:00,2014-11-01 11:00:00
4,1001533,1001329,1.0,159.9,65.9,0.0,2013-08-31 21:33:36,6,1.0,0.0,...,,,,1628,1628,50.722161,1.0,0.0,2014-10-23 15:35:33,2014-10-30 15:35:33


In [3]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2306871 entries, 0 to 2306870
Data columns (total 29 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_detail_id         int64  
 1   order_id                int64  
 2   order_total_num         float64
 3   order_amount            float64
 4   order_total_payment     float64
 5   order_total_discount    float64
 6   order_pay_time          object 
 7   order_status            int64  
 8   order_count             float64
 9   is_customer_rate        float64
 10  order_detail_status     float64
 11  order_detail_goods_num  float64
 12  order_detail_amount     float64
 13  order_detail_payment    float64
 14  order_detail_discount   float64
 15  customer_province       object 
 16  customer_city           object 
 17  member_id               float64
 18  customer_id             int64  
 19  customer_gender         float64
 20  member_status           float64
 21  is_member_actived       float64

In [4]:
# 缺失值比例查看
data_raw.isnull().sum()[data_raw.isnull().sum() !=0] / len(data_raw)

customer_province    0.000494
customer_city        0.000499
customer_gender      0.724393
member_status        0.724393
is_member_actived    0.724393
goods_price          0.000189
dtype: float64

In [7]:
data_sub = pd.read_csv('./submittion.csv')
data_sub.head(10)

Unnamed: 0,customer_id,result
0,1000000,0.0
1,1000014,0.0
2,1000034,0.0
3,1000046,0.0
4,1000048,0.0
5,1000069,0.0
6,1000084,0.0
7,1000099,0.0
8,1000105,0.0
9,1000109,0.0


In [8]:
# 非法值查看
data_raw.select_dtypes(include='object').columns.tolist()

['order_pay_time',
 'customer_province',
 'customer_city',
 'goods_list_time',
 'goods_delist_time']

# 数据清洗

In [9]:
# 复制一份
data_preprocessing = data_raw.copy()
data_preprocessing = data_preprocessing.sort_values('customer_id')

In [10]:
# 重复值
data_preprocessing.drop_duplicates(inplace=True)

In [11]:
# 空值处理
data_preprocessing.fillna({
    'customer_gender': 0, 
    'member_status': 4, 
    'is_member_actived': 2,
    'customer_province': data_preprocessing['customer_province'].mode(),
    'customer_city': data_preprocessing['customer_city'].mode(),
    'goods_price': data_preprocessing['goods_price'].mean()
}, inplace=True)

In [12]:
# 非法值处理
data_preprocessing['customer_province'] = pd.factorize(data_preprocessing['customer_province'])[0]
data_preprocessing['customer_city'] = pd.factorize(data_preprocessing['customer_city'])[0]

In [13]:
# 时间格式转换
data_preprocessing['order_pay_time'] = pd.to_datetime(data_preprocessing['order_pay_time'], format='%Y-%m-%d')
data_preprocessing['goods_list_time'] = pd.to_datetime(data_preprocessing['goods_list_time'], format='%Y-%m-%d')
data_preprocessing['goods_delist_time'] = pd.to_datetime(data_preprocessing['goods_delist_time'], format='%Y-%m-%d')

In [15]:
data_preprocessing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2306871 entries, 0 to 2306870
Data columns (total 29 columns):
 #   Column                  Dtype         
---  ------                  -----         
 0   order_detail_id         int64         
 1   order_id                int64         
 2   order_total_num         float64       
 3   order_amount            float64       
 4   order_total_payment     float64       
 5   order_total_discount    float64       
 6   order_pay_time          datetime64[ns]
 7   order_status            int64         
 8   order_count             float64       
 9   is_customer_rate        float64       
 10  order_detail_status     float64       
 11  order_detail_goods_num  float64       
 12  order_detail_amount     float64       
 13  order_detail_payment    float64       
 14  order_detail_discount   float64       
 15  customer_province       int64         
 16  customer_city           int64         
 17  member_id               float64       
 18  cu

# 特征构造

In [55]:
def creat_set(df, history_time, target_time=None):    

    # ---customer_id作为索引构建features--------------------------

    history_start, history_end = history_time
    df_features = df[(df['order_pay_time'] >= history_start) & (df['order_pay_time'] <= history_end)]
    features = pd.DataFrame(index = df_features['customer_id'].unique())

    
    
    # ---用户身份属性特征---------------------------------------------
    
    ## 最新的用户身份属性
    col_customer = ['customer_gender', 'customer_province', 'customer_city', 'member_status', 'is_member_actived']
    features[col_customer] = df.groupby('customer_id')[col_customer].last()
        
    
    
    # ---订单属性特征----------------------------------------------
    
    ## 关于订单的列
    col_order_numeric = [ 'order_total_num', 'order_amount','order_total_payment', 'order_total_discount',
        'order_count', 'order_detail_goods_num', 'order_detail_amount', 'order_detail_payment', 'order_detail_discount']
    col_order_categorical = ['order_status', 'is_customer_rate', 'order_detail_status']
    
    ## 用户最后一次订单特征
    col_order = col_order_numeric + col_order_categorical
    col_order_last = ['{}_last'.format(x) for x in col_order]
    features[col_order_last] = df_features.groupby('customer_id')[col_order].last()

    ## 用户历史订单连续变量特征
    agg_method = ('max', 'min', 'mean', 'count')
    for col in col_order_numeric:
        col_temp  = ['{}_history_{}'.format(col, x) for x in agg_method]
        features[col_temp] = df_features.groupby('customer_id')[col].agg(agg_method)
        
    ## 用户历史订单离散变量特征
    ## 。。。。。。。
    
    
    
    # ---商品属性特征---------------------------------------------

    ## 关于商品的列
    col_good_numeric = ['goods_price', ]
    col_good_categorical = ['goods_class_id' ,'goods_status', 'goods_has_discount']
    col_good_date = ['goods_list_time', 'goods_delist_time']
    
    ## 用户最后一个商品特征
    col_good = col_good_numeric + col_good_categorical +  col_good_date
    col_good_last = ['{}_last'.format(x) for x in col_good]
    features[col_good_last] = df_features.groupby('customer_id')[col_good].last()
    
    ## 用户最后一个商品持续时间
    features['good_dur_day_last'] = features[col_good_last[-1]] - features[col_good_last[-2]]
    features['good_dur_day_last'] = features['good_dur_day_last'].apply(lambda x: x.days)
    features.drop(col_good_last[-2:], axis=1, inplace=True)
    
    ## 用户历史商品价格特征
    agg_method = ('max', 'min', 'mean', 'count')
    col_temp = ['{}_history_{}'.format('goods_price', x) for x in agg_method]
    features[col_temp] = df_features.groupby('customer_id')['goods_price'].agg(agg_method)
    
    ## 用户历史商品离散变量特征
    ## 。。。。。。。
    
    
    
    # ---构建label----------------------------------------------
    if target_time:
        target_start, target_end = target_time
        labels = pd.DataFrame(0, index=features.index, columns=['target'])
        df_target = df[(df['order_pay_time'] >= target_start) & (df['order_pay_time'] <= target_end)]['customer_id']
        target_index = [x for x in labels.index.tolist() if x in df_target]
        labels.loc[target_index, 'target'] = 1
    else:
        labels = None
    
    return features, labels

待处理，
2. 历史订单离散特征
4. 历史商品离散特征

# 数据生成

In [56]:
# 预测9月（下个月）训练集和标签
x_train_9, y_train_9 = creat_set(
    data_preprocessing, ('2000-01-01 00:00:00', '2013-07-31 23:59:59'), ('2013-08-01 00:00:00','2013-08-31 23:59:59'))

# 预测10月（下下个月）训练集和标签
x_train_10, y_train_10 = creat_set(
    data_preprocessing, ('2000-01-01 00:00:00', '2013-06-30 23:59:59'), ('2013-08-01 00:00:00','2013-08-31 23:59:59'))

# 预测数据集
predict_set, _ = creat_set(data_preprocessing, ('2000-01-01 00:00:00', '2013-08-31 23:59:59'))

# 数据规范化

In [36]:
# 9月数据
ss_9 = StandardScaler()
x_9 = ss_9.fit_transform(x_train_9)
predict_set_9 = ss_9.transform(predict_set)

# 10月数据
ss_10 = StandardScaler()
x_10 = ss_10.fit_transform(x_train_10)
predict_set_10 = ss_10.transform(predict_set)

# 模型训练

In [38]:
# 9月训练与预测
lr_9 = LogisticRegression()
lr_9.fit(x_9, y_train_9)
predict_pro_9 = lr_9.predict_proba(predict_set_9)

# 10月训练与预测
lr_10 = LogisticRegression()
lr_10.fit(x_10, y_train_10)
predict_pro_10 = lr_10.predict_proba(predict_set_10)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# 结果输出

In [40]:
# 9月
data_sub['result'] = predict_pro_9[:,1]
data_sub.to_csv('./result_9.csv', index=False)

# 10月
data_sub['result'] = predict_pro_10[:,1]
data_sub.to_csv('./result_10.csv', index=False)