In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import datetime
from pyspark.sql import Window
from pyspark.sql.types import *
import sys
import os
import pandas as pd
import numpy as np
import warnings
import time 
from pyspark.sql.types import StructType 
from pyspark.sql.types import StructField
import pyspark.sql.types as sql_type
name = locals()
from statsmodels.tsa.seasonal import seasonal_decompose
os.environ['PYSPARK_PYTHON'] = "/usr/bin/python3"
os.environ['PYSPARK_DRIVER_PYTHON'] = "/usr/bin/python3"

spark = (SparkSession\
         .builder\
         .appName("test")\
         .enableHiveSupport()\
         .config("spark.executor.instances", "200")\
         .config("spark.executor.memory","16g")\
         .config("spark.executor.cores","4")\
         .config("spark.driver.memory","40g")\
         .config("spark.sql.shuffle.partitions","800")\
         .config("spark.default.parallelism","800")\
         .config("spark.driver.maxResultSize", "8g")\
         .config("spark.pyspark.python", "/usr/bin/python3")\
         .config("spark.yarn.appMasterEnv.yarn.nodemanager.container-executor.class","DockerLinuxContainer")\
         .config("spark.executorEnv.yarn.nodemanager.container-executor.class","DockerLinuxContainer")\
         .config("spark.yarn.appMasterEnv.yarn.nodemanager.docker-container-executor.image-name","bdp-docker.jd.com:5000/wise_mart_rmb_py36:latest")\
         .config("spark.executorEnv.yarn.nodemanager.docker-container-executor.image-name","bdp-docker.jd.com:5000/wise_mart_rmb_py36:latest")\
         .getOrCreate())


import spa_utils

spark.sql("set hive.exec.dynamic.partition=true")
spark.sql("set hive.exec.dynamic.partition.mode=nonstrict")
spark.sql('''set hive.exec.max.dynamic.partitions=200551''')
spark.sql('''set hive.exec.max.dynamic.partitions.pernode=200551''')
params = {'author':'xiaoxiao10'}

In [2]:
latest_dt = '2019-09-13'
sku_list = spark.sql('''select 
sku_id as item_sku_id,
cid3
from dev.self_sku_det_da
where dt='%s'
and sku_type = 1
and cid3 in('740','2676') '''%latest_dt)

# 非日期特征

In [4]:
# rolling 销量特征
features_from_luyuan =  spark.sql('''select * from app.app_pa_simulation_feature_evaluation_2019_06_30_self''')
features_qtty_rolling = features_from_luyuan.select(['item_sku_id','dt','netprice','sale_qtty','rolling360mean', 'rolling180mean', 'rolling90mean','rolling28mean',\
                                                     'rolling14mean','rolling7mean','rolling5mean','rolling3mean',\
                                                    'rolling2mean','rolling1mean','rolling14median','rolling7median',\
                                                    'rolling360decaymean','rolling180decaymean','rolling90decaymean','rolling28decaymean',\
                                                    'rolling14decaymean','rolling7decaymean','rolling3decaymean','decomposedtrend'])
features_qtty_rolling = features_qtty_rolling.join(sku_list,'item_sku_id','inner')

In [5]:
features_qtty_rolling_2 = features_qtty_rolling.withColumn('gmv', F.col('netprice')*F.col('sale_qtty')).groupBy('cid3','dt').agg(F.sum('gmv').alias('cid3_gmv'))

features_qtty_rolling_2_pd = features_qtty_rolling_2.toPandas()

features_qtty_rolling_3 = features_qtty_rolling_2_pd.sort_values(['cid3','dt']).set_index('dt')

features_qtty_rolling_4 = features_qtty_rolling_3.groupby('cid3').apply(lambda x: pd.concat([seasonal_decompose(x[['cid3_gmv']], freq=365, model='additive', two_sided=False).trend.rename(columns={'cid3_gmv':'cid3trend'})\
                                                         ,seasonal_decompose(x[['cid3_gmv']], freq=365, model='additive', two_sided=False).seasonal.rename(columns={'cid3_gmv':'cid3seasonal'})],axis=1))

features_qtty_rolling_5 = spark.createDataFrame(features_qtty_rolling_4.reset_index().fillna(0))
features_qtty_rolling_6 = features_qtty_rolling.join(features_qtty_rolling_5,['cid3','dt'],'left')

In [6]:
# rolling uv特征
df_uv = spark.sql('''select sku_id as item_sku_id, uv, dt from dev.all_sku_traffic''')
df_sku_list_uv = sku_list.join(df_uv, ['item_sku_id'], 'inner').filter(F.col('uv').isNotNull())
# rolling 1
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-1,-1)
df_sku_list_uv = df_sku_list_uv.withColumn('uv_1',F.mean('uv').over(window))
# rolling 3
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-3,-1)
df_sku_list_uv = df_sku_list_uv.withColumn('uv_3',F.mean('uv').over(window))
# rolling 7
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-7,-1)
df_sku_list_uv = df_sku_list_uv.withColumn('uv_7',F.mean('uv').over(window))
# rolling 15 
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-15,-1)
df_sku_list_uv = df_sku_list_uv.withColumn('uv_15',F.mean('uv').over(window))
# rolling 30
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-30,-1)
df_sku_list_uv = df_sku_list_uv.withColumn('uv_30',F.mean('uv').over(window))

In [7]:
# rolling stock_qtty & stock_status 特征
df_stock = spark.sql('''select sku_id as item_sku_id,dt,stock_status,stock_qtty from dev.dp_pl_es_ext_v2''')
df_sku_list_stock = sku_list.join(df_stock, ['item_sku_id'], 'inner').filter(F.col('stock_qtty').isNotNull())
# rolling 1
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-1,-1)
df_sku_list_stock = df_sku_list_stock.withColumn('stock_qtty_1',F.mean('stock_qtty').over(window)).withColumn('stock_status_1',F.mean('stock_status').over(window))
# rolling 3
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-3,-1)
df_sku_list_stock = df_sku_list_stock.withColumn('stock_qtty_3',F.mean('stock_qtty').over(window)).withColumn('stock_status_3',F.mean('stock_status').over(window))
# rolling 7
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-7,-1)
df_sku_list_stock = df_sku_list_stock.withColumn('stock_qtty_7',F.mean('stock_qtty').over(window)).withColumn('stock_status_7',F.mean('stock_status').over(window))
# rolling 15 
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-15,-1)
df_sku_list_stock = df_sku_list_stock.withColumn('stock_qtty_15',F.mean('stock_qtty').over(window)).withColumn('stock_status_15',F.mean('stock_status').over(window))
# rolling 30
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-30,-1)
df_sku_list_stock = df_sku_list_stock.withColumn('stock_qtty_30',F.mean('stock_qtty').over(window)).withColumn('stock_status_30',F.mean('stock_status').over(window))

In [8]:
# rolling red_price
df_redprice = spark.sql('''select sku_id as item_sku_id, max_price as redprice, dt from %s ''' % ('dev.self_sku_redprice_group'))
df_sku_list_redprice = sku_list.join(df_redprice, ['item_sku_id'], 'inner').filter(F.col('redprice').isNotNull())
# rolling 1
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-1,-1)
df_sku_list_redprice = df_sku_list_redprice.withColumn('redprice_1',F.mean('redprice').over(window))
# rolling 3
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-3,-1)
df_sku_list_redprice = df_sku_list_redprice.withColumn('redprice_3',F.mean('redprice').over(window))
# rolling 7
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-7,-1)
df_sku_list_redprice = df_sku_list_redprice.withColumn('redprice_7',F.mean('redprice').over(window))
# rolling 15 
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-15,-1)
df_sku_list_redprice = df_sku_list_redprice.withColumn('redprice_15',F.mean('redprice').over(window))
# rolling 30
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-30,-1)
df_sku_list_redprice = df_sku_list_redprice.withColumn('redprice_30',F.mean('redprice').over(window))

In [9]:
# rolling base_price & deal_price
df_price = spark.sql('''select item_sku_id, price as dealprice, baseprice, dt, base_qtty from app.app_pa_baseline_baseprice_60_xiaoxiao_21''')
df_sku_list_price = sku_list.join(df_price, ['item_sku_id'], 'inner').filter(F.col('baseprice').isNotNull()).filter(F.col('dealprice').isNotNull())
# rolling 1 
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-1,-1)
df_sku_list_price = df_sku_list_price.withColumn('baseprice_deal_1',F.mean('baseprice').over(window)).withColumn('dealprice_1',F.mean('dealprice').over(window))
# rolling 3
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-3,-1)
df_sku_list_price = df_sku_list_price.withColumn('baseprice_deal_3',F.mean('baseprice').over(window)).withColumn('dealprice_3',F.mean('dealprice').over(window))
# rolling 7
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-7,-1)
df_sku_list_price = df_sku_list_price.withColumn('baseprice_deal_7',F.mean('baseprice').over(window)).withColumn('dealprice_7',F.mean('dealprice').over(window))
# rolling 15 
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-15,-1)
df_sku_list_price = df_sku_list_price.withColumn('baseprice_deal_15',F.mean('baseprice').over(window)).withColumn('dealprice_15',F.mean('dealprice').over(window))
# rolling 30
window = Window.partitionBy('item_sku_id','cid3').orderBy('dt').rowsBetween(-30,-1)
df_sku_list_price = df_sku_list_price.withColumn('baseprice_deal_30',F.mean('baseprice').over(window)).withColumn('dealprice_30',F.mean('dealprice').over(window))

In [10]:
# next day red_price = 路远表中的红价（most popular red price in last 30 days）
df_baseprice_red = spark.sql('''select item_sku_id, dt, baseprice as baseprice_red from app.app_pa_price_baseprice_self''')
df_baseprice_red = sku_list.join(df_baseprice_red, ['item_sku_id'], 'inner').filter(F.col('baseprice_red').isNotNull())

In [11]:
df_raw = features_qtty_rolling_6.join(df_sku_list_uv,['item_sku_id','cid3','dt'],'inner').join(df_sku_list_stock,['item_sku_id','cid3','dt'],'inner').join(df_sku_list_redprice,['item_sku_id','cid3','dt'],'inner')\
.join(df_sku_list_price,['item_sku_id','cid3','dt'],'inner').join(df_baseprice_red,['item_sku_id','cid3','dt'],'inner')
df_raw.cache()
df_raw.count()

1245968

# 日期特征

In [12]:
# time = spark.sql('''select * from app.app_pa_time''') 这个不能用，因为只有一天有标识
# 端午节取消，因为会属于618
# 清明节给0，因为从历史看清明节 gmv没什么特别的
# 中秋节也给0，因为从历史看中秋节 gmv也没什么特别的

holidays = pd.DataFrame({'2017-01-01':[0.2],'2017-01-27':[-0.2],'2017-01-28':[-0.2],'2017-04-04':[0],'2017-05-01':[0.2],
                         '2017-10-04':[0],'2017-06-01':[0.4],'2017-06-18':[0.8],'2017-11-01':[0.5],'2017-11-11':[1],
                         '2018-01-01':[0.2],'2019-01-01':[0.2],'2018-02-15':[-0.2],'2018-02-16':[-0.2],'2019-02-04':[-0.2],
                         '2019-02-05':[-0.2],'2018-04-05':[0],'2019-04-05':[0],'2018-05-01':[0.2],'2019-05-01':[0.2],'2018-09-22':[0],
                         '2019-09-13':[0],'2018-10-01':[0.2],'2019-10-01':[0.2],'2018-06-01':[0.4],'2018-06-18':[0.8],'2019-06-01':[0.4],
                         '2019-06-18':[0.8],'2018-11-01':[0.5],'2018-11-11':[1],'2019-11-01':[0.5],'2019-11-11':[1]})
holidays = pd.DataFrame(holidays.unstack()).reset_index()[['level_0',0]].rename(columns={'level_0':'dt',0:'days_flag'})

In [13]:
import datetime

def dateRange(start, end, step=1, format="%Y-%m-%d"):
    strptime, strftime = datetime.datetime.strptime, datetime.datetime.strftime
    days = (strptime(end, format) - strptime(start, format)).days
    return [strftime(strptime(start, format) + datetime.timedelta(i), format) for i in range(0, days+1, step)]

for day in dateRange('2017-06-02','2017-06-17'):
    holidays.loc[len(holidays)] = [day,0.2]
for day in dateRange('2017-11-02','2017-11-10'):
    holidays.loc[len(holidays)] = [day,0.2] 
    
for day in dateRange('2018-06-02','2018-06-17'):
    holidays.loc[len(holidays)] = [day,0.2]
for day in dateRange('2018-11-02','2018-11-10'):
    holidays.loc[len(holidays)] = [day,0.2]
    
for day in dateRange('2019-06-02','2019-06-17'):
    holidays.loc[len(holidays)] = [day,0.2]
for day in dateRange('2019-11-02','2019-11-10'):
    holidays.loc[len(holidays)] = [day,0.2]

In [14]:
all_day = pd.DataFrame({'dt':dateRange('2017-01-01','2019-12-31')})

time_feature = spark.createDataFrame(pd.merge(all_day,holidays,on = 'dt',how='left').fillna(0))
time_feature.cache()
time_feature.count()

1095

# 特征合并

In [15]:
feature = df_raw.join(time_feature,'dt','inner')
feature.cache()
feature.count()

1245968

# 取未来的label与未来的某些特征

In [16]:
df_data = feature.select('item_sku_id', 'cid3', 'netprice', 'sale_qtty','days_flag','decomposedtrend','cid3trend' ,'dt').fillna(0)

In [17]:
# 预测未来30天的
future_days =45
# 未来30天中，该价格在1%的误差范围内一共出现了5次
price_least_times = 5
window = Window.partitionBy('item_sku_id', 'cid3').orderBy('dt').rowsBetween(15,future_days)
df_final = df_data.withColumn('dt_price_qtty', F.collect_list(F.array(F.col('netprice'),F.col('sale_qtty'),F.col('decomposedtrend'),F.col('cid3trend'),F.col('days_flag'),F.col('dt'))).over(window))

In [18]:
def get_median(data):
    data.sort()
    half = len(data) // 2
    return (data[half] + data[~half]) / 2

# 必须是连续的未来30天，不然就截断至30天前的最后一天
def truncates(dt, price_list):
    if price_list == []:
        return([])
    else:
        for i in range(len(price_list)):
            if price_list[i][5] > ((datetime.datetime.strptime(dt, '%Y-%m-%d')) + datetime.timedelta(30)).strftime('%Y-%m-%d'):
                return(price_list[:i])
        return(price_list)

    
def price_qtty(dt,price_list):

    price_list = truncates(dt,price_list)
    if price_list ==[]:
        return([])
    else:
        price_list_sorted = sorted(price_list,reverse = True)
        batch_mean_price = []
        batch_num_count = []
        batch_mean_qtty = []
        batch_num_trend = []
        batch_num_cid3trend = []
        batch_num_days = []        

        while price_list_sorted:
            batch = []
            qtty = []
            trend = []
            cid3trend = []
            days = []
            for i in range(len(price_list_sorted)):
                if round(float(price_list_sorted[i][0])-0.99*float(price_list_sorted[0][0]),4)>=0:
                    batch.append(float(price_list_sorted[i][0]))
                    qtty.append(float(price_list_sorted[i][1]))
                    trend.append(float(price_list_sorted[i][2]))
                    cid3trend.append(float(price_list_sorted[i][3]))
                    days.append(float(price_list_sorted[i][4]))
                    max_i = i
            batch_mean_price.append(sum(batch) / len(batch))
            batch_num_count.append(max_i+1)
            batch_mean_qtty.append(sum(qtty)/len(qtty))
            batch_num_trend.append(sum(trend)/len(trend))
            batch_num_cid3trend.append(sum(cid3trend)/len(cid3trend))
            batch_num_days.append(sum(days)/len(days))       
            price_list_sorted = price_list_sorted[max_i+1:]

        batch_mean_copy = batch_mean_price.copy()
        medium = get_median(batch_mean_copy)
        ks = [[x,y,z,a,b,c] for x,y,z,a,b,c in zip(batch_mean_price,batch_num_count,batch_mean_qtty,batch_num_trend,batch_num_cid3trend,batch_num_days) if x>=medium*0.1]
        price_qtty = [x for x in ks if x[1]>=5]
        return(price_qtty)


price_qtty_udf = F.udf(price_qtty,ArrayType(ArrayType(FloatType())))

In [19]:
df_final_1 = df_final.withColumn('price_qtty', price_qtty_udf(F.col('dt'),F.col('dt_price_qtty')))
df_final_1.cache()
df_final_1.count()

1245968

In [20]:
df_final_2 = df_final_1.withColumn('explode', F.explode('price_qtty'))\
.withColumn('target_price', F.col('explode')[0])\
.withColumn('target_qtty', F.col('explode')[2])\
.withColumn('target_trend', F.col('explode')[3])\
.withColumn('target_cid3trend', F.col('explode')[4])\
.withColumn('target_days_flag', F.col('explode')[5])

df_final_2.cache()
df_final_2.count()

1043972

# 特征+label合并存表

In [21]:
model_data = feature.join(df_final_2.select('item_sku_id','cid3','dt','target_trend','target_cid3trend','target_days_flag','target_price','target_qtty'),['item_sku_id','cid3','dt'],'inner')
model_data.cache()
model_data.count()

1043972

上一版的xiaoxiao_3是一直使用的训练数据，时间是到8月份的  
现在最新使用的xiaoxiao_6不会用来训练模型，只是单纯用作最后的测试集，但只多刷到9月份，刨去最后45天不能用，其实也没有多少测试数据  
未来可以再刷xiaoxiao_6以得到更多的测试数据

In [22]:

model_data2 = model_data.withColumn('dt1',F.col('dt')).drop('dt').withColumnRenamed('dt1','dt')

spa_utils.save_hive_result(model_data2,'dev.dev_xianzhi_model_data_xiaoxiao_6',partitioning_columns=['dt'],write_mode='save',spark=spark,params=params)