In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import datetime
from pyspark.sql import Window
from pyspark.sql.types import *
import sys
import os
import pandas as pd
import numpy as np
import warnings
import time 
from pyspark.sql.types import StructType 
from pyspark.sql.types import StructField
import pyspark.sql.types as sql_type
# Keep a handle on the notebook's top-level namespace dict (at module level locals() is globals()).
# NOTE(review): no use of `name` is visible in this section — presumably meant for creating
# variables dynamically; confirm it is still needed.
name = locals()
from statsmodels.tsa.seasonal import seasonal_decompose
# Set the Python interpreter environment variables before the SparkSession is built below.
os.environ['PYSPARK_PYTHON'] = "/usr/bin/python3"
os.environ['PYSPARK_DRIVER_PYTHON'] = "/usr/bin/python3"

# Build (or reuse) the Hive-enabled Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("test")
    .enableHiveSupport()
    # Executor / driver resources.
    .config("spark.executor.instances", "200")
    .config("spark.executor.memory", "16g")
    .config("spark.executor.cores", "4")
    .config("spark.driver.memory", "40g")
    # Shuffle and default parallelism.
    .config("spark.sql.shuffle.partitions", "800")
    .config("spark.default.parallelism", "800")
    .config("spark.driver.maxResultSize", "8g")
    # Python interpreter setting.
    .config("spark.pyspark.python", "/usr/bin/python3")
    # Container executor class and Docker image, set for both the app master and the executors.
    .config("spark.yarn.appMasterEnv.yarn.nodemanager.container-executor.class", "DockerLinuxContainer")
    .config("spark.executorEnv.yarn.nodemanager.container-executor.class", "DockerLinuxContainer")
    .config("spark.yarn.appMasterEnv.yarn.nodemanager.docker-container-executor.image-name", "bdp-docker.jd.com:5000/wise_mart_rmb_py36:latest")
    .config("spark.executorEnv.yarn.nodemanager.docker-container-executor.image-name", "bdp-docker.jd.com:5000/wise_mart_rmb_py36:latest")
    .getOrCreate()
)


import spa_utils

# Session-level Hive settings: turn on dynamic partitioning in non-strict mode and
# raise the dynamic-partition limits (overall and per node).
for hive_setting in (
    "set hive.exec.dynamic.partition=true",
    "set hive.exec.dynamic.partition.mode=nonstrict",
    "set hive.exec.max.dynamic.partitions=200551",
    "set hive.exec.max.dynamic.partitions.pernode=200551",
):
    spark.sql(hive_setting)

# Notebook-level parameters.
params = dict(author='xiaoxiao10')

In [2]:
import datetime
import sys
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import ElasticNetCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error,f1_score,precision_score,recall_score,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost
from sklearn.neural_network import MLPRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from scipy.spatial.distance import cosine
# Re-binds `name` to the notebook's top-level namespace dict; the first cell performs the same assignment.
name = locals()

In [3]:
# Read the base table and keep only the rows of the single SKU studied in this notebook.
target_sku_id = '100000022911'
base_table = spark.sql('''select * from dev.xianzhi_v2_basedata_1''')
df = base_table.where(F.col('item_sku_id') == target_sku_id)

In [4]:
# Mark df for caching and run count() so it is evaluated now; the row count is the cell's output.
df.cache().count()

406

## 该版本目标：仅通过price这唯一features来进行预测，将全部流程跑通

In [5]:
# Keep only the columns this price-only version uses and collect them to the driver as pandas.
df_1 = df.select(*['item_sku_id', 'dt', 'netprice', 'qtty'])
df_1_pandas = df_1.toPandas()

# Net prices below 0.1 are overwritten with a fixed 60; every other value passes through unchanged.
netprice_fixed = df_1_pandas['netprice'].map(lambda price: price if not price < 0.1 else 60)
df_1_pandas['netprice'] = netprice_fixed

In [6]:
# Order the rows by date (ascending) into a new frame; df_1_pandas itself is left untouched.
df_1_pandas_1 = df_1_pandas.sort_values(by='dt')

In [7]:
# Rename the price / quantity columns to the target_* names used by the schema and column lists below.
target_column_names = {'netprice': 'target_price', 'qtty': 'target_qtty'}
df_1_pandas_2 = df_1_pandas_1.rename(columns=target_column_names)

In [26]:
# Register the helper module file with the SparkContext.
spark.sparkContext.addPyFile('SPA_simulation_functions.py')

# Columns handed to the per-SKU computation.
selected_columns = [
    'item_sku_id',
    'dt',
    'target_price',
    'target_qtty',
    'test_flag',
    'valid_flag',
]
# Feature columns: price is the only feature in this version.
X_SCHEMA_SKU = ['target_price']
# No rolling columns are used in this version.
rolling_columns = []
# Explicit schema of the per-SKU result rows; field order matches the tuple
# produced by format_result_sku.
SCHEMA_OUTPUT_SKU = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("dt", sql_type.StringType()),
        ("item_sku_id", sql_type.StringType()),
        ("target_price", sql_type.FloatType()),
        ("prediction", sql_type.FloatType()),
        ("r2_predict", sql_type.FloatType()),
        ("r2_test", sql_type.FloatType()),
        ("mse", sql_type.FloatType()),
        ("mape", sql_type.FloatType()),
        ("model_type", sql_type.StringType()),
        ("model", sql_type.StringType()),
        ("feature_importance", sql_type.StringType()),
        ("valid_flag", sql_type.FloatType()),
        ("target_qtty", sql_type.FloatType()),
    )
])


def format_result_sku(row):
    """Convert one result row into a plain tuple ordered like SCHEMA_OUTPUT_SKU.

    Args:
        row: a mapping-like object supporting ``row[key]`` for the keys
            'dt', 'item_sku_id', 'target_price', 'prediction', 'r2_predict',
            'r2_test', 'mse', 'mape', 'model_type', 'model',
            'feature_importance', 'valid_flag' and 'target_qtty'.

    Returns:
        A 13-tuple. The string fields are passed through ``str()``. The numeric
        fields are passed through ``float()``, except that a missing value
        (``None``) is kept as ``None`` so it becomes a null in the nullable
        float column instead of raising ``TypeError``.
    """
    def _as_float(value):
        # float(None) raises TypeError; a missing metric/prediction must stay null.
        return None if value is None else float(value)

    return (
        str(row['dt']),
        str(row['item_sku_id']),
        _as_float(row['target_price']),
        _as_float(row['prediction']),
        _as_float(row['r2_predict']),
        _as_float(row['r2_test']),
        _as_float(row['mse']),
        _as_float(row['mape']),
        str(row['model_type']),
        str(row['model']),
        str(row['feature_importance']),
        _as_float(row['valid_flag']),
        _as_float(row['target_qtty'])
    )

In [18]:
# Convert the renamed pandas frame back into a Spark DataFrame
# (despite the `pandas` in its name, df_1_pandas_3 is a Spark DataFrame).
df_1_pandas_3 = spark.createDataFrame(df_1_pandas_2)
# Cache it and force evaluation; the row count is the cell's output.
df_1_pandas_3.cache().count()

406

In [19]:
# Models to run. Only 'lr' is enabled here; the full list would be
# ['lr', 'hr', 'rf', 'prophet'].
update_predict = ['lr']

# Label every (sku, dt) pair by recency: rank 1 is the latest date of the SKU.
#   valid_flag 1 -> ranks 1..30
#   valid_flag 2 -> ranks up to 45 not already taken by flag 1 (rank 30 matches the first branch)
#   valid_flag 0 -> everything older
sku_date_key = ['item_sku_id', 'dt']
recency_window = Window.partitionBy('item_sku_id').orderBy(F.col('dt').desc())
recency_rank = F.col('rank')
valid_flag_expr = (
    F.when(recency_rank <= 30, 1)
    .when(recency_rank.between(30, 45), 2)
    .otherwise(F.lit(0))
)
df_dt = (
    df_1_pandas_3.select(*sku_date_key).distinct()
    .withColumn('rank', F.row_number().over(recency_window))
    .withColumn('valid_flag', valid_flag_expr)
)

# Attach the flag to every data row.
df_lu = df_1_pandas_3.join(
    df_dt.select('item_sku_id', 'dt', 'valid_flag'),
    on=sku_date_key,
    how='inner',
)

# Keep only SKUs with enough training data: at least 30 rows with valid_flag == 0.
# (An earlier variant also counted valid_flag 4 and 5 here.)
training_rows = df_lu.filter(F.col('valid_flag').isin([0]))
df_sku_count = (
    training_rows.groupBy('item_sku_id')
    .agg(F.count('dt').alias('count'))
    .filter(F.col('count') >= 30)
)
df_2 = df_lu.join(df_sku_count.select('item_sku_id'), on=['item_sku_id'], how='inner').cache()
df_2.count()

406

In [20]:
# Print the first rows of df_2 as a quick visual check of the valid_flag labelling.
df_2.show()

+------------+----------+------------------+-----------+----------+
| item_sku_id|        dt|      target_price|target_qtty|valid_flag|
+------------+----------+------------------+-----------+----------+
|100000022911|2018-10-29| 297.5365853658537|         41|         0|
|100000022911|2019-05-12|251.15621621621622|         37|         0|
|100000022911|2019-03-16|277.57142857142856|         14|         0|
|100000022911|2019-09-27|218.38636363636363|         44|         1|
|100000022911|2019-01-18|254.07142857142858|         28|         0|
|100000022911|2019-04-25| 248.3776397515528|        161|         0|
|100000022911|2018-10-27|297.97297297297297|         37|         0|
|100000022911|2019-05-10| 251.0738888888889|         18|         0|
|100000022911|2018-10-23|288.04545454545456|         44|         0|
|100000022911|2019-05-06|248.78050000000002|         40|         0|
|100000022911|2018-12-28|257.05882352941177|         17|         0|
|100000022911|2019-07-11| 266.3478260869565|    

In [21]:
# NOTE(review): the wildcard import hides which names come from this module. A later cell calls
# calculate_baseline_sku, which is not defined in this notebook and presumably comes from here —
# confirm, and prefer an explicit import list.
from SPA_simulation_functions import *
# Register the module file with the SparkContext (the same call also appears in the schema cell).
spark.sparkContext.addPyFile('SPA_simulation_functions.py')

In [22]:
# Output the validation-set results; each SKU has results from several models.
# Rows with valid_flag == 0 get test_flag 0, every other row gets test_flag 1.
# (An earlier variant also mapped valid_flag 4 and 5 to test_flag 0.)
is_flag_zero = F.col('valid_flag').isin([0])
df_2 = df_2.withColumn('test_flag', F.when(is_flag_zero, F.lit(0)).otherwise(F.lit(1)))

In [27]:
# Key every selected row by its SKU id, gather the rows of each SKU together, and
# run calculate_baseline_sku once per (sku, rows) group; flatMap flattens what it returns.
rows_by_sku = df_2.select(selected_columns).rdd.keyBy(lambda row: row['item_sku_id']).groupByKey()
result2 = rows_by_sku.flatMap(
    lambda sku_group: calculate_baseline_sku(
        sku_group, update_predict, selected_columns, X_SCHEMA_SKU, rolling_columns, 'self'
    )
)

# Convert each result row to a plain tuple and apply the explicit output schema.
formatted_rows = result2.map(format_result_sku)
result_df2 = spark.createDataFrame(formatted_rows, schema=SCHEMA_OUTPUT_SKU)

# result_df2 = result_df2.na.drop(subset=['prediction'])
# result_df2.cache()
# result_df2.count()

In [28]:
# Cache the result frame and trigger the per-SKU computation; the number of result rows is the cell's output.
result_df2.cache().count()

45