In [2]:
import os.path as osp
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
from scipy.special import inv_boxcox
from scipy.stats import skew, norm, boxcox
import matplotlib.pyplot as plt
from myutils import JXPP_ETA, Plot, cal_distance, PKL
 

import warnings
warnings.filterwarnings("ignore")

In [3]:
from pyspark import SparkContext, SparkConf, HiveContext
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
import os
import time
def getPsyData(dt_input, output_path='psy7.txt'):
    """Dump one `dt` partition of a Hive table into a local text file.

    Runs ``hive -e "select * from fdm.fdm_sl1000000003147_ql_gis_gps where
    dt='<dt_input>';"`` through the shell and redirects its stdout to
    ``output_path``.

    Parameters
    ----------
    dt_input : str
        Value compared against the ``dt`` column in the WHERE clause.
    output_path : str, optional
        File that receives the query output. Defaults to ``'psy7.txt'``,
        the path the original implementation always wrote to.

    Returns
    -------
    int
        The value returned by ``os.system`` (0 when the command succeeded).

    Raises
    ------
    ValueError
        If ``dt_input`` contains a single quote or a backslash, either of
        which could terminate or alter the SQL string literal.
    """
    # Local import: shlex is only needed to build this one command line.
    import shlex

    # dt_input ends up inside a single-quoted SQL literal; refuse anything
    # that could close that literal early.
    if "'" in dt_input or "\\" in dt_input:
        raise ValueError(
            "dt_input must not contain quotes or backslashes: {!r}".format(dt_input)
        )

    print("getting"+dt_input+"psyData!!!")
    query = "select * from fdm.fdm_sl1000000003147_ql_gis_gps where dt='{}';".format(dt_input)
    # shlex.quote makes the shell treat the query and the file name as single
    # literal words, so characters such as $, ` or spaces are never expanded.
    command = "hive -e {} > {}".format(shlex.quote(query), shlex.quote(output_path))
    print(command)
    status = os.system(command)
    if status != 0:
        # The original silently ignored failures; surface them instead.
        print("hive command failed with status {}".format(status))
    return status

# 模型

In [4]:
# Load the regression sample and give it a fresh 0..n-1 RangeIndex
# (the previous index is discarded, not kept as a column).
df_reg = pd.read_csv('prc/df_gt_sample.csv').reset_index(drop=True)

In [5]:
# Numeric feature columns passed to the preprocessing pipeline.
num_cols = [
    'distance',
    'sum_skus',
    'sum_volume',
    'sum_weight',
    'sum_price',
    'club_order',
    'club_total',
]
# Alternative feature sets kept for reference (currently disabled):
# num_cols = ['sum_skus', 'sum_volume', 'sum_weight', 'sum_price', 'sum_volume_2', 'sum_weight_2', 'sum_price_2', 'driver_avg_time', 'club_order', 'club_total', 'club_finish', 'skus_finish', 'weight_finish', 'volume_finish', 'price_finish']
# num_cols = ['club_avg_time']
# Disabled ordinal column list (the entry is a column name, kept verbatim):
# ord_cols = ['核心团']

In [6]:
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Numeric branch: standardise first, then mean-impute, in that order.
numeric_steps = (StandardScaler(), SimpleImputer(strategy='mean'))
num_processor = make_pipeline(*numeric_steps)

# Ordinal encoder is instantiated here but not attached to any transformer
# in this cell.
cat_processor = OrdinalEncoder()

# Only the numeric columns are transformed; an (OrdinalEncoder(), ord_cols)
# entry was previously considered and is disabled.
linear_preprocessor = make_column_transformer((num_processor, num_cols))

In [7]:
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.linear_model import Lasso, Ridge, RidgeCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, AdaBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting 

# Fixed seed for the tree ensembles so that a fresh Restart & Run All
# reproduces the same fitted models (and therefore the same pickled model).
RANDOM_STATE = 2333

# Each candidate regressor sits behind the same `linear_preprocessor` object.
ridge_pipeline = make_pipeline(linear_preprocessor, Ridge())
lasso_pipeline = make_pipeline(linear_preprocessor, Lasso())
svr_pipeline = make_pipeline(linear_preprocessor, SVR())
gbdt_pipeline = make_pipeline(
    linear_preprocessor, GradientBoostingRegressor(random_state=RANDOM_STATE)
)
rf_pipeline = make_pipeline(
    linear_preprocessor, RandomForestRegressor(random_state=RANDOM_STATE)
)

## cross_validate用法

In [8]:
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold
# Number of CV folds; used for cross_validate and cross_val_predict alike so
# the two can never drift apart.
N_SPLITS = 10

num_cols = ['distance', 'sum_skus', 'sum_volume', 'sum_weight', 'sum_price', 'club_order', 'club_total']
X = df_reg[num_cols]
y = df_reg['driving_time']
# With lmbda=None, boxcox fits lambda itself and returns (transformed, lambda).
# NOTE(review): lambda is fitted on the full target before any CV split, so
# every fold sees a transform estimated partly from its own held-out rows.
y, lambda0 = boxcox(y, lmbda=None, alpha=None)

# ('randomforest', rf_pipeline) can be appended here to include the forest.
estimators = [('Ridge', ridge_pipeline), ('Lasso', lasso_pipeline), ('gbdt', gbdt_pipeline)]
# Built for optional comparison; it is not part of the loop below.
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=Ridge())

# Target back on its original scale; loop-invariant, so computed once.
y_true = inv_boxcox(y, lambda0)

for name, est in estimators:
    # Label the output so results of different estimators can be told apart.
    print('== {} =='.format(name))

    # The built-in scorer works on the Box-Cox scale; the metrics printed
    # below are recomputed on the original scale instead.
    score = cross_validate(est, X, y,
                           scoring=['neg_mean_absolute_percentage_error'],
                           cv=N_SPLITS,
                           verbose=0,
                           return_train_score=True,
                           return_estimator=True)

    # One fitted estimator per fold.
    # NOTE(review): each is scored on the full X, which includes the rows it
    # was trained on, so these numbers are mostly in-sample.
    for j, fold_estimator in enumerate(score['estimator']):
        y_pred_by_one = fold_estimator.predict(X)
        pred_by_one = inv_boxcox(y_pred_by_one, lambda0)
        print(j, 'MAPE={:.2f}'.format(mean_absolute_percentage_error(y_true, pred_by_one)),
              'MAE={:.2f}'.format(np.mean(np.abs(pred_by_one - y_true))))

    # Out-of-fold predictions: every row is predicted by a model that did not
    # see it during fitting.
    y_pred = cross_val_predict(est, X, y, cv=N_SPLITS, verbose=0)
    print('MAPE={:.2f}'.format(mean_absolute_percentage_error(y_true, inv_boxcox(y_pred, lambda0))))
    

0 MAPE=0.87 MAE=7.57
1 MAPE=0.87 MAE=7.57
2 MAPE=0.87 MAE=7.56
3 MAPE=0.90 MAE=7.57
4 MAPE=0.91 MAE=7.57
5 MAPE=0.91 MAE=7.57
6 MAPE=0.91 MAE=7.57
7 MAPE=0.91 MAE=7.57
8 MAPE=0.91 MAE=7.57
9 MAPE=0.91 MAE=7.57
MAPE=0.93


In [68]:
# Persist the fitted model and the Box-Cox lambda needed to invert its
# predictions.
# NOTE(review): `score` is whatever the cross-validation loop assigned last,
# i.e. the result for the final entry of `estimators`; index 0 picks its first
# fold's estimator. The execution count of this cell is far ahead of the
# previous one, so confirm which run produced `score` before trusting the file.
# PKL.output comes from myutils; presumably it pickles the object to the given
# path - TODO confirm.
model_path = 'models/{}'.format('drive')
lambda_path = 'models/{}'.format('drive_lambda0')
first_fold_model = score['estimator'][0]
PKL.output(first_fold_model, model_path)
PKL.output(lambda0, lambda_path)