## 引入所需库

In [4]:
import pandas as pd
import datetime
import numpy as np
import scipy as sp
import csv
import os
import xgboost as xgb
import itertools
import operator
import warnings 
warnings.filterwarnings("ignore")

In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_validate
from matplotlib import pylab as plt
# Plotting toggle; not referenced in the visible part of this file --
# presumably read by later cells (TODO confirm).
plot = True
# Column name 'Sales' (present in the train.csv preview, absent from the
# test.csv preview) -- presumably the prediction target; verify against users.
goal = 'Sales'
# Name of the row-identifier column; process_data lists it among the
# "noisy" columns that are removed from the feature list.
myid = 'Id'

## 定义一些变换和评判准则

In [6]:
def ToWeight(y):
    """Return element-wise weights ``1 / y**2``, using 0 wherever ``y`` is 0.

    Args:
        y: Array-like of target values supporting ``.shape`` and boolean
            mask indexing (e.g. a NumPy array).

    Returns:
        A float NumPy array with the same shape as ``y``.
    """
    # Mask of the entries that can safely be inverted.
    nonzero = y != 0
    weights = np.zeros(y.shape, dtype=float)
    # Zero targets keep their initial weight of 0, avoiding a division by zero.
    weights[nonzero] = 1.0 / np.square(y[nonzero])
    return weights

def rmspe(yhat,y):
    """Return the root mean square percentage error (RMSPE) of ``yhat`` vs ``y``.

    Each squared error ``(y - yhat)**2`` is weighted by ``ToWeight(y)``, i.e.
    ``1 / y**2`` for non-zero targets and 0 for zero targets, so zero targets
    add nothing to the sum but are still counted by the mean.

    Args:
        yhat: Array-like of predicted values.
        y: Array-like of true values, same shape as ``yhat``.

    Returns:
        The RMSPE as a float.
    """
    w = ToWeight(y)
    # Return the computed score; previously the value was discarded and the
    # function object itself was returned.
    return np.sqrt(np.mean(w * (y - yhat)**2))

def rmspe_xg(yhat, y):
    """Compute RMSPE for a labelled dataset and return it as a named metric.

    Both the labels and the predictions are mapped through ``exp(x) - 1``
    before scoring -- presumably because the model is fitted on
    ``log(Sales + 1)`` (TODO confirm against the training cell).

    Args:
        yhat: Array-like of predictions, on the same transformed scale as
            the labels.
        y: Object exposing ``get_label()`` (presumably an ``xgboost.DMatrix``;
            verify against the caller).

    Returns:
        The tuple ``("rmspe", value)``.
    """
    actual = np.exp(y.get_label()) - 1
    predicted = np.exp(yhat) - 1
    weights = ToWeight(actual)
    score = np.sqrt(np.mean(weights * (actual - predicted)**2))
    return "rmspe", score

In [7]:
# Load the store table from store.csv (path is relative to the working directory).
store = pd.read_csv('store.csv')

In [10]:
# Preview the first rows of the store table.
store.head()

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [11]:
# Load the training table from train.csv (path is relative to the working directory).
train_df = pd.read_csv('train.csv')

In [12]:
# Preview the first 10 rows of the training table.
train_df.head(10)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1
5,6,5,2015-07-31,5651,589,1,1,0,1
6,7,5,2015-07-31,15344,1414,1,1,0,1
7,8,5,2015-07-31,8492,833,1,1,0,1
8,9,5,2015-07-31,8565,687,1,1,0,1
9,10,5,2015-07-31,7185,681,1,1,0,1


In [21]:
# Load the test table from test.csv (path is relative to the working directory).
test_df = pd.read_csv('test.csv')

In [23]:
# Preview the first rows of the test table.
test_df.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


## 加载数据

In [16]:
def load_data():
    """
    Load the store, train and test tables and classify the feature columns.

    Reads ``store.csv``, ``train.csv`` and ``test.csv`` from the current
    working directory, then left-joins the store table onto the train and
    test tables on the ``Store`` column.

    Returns:
        tuple: ``(train, test, features, features_non_numeric)`` where
        ``train`` and ``test`` are the merged DataFrames, ``features`` is the
        list of all columns of the merged test table, and
        ``features_non_numeric`` is the subset of ``features`` whose dtype is
        not one of the listed integer/float dtypes.
    """
    store = pd.read_csv('store.csv')
    # Force StateHoliday to be parsed as text instead of letting pandas infer
    # a dtype. The built-in ``str`` is used because the former
    # ``pd.np.string_`` spelling is gone: the ``pandas.np`` alias was removed
    # in pandas 2.0 and ``np.string_`` in NumPy 2.0.
    holiday_dtype = {'StateHoliday': str}
    train_org = pd.read_csv('train.csv', dtype=holiday_dtype)
    test_org = pd.read_csv('test.csv', dtype=holiday_dtype)
    # A left join keeps every train/test row even when its store has no
    # matching row in the store table.
    train = pd.merge(train_org, store, on='Store', how='left')
    test = pd.merge(test_org, store, on='Store', how='left')
    # The feature list is derived from the test table's columns.
    features = test.columns.tolist()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    features_numeric = test.select_dtypes(include=numerics).columns.tolist()
    features_non_numeric = [f for f in features if f not in features_numeric]
    return train, test, features, features_non_numeric
# features
# ['Id','Store','DayOfWeek','Date','Open','Promo','StateHoliday','SchoolHoliday','StoreType','Assortment',
#  'CompetitionDistance', 'CompetitionOpenSinceMonth','CompetitionOpenSinceYear','Promo2','Promo2SinceWeek',
#  'Promo2SinceYear','PromoInterval']

# features_non_numeric ['Date', 'StateHoliday', 'StoreType', 'Assortment', 'PromoInterval']

In [24]:
# load_data()

## 数据与特征处理

In [None]:
def process_data(train,test,features,features_non_numeric):
    """
    Feature engineering and selection
    """
    ##   Feature engineering
    train = train[train['Sales'] > 0]
    for data in [train,test]:
        
        # year month day
        data['year'] = data.Date.apply(lambda x: x.split('-')[0])
        data['year'] = data['year'].astype(float)
        data['month'] = data.Date.apply(lambda x: x.split('-')[1])
        data['month'] = data['moth'].astype(float)
        data['day'] = data.Date.apply(lambda x: x.split('-')[2])
        data['day'] = data['data'].astype(float)
        
        # promo interval "Jan,APr,Jul,Oct"
        data['promojan'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x, float) else 1 if "Jan" in x else 0)
        # TypeError: 
        data['promofed'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "Feb" in x else 0)
        data['promomar'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "Mar" in x else 0)
        data['promomapr'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "Apr" in x else 0)
        data['promomay'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "May" in x else 0)
        data['promomjun'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "Jun" in x else 0)
        data['promojul'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "Jul" in x else 0)
        data['promoaug'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) esle 1 if "Aug" in x else 0)
        data['promosep'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "Sep" in x else 0)
        data['promooct'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "Oct" in x else 0)
        data['promonov'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) else 1 if "Nov" in x else 0)
        data['promodec'] = data.PromoInterval.apply(lambda x: 0 if isinstance(x,float) esle 1 if "Dec" in x else 0)
        
        # Features set
        noisy_features = [myid,'Date']
        features = [c for c in features if c not in noisy_features]
        features_non_numeric = [c for c in features_non_numeric if c not in noisy_features]
        features.extend(['year','month','day'])
        
        # Fill NA
        class DataFrameImputer(transformerMixin):
            def __init__(self):
                