In [36]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVR
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import os
from datetime import datetime
import time

In [24]:
def weighted_mean_absolute_error(y_true, y_pred, weight=None):
    if weight is None:
        weight = np.ones(y_true.shape[0])
    weight_sum = weight.sum()
    wmea = 1/weight_sum * (weight*np.abs(y_true-y_pred)).sum()
    return wmea

In [26]:
def train(X_train, y_train, X_test=None, y_test=None, weight=None):
    # model
    model = RandomForestRegressor()
    print('Training')

    print(X_train.shape)
    model.fit(X=X_train, y=y_train)

    # evaluate
    if X_test is not None:
        y_pred = model.predict(X=X_test)
        wmae = weighted_mean_absolute_error(y_test, y_pred, weight)
        print("WMAE: ", wmae)

In [27]:
data_df = pd.read_csv('data/data.csv')

In [28]:
def convert_date(date):
    if date is None:
        return None
    date = datetime.strptime(date, '%Y-%m-%d')
    return date


def get_holiday(date):
    holidays = {
        "2010-02-12": "super_bowl",
        "2011-02-11": "super_bowl",
        "2012-02-10": "super_bowl",
        "2013-02-08": "super_bowl",
        "2010-09-10": "labor_day",
        "2011-09-09": "labor_day",
        "2012-09-07": "labor_day",
        "2013-09-06": "labor_day",
        "2010-11-26": "thanksgiving",
        "2011-11-25": "thanksgiving",
        "2012-11-23": "thanksgiving",
        "2013-11-29": "thanksgiving",
        "2010-12-31": "christmas",
        "2011-12-30": "christmas",
        "2012-12-28": "christmas",
        "2013-12-27": "christmas"
    }
    if date not in holidays:
        return 'not_holiday'
    return holidays[date]


def process_date(df):
    df['Year'] = df['Date'].apply(lambda date: convert_date(date).year)
    df['Month'] = df['Date'].apply(lambda date: convert_date(date).month)
    df['Week'] = df['Date'].apply(lambda date: convert_date(date).isocalendar()[1])
    df['Holiday'] = df['Date'].apply(lambda date: get_holiday(date))
    return df


In [29]:
def convert_type(x):
    type_ = {
        'A': 3,
        'B': 2,
        'C': 1
    }
    return type_[x]

def process_type(df):
    df['Type_'] = df['Type'].apply(lambda x: convert_type(x))
    return df

In [30]:
data_df = process_date(data_df)
data_df = process_type(data_df)
print(data_df.head())

   Store  Dept        Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  \
0      1     1  2010-02-05      24924.50      False        42.31       2.572   
1      1     2  2010-02-05      50605.27      False        42.31       2.572   
2      1     3  2010-02-05      13740.12      False        42.31       2.572   
3      1     4  2010-02-05      39954.04      False        42.31       2.572   
4      1     5  2010-02-05      32229.38      False        42.31       2.572   

   MarkDown1  MarkDown2  MarkDown3  ...  MarkDown5         CPI  Unemployment  \
0        NaN        NaN        NaN  ...        NaN  211.096358         8.106   
1        NaN        NaN        NaN  ...        NaN  211.096358         8.106   
2        NaN        NaN        NaN  ...        NaN  211.096358         8.106   
3        NaN        NaN        NaN  ...        NaN  211.096358         8.106   
4        NaN        NaN        NaN  ...        NaN  211.096358         8.106   

   Type    Size  Year  Month  Week    

## Train base model

In [31]:
def get_weight(df):
    df['weight'] = df['IsHoliday'].apply(lambda x: 5 if x is True else 1)

    return df

In [48]:
train_df, test_df = train_test_split(data_df, test_size=0.2)

train_df_feature = train_df[['Store', 'Dept', 'IsHoliday', 'Week', 'Size', 'Year', 'Type_', 'Month', 'Weekly_Sales']]
test_df_feature = test_df[['Store', 'Dept', 'IsHoliday', 'Week', 'Size', 'Year', 'Type_', 'Month', 'Weekly_Sales']]

y_train = train_df_feature['Weekly_Sales'].to_numpy()
X_train = train_df_feature.drop(columns=['Weekly_Sales']).to_numpy()

y_test = test_df_feature['Weekly_Sales'].to_numpy()
X_test = test_df_feature.drop(columns=['Weekly_Sales']).to_numpy()
weight = get_weight(test_df_feature)['weight'].to_numpy()
train(X_train, y_train, X_test, y_test, weight)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Training
(337256, 8)
WMAE:  1541.8189598371705


Traing model base with features ['Store', 'Dept', 'IsHoliday', 'Week', 'Size', 'Year', 'Type_', 'Month', 'Weekly_Sales'] have result:

WMAE:  1541.930263117955

## Experiment with holiday

In [94]:
# build labelencoder for holiday
holi_encoder = LabelEncoder()
holi_encoder.fit(data_df['Holiday'].tolist())

LabelEncoder()

In [95]:
train_df.columns

Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Temperature',
       'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4',
       'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Year', 'Month',
       'Week', 'Holiday', 'Type_'],
      dtype='object')

In [97]:
# add holiday train

holiday_train = holi_encoder.transform(train_df['Holiday'].tolist())
print(X_train.shape)
print(holiday_train.shape)
X_train_ = np.c_[X_train, holiday_train]
print('-->', X_train_.shape)

# add holiday test

holiday_test = holi_encoder.transform(test_df['Holiday'].tolist())
print(X_test.shape)
print(holiday_test.shape)
X_test_ = np.c_[X_test, holiday_test]
print('-->', X_test_.shape)

train(X_train_, y_train, X_test_, y_test, weight)

(337256, 8)
(337256,)
--> (337256, 9)
(84314, 8)
(84314,)
--> (84314, 9)
Training
(337256, 9)




WMAE:  1545.3707461646388


WMAE:  1519.7788043827425

##  Use Markdown 

In [72]:
# process fill na markdown
def fill_na(df: pd.DataFrame):
    df['MarkDown1'] = df['MarkDown1'].fillna(value=0.0)
    df['MarkDown2'] = df['MarkDown2'].fillna(value=0.0)
    df['MarkDown3'] = df['MarkDown3'].fillna(value=0.0)
    df['MarkDown4'] = df['MarkDown4'].fillna(value=0.0)
    df['MarkDown5'] = df['MarkDown5'].fillna(value=0.0)
    return df

In [88]:
train_df_mk = fill_na(train_df)
test_df_mk = fill_na(test_df)

markdown_train = train_df_mk[['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']].to_numpy()
markdown_test = test_df_mk[['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']].to_numpy()

print('Train shape')
print(X_train.shape)
print(markdown_train.shape)
X_train_ = np.c_[X_train, markdown_train]
print('-->', X_train_.shape)

print('Test shape')
print(X_test.shape)
print(markdown_test.shape)
X_test_ = np.c_[X_test, markdown_test]
print('-->', X_test_.shape)

train(X_train_, y_train, X_test_, y_test, weight)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

Train shape
(337256, 8)
(337256, 5)
--> (337256, 13)
Test shape
(84314, 8)
(84314, 5)
--> (84314, 13)
Training
(337256, 13)




WMAE:  1635.5087963612532


Markdown WMAE:  1623.6446441098826

## Use CPI

In [89]:
cpi_train = train_df_mk['CPI'].to_numpy()
cpi_test = test_df_mk['CPI'].to_numpy()

print('Train shape')
print(X_train.shape)
print(cpi_train.shape)
X_train_ = np.c_[X_train, cpi_train]
print('-->', X_train_.shape)

print('Test shape')
print(X_test.shape)
print(cpi_test.shape)
X_test_ = np.c_[X_test, cpi_test]
print('-->', X_test_.shape)

train(X_train_, y_train, X_test_, y_test, weight)

Train shape
(337256, 8)
(337256,)
--> (337256, 9)
Test shape
(84314, 8)
(84314,)
--> (84314, 9)
Training
(337256, 9)




WMAE:  1631.7475901286764


CPI : WMAE:  1613.1839471726325

## Use unemployment

In [90]:
unemployment_train = train_df_mk['Unemployment'].to_numpy()
unemployment_test = test_df_mk['Unemployment'].to_numpy()

print('Train shape')
print(X_train.shape)
print(unemployment_train.shape)
X_train_ = np.c_[X_train, unemployment_train]
print('-->', X_train_.shape)

print('Test shape')
print(X_test.shape)
print(unemployment_test.shape)
X_test_ = np.c_[X_test, unemployment_test]
print('-->', X_test_.shape)

train(X_train_, y_train, X_test_, y_test, weight)

Train shape
(337256, 8)
(337256,)
--> (337256, 9)
Test shape
(84314, 8)
(84314,)
--> (84314, 9)
Training
(337256, 9)




WMAE:  1589.6848474993997


Unemployment: WMAE:  1599.0173811361162

## Use temperature

In [91]:
temperature_train = train_df_mk['Temperature'].to_numpy()
temperature_test = test_df_mk['Temperature'].to_numpy()

print('Train shape')
print(X_train.shape)
print(temperature_train.shape)
X_train_ = np.c_[X_train, temperature_train]
print('-->', X_train_.shape)

print('Test shape')
print(X_test.shape)
print(temperature_test.shape)
X_test_ = np.c_[X_test, temperature_test]
print('-->', X_test_.shape)

train(X_train_, y_train, X_test_, y_test, weight)

Train shape
(337256, 8)
(337256,)
--> (337256, 9)
Test shape
(84314, 8)
(84314,)
--> (84314, 9)
Training
(337256, 9)




WMAE:  1627.0249164251297


temperature: WMAE:  1616.27552953828

## Use Fuel_Frice

In [92]:
fuel_price_train = train_df_mk['Fuel_Price'].to_numpy()
fuel_price_test = test_df_mk['Fuel_Price'].to_numpy()

print('Train shape')
print(X_train.shape)
print(fuel_price_train.shape)
X_train_ = np.c_[X_train, fuel_price_train]
print('-->', X_train_.shape)

print('Test shape')
print(X_test.shape)
print(fuel_price_test.shape)
X_test_ = np.c_[X_test, fuel_price_test]
print('-->', X_test_.shape)

train(X_train_, y_train, X_test_, y_test, weight)

Train shape
(337256, 8)
(337256,)
--> (337256, 9)
Test shape
(84314, 8)
(84314,)
--> (84314, 9)
Training
(337256, 9)




WMAE:  1613.772685371167


Fuel_Price: WMAE:  1622.10130435505

## Markdown and holiday

In [98]:
markdown_holiday_train = np.c_[markdown_train, holiday_train]

print('Train shape')
print(X_train.shape)
print(markdown_holiday_train.shape)
X_train_ = np.c_[X_train, markdown_holiday_train]
print('-->', X_train_.shape)


markdown_holiday_test = np.c_[markdown_test, holiday_test]

print('Test shape')
print(X_test.shape)
print(markdown_holiday_test.shape)
X_test_ = np.c_[X_test, markdown_holiday_test]
print('-->', X_test_.shape)

train(X_train_, y_train, X_test_, y_test, weight)

Train shape
(337256, 8)
(337256, 6)
--> (337256, 14)
Test shape
(84314, 8)
(84314, 6)
--> (84314, 14)
Training
(337256, 14)




WMAE:  1620.59205675042
