### Install Libraries

In [None]:
#!pip install tsmoothie

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import pickle
import calendar
from datetime import datetime, timedelta
import os
import re
from scipy.stats import ttest_ind
from bioinfokit.analys import stat
import operator

### Import data

In [None]:
# Load the train/test frames stored as pickles under ../output/.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
train_df = pd.read_pickle('../output/train_df.pkl')
test_df = pd.read_pickle('../output/test_df.pkl')

In [None]:
len(train_df)

In [None]:
len(test_df)

### Helper

In [None]:
def group_weekend(day):
    """This function flags whether a day name belongs to the weekend.
    Friday, Saturday, and Sunday count as the weekend; anything else is a weekday.
    The name is compared case-insensitively and surrounding whitespace is ignored.
    Input: 
    day -> string (name of the day)
    
    Output: 
    int (1 for a weekend day, 0 otherwise)"""
    
    weekend_names = ('friday', 'saturday', 'sunday')
    # normalise the incoming value before the lookup
    normalized = str(day).lower().strip()
    
    return 1 if normalized in weekend_names else 0

In [None]:
def find_the_last_day(data):
    """This function will help us identify the last day of each month. 
    This is needed because February can have a different length in leap years.
    We need to identify both the 15th and the last day of each month because
    that's when employees get paid in Ecuador.
    Input: 
    data -> dataframe (must contain integer 'date_year' and 'date_month' columns)
    
    Output: 
    res -> list of int (last day of the month, one entry per row, in row order)"""
    
    # Walk the two columns by position instead of looking rows up by label,
    # so the result is still correct when the index is not 0..n-1
    # (for example after filtering or merging).
    return [
        calendar.monthrange(int(year), int(month))[1]
        for year, month in zip(data['date_year'], data['date_month'])
    ]

In [None]:
def find_payday(data):
    """This function will help us identify the 15th and the last day of each month.
    Input: 
    data -> dataframe (must contain 'date_day' and 'last_day' columns)
    
    Output: 
    data -> dataframe (the same dataframe with an 'is_payday' column: 1 on paydays, 0 otherwise)"""
    
    cond = (data['date_day'] == 15) | (data['date_day'] == data['last_day'])
    # store the flag under a real column name; it was previously written
    # to a column whose name was the empty string
    data['is_payday'] = np.where(cond, 1, 0)
    
    return data

In [None]:
def add_range(data, num):
    """This function will shift every date by a fixed number of days.
    Input: 
    data -> dataframe (original dataframe with a 'date' column)
    num -> int (number of days to add to each date; may be negative)
    
    Output: 
    data -> dataframe (the same dataframe with 'date' converted to datetime
    and a new 'added_dates' column holding date + num days)"""
    
    target_range = timedelta(days = num)
    
    # convert the column on the dataframe that was passed in
    # (the previous code referenced an undefined name here)
    data['date'] = pd.to_datetime(data['date'])
    data['added_dates'] = data['date'] + target_range
    
    return data

In [None]:
def find_the_time_range(data, num_days):
    """This function will help identify the difference in terms of sales around 'Transferred' holidays.
    For every offset k in 1..num_days it adds the date k days before/after each row
    ('-k_delta' / '+k_delta') and the average sales on that date
    ('-k_delta_sales' / '+k_delta_sales', NaN when the date is not in the data).
    Input:
    data -> dataframe (must contain 'date' (datetime), 'transferred' and 'sales' columns)
    num_days -> time windows that we want to focus in (int)
    
    Output:
    new_data -> dataframe (rows of transferred holidays only, with the delta
    dates and their average sales; the input dataframe is not modified)
    """
    
    # work on an independent copy with a clean 0..n-1 index, so the caller's
    # dataframe is never touched and no chained-assignment warning is raised
    new_data = data[['date', 'transferred', 'sales']].reset_index(drop = True)
    
    # average sales per date; 'sales' is selected before aggregating so the
    # non-numeric 'transferred' column is never averaged
    daily_avg_sales = new_data.groupby('date')['sales'].mean()
    
    # offsets of 1, 2, ..., num_days days
    time_range = pd.timedelta_range('1 day', periods = num_days)
    
    for step, offset in enumerate(time_range, start = 1):
        # mapping through the date-indexed series looks up each shifted date directly
        new_data[f'-{step}_delta'] = new_data['date'] - offset
        new_data[f'-{step}_delta_sales'] = new_data[f'-{step}_delta'].map(daily_avg_sales)
        new_data[f'+{step}_delta'] = new_data['date'] + offset
        new_data[f'+{step}_delta_sales'] = new_data[f'+{step}_delta'].map(daily_avg_sales)
    
    # element-wise comparison: missing values in 'transferred' count as "not transferred"
    return new_data[new_data['transferred'] == True]
        

In [None]:
def compare_the_means(data):
    """This function will t-test the sales on each transferred holiday against
    the average sales of the surrounding delta dates.
    Input: 
    data -> dataframe (the dataframe we created for delta dates)
    
    Output:
    sales_compare -> dictionary 
    (keyed by holiday date; each value maps a delta sales column to its
    t-test result, keeping only the columns whose p-value is below 0.05)
    
    """
    
    # every sales-related column except the holiday's own 'sales'
    delta_sales_cols = [col for col in data.columns
                        if 'sales' in col and col != 'sales']
    
    sales_compare = {}
    
    for holiday in data['date'].unique():
        # rows belonging to this transferred holiday
        holiday_rows = data[data['date'] == holiday]
        significant = {}
        
        for col in delta_sales_cols:
            outcome = ttest_ind(holiday_rows['sales'],
                                holiday_rows[col].dropna())
            
            # the p-value is the last element of the result
            if outcome[-1] < 0.05:
                significant[col] = outcome
            sales_compare[holiday] = significant
    
    return sales_compare

In [None]:
def find_delta_date(data, days):
    
    """This function will find time delta for specific date.
    For each offset in `days` a 'delta_<offset>' column is added: on rows where
    'transferred' is True it holds date + offset days, on all other rows it
    holds the unchanged date.
    
    Input:
    data -> dataframe (the original dataframe with 'date' and 'transferred' columns)
    days -> list (list of day offsets, positive or negative)
    
    Output:
    data -> dataframe (the same dataframe with one 'delta_<offset>' column per offset)"""
    
    # element-wise comparison on purpose: missing values must count as "not transferred"
    is_transferred = data['transferred'] == True
    
    for day in days:
        # shift the whole column at once instead of applying a lambda row by row
        shifted = data['date'] + timedelta(days = day)
        data[f"delta_{day}"] = shifted.where(is_transferred, data['date'])
     
    return data

In [None]:
def compare_dates_for_delta(data):
    
    """This function will compare the actual date and delta dates for those transfer holidays.
    For every column whose name contains 'delta' (except 'payday_delta') it
    collects the delta dates that differ from the row's own date and adds an
    'is_<column>' flag that is 1 when the row's date is one of them.
    
    Input:
    data -> dataframe (the original dataframe with 'date' and delta columns)
    
    Output:
    data -> dataframe (the same dataframe after identifying deltas)"""
    
    delta_features = [col for col in data.columns if 'delta' in col]
    
    for col in delta_features:
        # 'payday_delta' is a time difference, not a shifted date
        if col == 'payday_delta':
            continue
        
        # dates that were actually shifted away from the original date
        shifted_dates = data.loc[data['date'] != data[col], col].unique()
        # vectorised membership test instead of scanning a list once per row
        data[f"is_{col}"] = data['date'].isin(shifted_dates).astype(int)
    
    return data

In [None]:
def find_christmas_sales(christmas_list):
    
    """This function will build the pre-Christmas shopping window for each Christmas.
    
    Input:
    christmas_list -> list (a list of Christmas dates in different years)
    
    Output:
    date_list -> list (for every Christmas, the five days before it, from
    5 days before up to 1 day before, in that order)"""
    
    # -5, -4, -3, -2, -1 days relative to Christmas
    day_offsets = range(-5, 0)
    # parse everything first so a bad value fails before any date is built
    parsed_days = [pd.to_datetime(day) for day in christmas_list]
    
    return [christmas + timedelta(days = offset)
            for christmas in parsed_days
            for offset in day_offsets]

### Check dataframe

In [None]:
# count the nulls once and show only the columns that actually have any
null_counts = train_df.isnull().sum()
null_counts[null_counts != 0]

Some columns have missing values. This is probably a side effect of the left joins used to build the dataframe: rows without a match on the right-hand side get nulls. Let's take a look at the dataframe.

### Feature Engineering

We will create more columns after finishing our EDA, but for now, we can create a few columns from the original columns.

We can change the columns to more proper data types.

###### date

In [None]:
# Assign the converted column by name rather than through ``.loc[:, 'date']``:
# ``.loc`` writes into the existing column and may keep its object dtype,
# in which case the ``.dt`` accessor is not available on the column.
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

Change the data type for date from object to date. This will allow us to manipulate data more easily. 

getting year, quarter, month, and days from the date column.

- date manipulation

In [None]:
train_df.loc[:, 'date_year'] = train_df.date.dt.year
train_df.loc[:, 'date_quarter'] = train_df.date.dt.quarter
train_df.loc[:, 'date_month'] = train_df.date.dt.month
train_df.loc[:, 'date_day'] = train_df.date.dt.day
# ``Series.dt.week`` is deprecated and removed in pandas 2; the ISO calendar week is the
# same value. Cast back to a plain integer column (assumes there are no missing dates).
train_df.loc[:, 'date_week'] = train_df.date.dt.isocalendar().week.astype('int64')
train_df.loc[:, 'date_day_name'] = train_df.date.dt.day_name()

In [None]:
test_df.loc[:, 'date_year'] = test_df.date.dt.year
test_df.loc[:, 'date_quarter'] = test_df.date.dt.quarter
test_df.loc[:, 'date_month'] = test_df.date.dt.month
test_df.loc[:, 'date_day'] = test_df.date.dt.day
# ``Series.dt.week`` is deprecated and removed in pandas 2; the ISO calendar week is the
# same value. Cast back to a plain integer column (assumes there are no missing dates).
test_df.loc[:, 'date_week'] = test_df.date.dt.isocalendar().week.astype('int64')
test_df.loc[:, 'date_day_name'] = test_df.date.dt.day_name()

In [None]:
train_df.loc[:, 'year_month'] = train_df.date.apply(lambda x: str(x)[:7])
test_df.loc[:, 'year_month'] = test_df.date.apply(lambda x: str(x)[:7])

Getting information from those individual dates including year, quarter, month, day and name of the days. Keep in mind that in the data description section, people get paid on the 15th and the last day of the month. Maybe we can check those days or the day after to see if there is a seasonality. We can also group Friday, Saturday, and Sunday as Weekend, and put others as Weekday. Let's create two columns from the dates.

In [None]:
plt.figure(figsize = (12, 6))
plt.title('Historical Sales Data', fontsize = 18)
sns.lineplot(x = 'date',
             y = 'sales',
             data = train_df)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
plt.xlabel('')
plt.ylabel('')
plt.axhline(np.mean(train_df['sales']), color = 'red');

The red line shows the average sales throughout the years in the dataset. There seems to be a trend in the dataset. In the year of 2013, the company did not seem to perform that well, but the company performs better starting from the year of 2015. Looks like there is an increase at the end of years and a decrease in the beginning of years. Therefore, let's take a closer look in terms of months.

In [None]:
plt.figure(figsize = (15, 10))
plt.title("Sales by Quarter", fontsize = 18)
sns.barplot(x = 'date_quarter',
            hue = 'date_year',
            color = 'blue',
            alpha = 0.7,
            y = 'sales',
            data = train_df)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
plt.xlabel('')
plt.ylabel('')
plt.axhline(np.mean(train_df['sales']), color = 'red');

In [None]:
plt.figure(figsize = (18, 10))
plt.title('Historical Sales Data', fontsize = 18)
sns.lineplot(x = 'year_month',
             y = 'sales',
             data = train_df)
plt.xticks(fontsize = 12, rotation = 270)
plt.yticks(fontsize = 12)
plt.xlabel('')
plt.ylabel('')
plt.axhline(np.mean(train_df['sales']), color = 'red');

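Plot oil price against sales using a scatterplot.

Is correlation a good measure for this data? Pearson correlation only captures linear relationships, so the relationship needs to be roughly linear for it to be meaningful.

The first thing to do is to plot the data before looking into the correlation.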

In [None]:
# restrict to numeric/boolean columns explicitly: the frame also holds text and
# date columns, which newer pandas refuses to correlate
train_df.select_dtypes(include = ['number', 'bool']).corr()

In [None]:
# select 'sales' before aggregating so the text columns are never averaged
train_df.groupby('date_month')[['sales']].mean().sort_values(by = 'sales', ascending = False)

- weekend

In [None]:
train_df.loc[:, 'is_weekend'] = train_df.date_day_name.apply(group_weekend)
test_df.loc[:, 'is_weekend'] = test_df.date_day_name.apply(group_weekend)

Find weekend days. If weekend, then 1 else 0. Please refer to the function in the helper section for detailed description. 

check if there is a difference between weekdays and weekends.

In [None]:
# select 'sales' before aggregating so the text columns are never averaged
train_df.groupby('is_weekend')[['sales']].mean()

In [None]:
ttest_ind(train_df[train_df.is_weekend == 1]['sales'],
          train_df[train_df.is_weekend == 0]['sales'])

Based on the test, there is a difference between the weekdays and weekends.

In [None]:
train_plot = train_df.copy()

In [None]:
train_plot.loc[:, 'to_actual_weekends'] = train_plot.is_weekend.apply(lambda x: 'weekend' if x == 1 else 'weekdays')

In [None]:
plt.figure(figsize = (15, 10))
plt.title("Sales: Weekdays vs Weekend", fontsize = 18)
sns.barplot(x = 'to_actual_weekends',
            color = 'blue',
            alpha = 0.7,
            y = 'sales',
            data = train_plot)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
plt.xlabel('')
plt.ylabel('')
plt.axhline(np.mean(train_df['sales']), color = 'red');

- payday

In [None]:
train_df.loc[:, 'last_day'] = find_the_last_day(train_df)
test_df.loc[:, 'last_day'] = find_the_last_day(test_df)

Find the last day of each month first. 

In [None]:
train_df.loc[:, 'is_payday'] = np.where((train_df['date_day'] == 15) | (train_df['date_day'] == train_df['last_day']), 1, 0)

In [None]:
test_df.loc[:, 'is_payday'] = np.where((test_df['date_day'] == 15) | (test_df['date_day'] == test_df['last_day']), 1, 0)

Getting the payday using the last day or the 15th.

In [None]:
# select 'sales' before aggregating so the text columns are never averaged
train_df.groupby('is_payday')[['sales']].mean()

In [None]:
ttest_ind(train_df[train_df.is_payday == 1]['sales'],
          train_df[train_df.is_payday == 0]['sales'])

It looks like there is no difference between paydays and other days. Instead, we can look at the total sales and try to capture the range of days after each payday.

Let's take the payday within the partition and try to see if sales within a week timeframe would be different.

In [None]:
# keep the date on payday rows and leave every other row missing,
# without applying a Python lambda to every single row
train_df['actual_payday'] = train_df['date'].where(train_df['is_payday'] == 1)
test_df['actual_payday'] = test_df['date'].where(test_df['is_payday'] == 1)

In [None]:
# carry the most recent payday forward; assign the result back instead of
# filling in place on an attribute-accessed column, which is not guaranteed
# to write through to the dataframe
train_df['actual_payday'] = train_df['actual_payday'].ffill()
test_df['actual_payday'] = test_df['actual_payday'].ffill()

Even after forward-filling the actual payday, there are still missing values. This is because the first payday partition starts on January 1st, so those rows have no earlier payday to carry forward. We can impute those missing values as 2012-12-31. For the test dataset, we can impute 2017-08-15.

In [None]:
train_df.loc[:, 'actual_payday'] = train_df.actual_payday.fillna(pd.to_datetime('2012-12-31'))
test_df.loc[:, 'actual_payday'] = test_df.actual_payday.fillna(pd.to_datetime('2017-08-15'))

In [None]:
train_df.loc[:, 'payday_delta'] = train_df.date - train_df.actual_payday
test_df.loc[:, 'payday_delta'] = test_df.date - test_df.actual_payday

The payday delta should range from 0 days (the date when people get paid) to 15 days (just before the next paycheck, since people get paid twice a month in Ecuador). The data manipulation makes sense.

In [None]:
# select 'sales' before aggregating so the text columns are never averaged
train_df.groupby('payday_delta')[['sales']].mean()

It looks like different days have different sales, but how can we confirm that? Let's use ANOVA to see if there are any differences between the groups.

In [None]:
res = stat()
res.anova_stat(df = train_df, 
               res_var = 'sales', 
               anova_model = 'sales ~ C(payday_delta)')
res.anova_summary

Looks like there might be at least one difference between these groups. 

Using the summary statistics, let's get the average value of these dates, and divide them into 3 bins.

In [None]:
# summary statistics of the per-delta average sales
# (select 'sales' before aggregating so the text columns are never averaged)
train_df.groupby('payday_delta')[['sales']].mean().describe()

Now that we have the average sales for each payday delta, let's use the summary statistics to find a cut-off. Let's use the median (50%) as our cut-off.

In [None]:
# median of the per-delta average sales (the '50%' row of describe())
sales_median = train_df.groupby('payday_delta')['sales'].mean().median()

In [None]:
# True for the payday deltas whose average sales exceed the median
# (select 'sales' before aggregating so the text columns are never averaged)
payday_mapper = train_df.groupby('payday_delta')[['sales']].mean() > sales_median

In [None]:
payday_mapper.reset_index(inplace = True)

In [None]:
payday_mapper.rename(columns= {'sales':'is_above_median'}, inplace = True)

Based on this observation, it looks like people normally do their grocery shopping within 7 days after they get paid.

In [None]:
delta_list = list(payday_mapper[payday_mapper.is_above_median == True]['payday_delta'])

In [None]:
# vectorised membership test instead of scanning the list once per row
train_df['is_above_median'] = train_df['payday_delta'].isin(delta_list).astype(int)
test_df['is_above_median'] = test_df['payday_delta'].isin(delta_list).astype(int)

###### store_nbr

In [None]:
# average sales per store, highest first
# (select 'sales' before aggregating so the text columns are never averaged)
store_sales_avg = train_df.groupby('store_nbr')[['sales']].mean().sort_values(by = 'sales', ascending = False)

In [None]:
store_sales_avg.reset_index(inplace = True)

In [None]:
plt.figure(figsize = (18, 8))
plt.title('Sales Based on Stores', fontsize = 20)
store_avg_bar = sns.barplot(x = 'store_nbr', 
            y = 'sales',
            color = 'blue',
            alpha = 0.6,
            data = store_sales_avg)
plt.xticks(fontsize = 20, rotation = 270)
plt.yticks(fontsize = 20)
plt.xlabel('')
plt.ylabel('')
# reference line at the mean of the per-store averages; take the mean of the
# 'sales' column directly instead of averaging the whole dataframe and indexing it
store_avg_bar.axhline(store_sales_avg['sales'].mean(), linewidth = 3, color = 'red');

The plot above shows the average of sales based on different stores. Using the summary statistics, let's re group them based on their sales. We can use this summary to put stores into different bins.

In [None]:
# 25th and 75th percentiles of the per-store average sales
first_q, third_q = store_sales_avg['sales'].quantile([0.25, 0.75])

In [None]:
store_sales_avg.loc[:, 'store_sales_bins'] = store_sales_avg.sales.apply(lambda x: 'low' if x < first_q else ('avg' if x < third_q else 'high'))

Using this information, let's regroup stores.

In [None]:
train_df = train_df.merge(store_sales_avg[['store_nbr', 'store_sales_bins']],
               left_on = 'store_nbr',
               right_on = 'store_nbr',
               how = 'left')

test_df = test_df.merge(store_sales_avg[['store_nbr', 'store_sales_bins']],
               left_on = 'store_nbr',
               right_on = 'store_nbr',
               how = 'left')

###### family

Similarly, we can apply the same logic that we created in the store_nbr to the family column. We will use 25% and 75% percentile.

In [None]:
# average sales per product family, highest first
# (select 'sales' before aggregating so the text columns are never averaged)
family_avg_sales = train_df.groupby('family')[['sales']].mean().sort_values(by = 'sales', ascending = False)

In [None]:
family_avg_sales.reset_index(inplace = True)

In [None]:
# 25th and 75th percentiles of the per-family average sales
family_first_q, family_third_q = family_avg_sales['sales'].quantile([0.25, 0.75])

In [None]:
family_avg_sales.loc[:, 'family_sales_bins'] = family_avg_sales.sales.apply(lambda x: 'low' if x < family_first_q else ('avg' if x < family_third_q else 'high'))

In [None]:
train_df = train_df.merge(family_avg_sales[['family', 'family_sales_bins']],
               left_on = 'family',
               right_on = 'family',
               how = 'left')

test_df = test_df.merge(family_avg_sales[['family', 'family_sales_bins']],
               left_on = 'family',
               right_on = 'family',
               how = 'left')

###### onpromotion

This feature presents the number of total items that were on promotion on a given date. We can use state, city, family, and the store numbers to find out the summary statistics for average promoted items.

In [None]:
# average number of promoted items per state / city / family / store
# (select 'onpromotion' before aggregating so the text columns are never averaged)
onpromo_avg = train_df.groupby(['state', 'city', 'family', 'store_nbr'])[['onpromotion']].mean()

In [None]:
onpromo_avg.reset_index(inplace = True)

In [None]:
onpromo_avg.loc[:, 'unique_key'] = onpromo_avg.state.apply(lambda x: str(x).lower().strip()) + '-' + onpromo_avg.city.apply(lambda x: str(x).lower().strip()) + '-' + onpromo_avg.family.apply(lambda x: str(x).lower().strip()) + '-' + onpromo_avg.store_nbr.apply(lambda x: str(x))

In [None]:
train_df.loc[:, 'unique_key'] = train_df.state.apply(lambda x: str(x).lower().strip()) + '-' + train_df.city.apply(lambda x: str(x).lower().strip()) + '-' + train_df.family.apply(lambda x: str(x).lower().strip()) + '-' + train_df.store_nbr.apply(lambda x: str(x))
test_df.loc[:, 'unique_key'] = test_df.state.apply(lambda x: str(x).lower().strip()) + '-' + test_df.city.apply(lambda x: str(x).lower().strip()) + '-' + test_df.family.apply(lambda x: str(x).lower().strip()) + '-' + test_df.store_nbr.apply(lambda x: str(x))

In [None]:
onpromo_avg.rename(columns= {'onpromotion':'onpromotion_avg'}, inplace = True)

In [None]:
train_df = train_df.merge(onpromo_avg[['unique_key', 'onpromotion_avg']],
               left_on = 'unique_key',
               right_on = 'unique_key',
               how = 'left')

test_df = test_df.merge(onpromo_avg[['unique_key', 'onpromotion_avg']],
               left_on = 'unique_key',
               right_on = 'unique_key',
               how = 'left')

In [None]:
# vectorised comparison instead of a row-by-row lambda; rows where either value
# is missing compare as False and therefore fall into 'lower_than_avg'
train_df['onpromo_avg_bins'] = np.where(train_df['onpromotion'] > train_df['onpromotion_avg'], 'higher_than_avg', 'lower_than_avg')
test_df['onpromo_avg_bins'] = np.where(test_df['onpromotion'] > test_df['onpromotion_avg'], 'higher_than_avg', 'lower_than_avg')

Using information above, we can recode the values. 

###### dcoilwtico

In [None]:
plt.figure(figsize = (10, 8))
plt.title("Oil Price Change in Ecuador", fontsize = 16)
oil_line = sns.lineplot(x = 'date',
             y = 'dcoilwtico',
             color = 'blue',
             alpha = 0.7,
             data = train_df)
plt.xticks(fontsize = 16)
plt.yticks(fontsize = 16)
plt.xlabel('')
plt.ylabel('')
oil_line.axhline(np.mean(train_df['dcoilwtico']), linewidth = 3, color = 'red');

In [None]:
train_df.dcoilwtico.isnull().sum()

In [None]:
test_df.dcoilwtico.isnull().sum()

In [None]:
# carry the last known oil price forward; assign the result back instead of
# filling in place on an attribute-accessed column
train_df['dcoilwtico'] = train_df['dcoilwtico'].ffill()

In [None]:
# carry the last known oil price forward; assign the result back instead of
# filling in place on an attribute-accessed column
test_df['dcoilwtico'] = test_df['dcoilwtico'].ffill()

fill out missing values

In [None]:
oil_price = pd.read_csv('../output/complete_oil.csv')

In [None]:
oil_price.loc[:, 'previous_price'] = oil_price.dcoilwtico.shift(1)

In [None]:
# the price decreased when today's price is LOWER than the previous one (the
# comparison used to be inverted); the first row has no previous price and
# falls into 'increased_or_same'
oil_price['price_indicator'] = np.where(oil_price['dcoilwtico'] < oil_price['previous_price'], 'decreased', 'increased_or_same')

In [None]:
oil_price.loc[:, 'year'] = oil_price.date.apply(lambda x: x[:4])

In [None]:
oil_summary_stat = oil_price.groupby('year').describe()[['dcoilwtico']].reset_index()

In [None]:
# map each year to its mean oil price, taken from the yearly summary table
unique_year = list(oil_summary_stat.year.values)
yearly_means = oil_summary_stat[('dcoilwtico',   'mean')].values

oil_dict = dict(zip(unique_year, yearly_means))

This dictionary maps each year to its average oil price.

1. bring the price indicator for the gas price
2. using the average gas price

In [None]:
train_df.loc[:, 'date_join'] = train_df.date.apply(lambda x: str(x)[:10])
test_df.loc[:, 'date_join'] = test_df.date.apply(lambda x: str(x)[:10])

In [None]:
train_df = train_df.merge(oil_price[['date','price_indicator']], 
               left_on = 'date_join',
               right_on = 'date',
               how = 'left')


test_df = test_df.merge(oil_price[['date','price_indicator']], 
               left_on = 'date_join',
               right_on = 'date',
               how = 'left')

In [None]:
# dates without an oil record get the default label; assign the result back
# instead of filling in place on an attribute-accessed column
train_df['price_indicator'] = train_df['price_indicator'].fillna('increased_or_same')
test_df['price_indicator'] = test_df['price_indicator'].fillna('increased_or_same')

Find the indicator for the oil price based on the previous price.

In [None]:
year_avg_price = pd.DataFrame(oil_dict.items(), columns = ['years', 'avg_oil_price'] )

In [None]:
year_avg_price.years = year_avg_price.years.apply(lambda x: int(x))

Get the mean oil price for each year.

In [None]:
# use an explicit left join like every other merge here: the default inner
# join would silently drop rows whose year has no oil average
train_df = train_df.merge(year_avg_price,
               left_on= 'date_year',
               right_on= 'years',
               how = 'left')

test_df = test_df.merge(year_avg_price,
               left_on= 'date_year',
               right_on= 'years',
               how = 'left')

In [None]:
# vectorised comparison instead of a row-by-row lambda; missing values compare as False -> 0
train_df['is_higher_than_avg_oil_price'] = (train_df['dcoilwtico'] > train_df['avg_oil_price']).astype(int)
test_df['is_higher_than_avg_oil_price'] = (test_df['dcoilwtico'] > test_df['avg_oil_price']).astype(int)

- separate the years

Based on the plot above, it looks like the oil price follows different patterns before and after 2015. Therefore, let's separate these two periods so that the model can distinguish between them.

find the difference for month to month

In [None]:
oil_total_avg = np.mean(oil_price.dcoilwtico)

In [None]:
# average oil price per year-month
# (select 'dcoilwtico' before aggregating so the text columns are never averaged)
oil_price_avg_by_month = train_df.groupby('year_month')[['dcoilwtico']].mean()

In [None]:
oil_price_avg_by_month[oil_price_avg_by_month.dcoilwtico < oil_total_avg]

It looks like the price of oil dropped dramatically from 2014-12 onwards. Let's split the dataset at that point.

In [None]:
train_df.rename(columns= {'date_x':'date'}, inplace = True)
test_df.rename(columns= {'date_x':'date'}, inplace = True)

In [None]:
train_df.drop(columns= ['date_y', 'date_join'], inplace = True)
test_df.drop(columns= ['date_y', 'date_join'], inplace = True)

In [None]:
# parse the cut-off once and compare the whole column at once
# (1 from 2014-12-01 onwards, 0 before)
oil_drop_start = pd.to_datetime('2014-12-01')
train_df['is_after_2014-12'] = (train_df['date'] >= oil_drop_start).astype(int)
test_df['is_after_2014-12'] = (test_df['date'] >= oil_drop_start).astype(int)

###### Holiday

- holiday missing values

There are some missing values in holiday related features. This is because

1. Null values in holiday_counts happen because the given dates are not holidays. Therefore, we can impute those missing values as 0.

2. Null values in is_multiple also happen for the same reason. We can also impute those values as 0.

In [None]:
# non-holiday dates have no holiday information, so treat them as 0; assign the
# result back instead of filling in place on an attribute-accessed column
train_df['holiday_counts'] = train_df['holiday_counts'].fillna(0)
test_df['holiday_counts'] = test_df['holiday_counts'].fillna(0)

train_df['is_multiple'] = train_df['is_multiple'].fillna(0)
test_df['is_multiple'] = test_df['is_multiple'].fillna(0)

In [None]:
train_plot = train_df.copy()

In [None]:
# same imputation for the plotting copy; assign the result back instead of
# filling in place on an attribute-accessed column
train_plot['holiday_counts'] = train_plot['holiday_counts'].fillna(0)
test_df['holiday_counts'] = test_df['holiday_counts'].fillna(0)

train_plot['is_multiple'] = train_plot['is_multiple'].fillna(0)
test_df['is_multiple'] = test_df['is_multiple'].fillna(0)

In [None]:
train_plot.loc[:, 'mults'] = train_plot.is_multiple.apply(lambda x: 'multiple' if x == 1 else 'N/A')

In [None]:
plt.figure(figsize = (15, 10))
plt.title("Sales: Multiple Holidays vs Regular Days", fontsize = 18)
sns.barplot(x = 'mults',
            color = 'blue',
            alpha = 0.7,
            y = 'sales',
            data = train_plot)
plt.xticks(fontsize = 18)
plt.yticks(fontsize = 18)
plt.xlabel('')
plt.ylabel('')
plt.axhline(np.mean(train_df['sales']), color = 'red');

- transferred

using the function we create above, let's find out the different time frames and the sales difference.

In [None]:
transfer_holiday = pd.read_pickle('../asset/transfer_holidays.pkl')

In [None]:
train_df = train_df.merge(transfer_holiday[['date', 'transferred']], left_on= 'date', right_on= 'date', how = 'left')
test_df = test_df.merge(transfer_holiday[['date', 'transferred']], left_on= 'date', right_on= 'date', how = 'left')

In [None]:
transferred = find_the_time_range(train_df, 5)

In [None]:
pvalues = compare_the_means(transferred)

In [None]:
# count, over all holidays, how often each delta column was significant
delta_dict = {}

for significant in pvalues.values():
    for feature_name in significant:
        delta_dict[feature_name] = delta_dict.get(feature_name, 0) + 1

In [None]:
sorted_delta = dict(sorted(delta_dict.items(), key=operator.itemgetter(1),reverse=True))

In [None]:
np.mean(list(sorted_delta.values()))

In [None]:
# keep only the delta columns that were significant for more than 6 holidays
high_delta = {name: count for name, count in delta_dict.items() if count > 6}

In [None]:
high_delta

Let's use this information for the transferred holidays: if `transferred == True`, then the dates 3 and 4 days before and after the holiday will have higher sales.

In [None]:
delta_list = [-4, -3, 3, 4]

train_df = find_delta_date(train_df, delta_list)
test_df = find_delta_date(test_df, delta_list)

In [None]:
train_df = compare_dates_for_delta(train_df)
test_df = compare_dates_for_delta(test_df)

- christmas sales

In [None]:
christmas_season = ['2013-12', '2014-12', '2015-12', '2016-12']

for date in christmas_season:
    plt.figure(figsize = (12, 8))
    plt.title(f"Christmas Sales in {date}", fontsize = 15)
    sns.lineplot(x = 'date',
                 y = 'sales',
                 data = train_df[train_df.year_month == date] )
    plt.xticks(fontsize = 15, rotation = 270)
    plt.yticks(fontsize = 15)
    plt.xlabel('')
    plt.ylabel('')
    plt.axhline(np.mean(train_df[train_df.year_month == date]['sales']), color = 'red')

Looks like around Christmas seasons, there is an increase in terms of the total sales. We can look into dates before Christmas since the stores are closed on the day of Christmas.  

In [None]:
christmas_list = ['2013-12-25', '2014-12-25', '2015-12-25', '2016-12-25']

In [None]:
total_christmas = find_christmas_sales(christmas_list)

Using the function above, we can find the range of Christmas season.

In [None]:
# vectorised membership test instead of scanning the list once per row
train_df['christmas_sales_season'] = train_df['date'].isin(total_christmas).astype(int)
test_df['christmas_sales_season'] = test_df['date'].isin(total_christmas).astype(int)

Assign 1 if those dates are within the range of Christmas. 

###### city

Both city and state are geolocation features and might carry similar information. Let's use the average sales to determine whether we can distinguish between low-sales and high-sales regions.

In [None]:
# select 'sales' before aggregating so the text columns are never averaged
train_df.groupby(['state','city'])[['sales']].mean().sort_values(by = 'sales', ascending = False)

Based on the table above, it looks like we can use the state information alone rather than the city, since the average is consistent throughout each region. In addition, there are not many cities under each state, so we can use state instead.

###### state

In [None]:
# average sales per state, highest first
# (select 'sales' before aggregating so the text columns are never averaged)
state_sales_summary = train_df.groupby(['state'])[['sales']].mean().sort_values(by = 'sales', ascending = False)

In [None]:
# 25th and 75th percentiles of the per-state average sales
state_sales_1st, state_sales_3rd = state_sales_summary['sales'].quantile([0.25, 0.75])

In [None]:
state_sales_summary.loc[:, 'state_sales_cut'] = state_sales_summary.sales.apply(lambda x: 'low' if x < state_sales_1st else ('med' if x < state_sales_3rd else 'high'))

In [None]:
state_sales_summary = state_sales_summary.reset_index()

In [None]:
train_df = train_df.merge(state_sales_summary[['state', 'state_sales_cut']], 
               left_on = 'state',
               right_on = 'state',
               how = 'left')


test_df = test_df.merge(state_sales_summary[['state', 'state_sales_cut']], 
               left_on = 'state',
               right_on = 'state',
               how = 'left')

###### store_type

In [None]:
# average sales per store type
# (select 'sales' before aggregating so the text columns are never averaged)
store_type_summary = train_df.groupby('type')[['sales']].mean()

In [None]:
store_type_summary.sort_values(by = 'sales', ascending = False)

In [None]:
plt.figure(figsize = (18, 8))
plt.title('Sales Based on Store Types', fontsize = 20)
store_avg_bar = sns.barplot(x = store_type_summary.index, 
            y = 'sales',
            color = 'blue',
            alpha = 0.6,
            data = store_type_summary)
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('')
plt.ylabel('')
# reference line at the mean of the per-type averages; take the mean of the
# 'sales' column directly instead of averaging the whole dataframe and indexing it
store_avg_bar.axhline(store_type_summary['sales'].mean(), linewidth = 3, color = 'red');

looks like variation is high for the store type A, so we can encode the store type A as high.

In [None]:
train_df.loc[:, 'store_type_sales'] = train_df.type.apply(lambda x: 'high' if x == 'A' else 'low')
test_df.loc[:, 'store_type_sales'] = test_df.type.apply(lambda x: 'high' if x == 'A' else 'low')

###### cluster

In [None]:
# average sales per cluster, highest first
# (select 'sales' before aggregating so the text columns are never averaged)
cluster_sales = train_df.groupby('cluster')[['sales']].mean().sort_values(by = 'sales', ascending = False)

In [None]:
cluster_sales_avg = cluster_sales.describe().T['mean'].values[0]

In [None]:
cluster_sales.loc[:, 'cluster_sales_indicator'] = cluster_sales.sales.apply(lambda x: 'higher_than_avg' if x > cluster_sales_avg else 'lower_than_avg')

In [None]:
cluster_sales.reset_index(inplace = True)

In [None]:
plt.figure(figsize = (18, 8))
plt.title('Sales Based on Clusters', fontsize = 20)
store_avg_bar = sns.barplot(x = 'cluster', 
            y = 'sales',
            color = 'blue',
            alpha = 0.6,
            data = cluster_sales)
plt.xticks(fontsize = 20)
plt.yticks(fontsize = 20)
plt.xlabel('')
plt.ylabel('')
# reference line at the mean of the per-cluster averages; the frame also holds a
# text indicator column, so take the mean of the 'sales' column only
store_avg_bar.axhline(cluster_sales['sales'].mean(), linewidth = 3, color = 'red');

In [None]:
train_df = train_df.merge(cluster_sales[['cluster', 'cluster_sales_indicator']], 
               left_on = 'cluster',
               right_on = 'cluster',
               how = 'left')


test_df = test_df.merge(cluster_sales[['cluster', 'cluster_sales_indicator']], 
               left_on = 'cluster',
               right_on = 'cluster',
               how = 'left')

### Export the dataframe

In [None]:
export_path = '../asset/'

# write both engineered frames to the asset folder as pickles
for file_name, frame in (('train_df.pkl', train_df), ('test_df.pkl', test_df)):
    with open(export_path + file_name, 'wb') as f:
        pickle.dump(frame, f)