In [2]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt 
import plotly.io as pio
import plotly.express as px
import plotly.graph_objs as go
from tqdm import tqdm

import os 
from pathlib import Path
import zipfile

In [3]:
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [4]:
# Default location of the competition archive and the folder it is unpacked into.
# NOTE(review): download_dataset() assigns its own local zip_path/extract_dir, so within the
# visible cells these module-level values are never read — confirm before relying on them.
zip_path = "./walmart-recruiting-store-sales-forecasting.zip"
extract_dir = "./walmart_dataset"

def extract_dataset(zip_path: str, extract_dir: str):
    """Unpack the archive at `zip_path` into `extract_dir`.

    Args:
        zip_path: Path of the zip archive to unpack.
        extract_dir: Destination folder; it is created when it does not exist yet.

    Returns:
        None in every case. A missing archive is only reported on stdout and
        nothing is created or extracted.
    """
    archive_found = os.path.exists(zip_path)
    if not archive_found:
        # Nothing to unpack: report it and leave the filesystem untouched.
        print(f"El archivo {zip_path} no existe.")
        return None

    destination_found = os.path.exists(extract_dir)
    if not destination_found:
        print(f"La carpeta de destino {extract_dir} no existe. Creando la carpeta... 🫡")
        os.makedirs(extract_dir, exist_ok=True)

    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extract_dir)

    print(f"Archivo extraído en: {extract_dir} 🫣")

In [5]:
# Unpack the zip archives and load the CSV files
def download_dataset():
    """Extract the Walmart archives and load them as DataFrames sorted by date.

    The outer competition archive is unpacked first, then each inner
    ``*.csv.zip`` it contains. The ``Date`` column of the features, train and
    test frames is parsed to datetime and each of those frames is sorted by it.
    The test frame additionally receives a placeholder ``Weekly_Sales`` column
    filled with 0.

    Returns:
        tuple: ``(features_df, stores_df, test_df, train_df)``. When any step
        raises, the error is reported on stdout and ``(None, None, None, None)``
        is returned instead.
    """
    def _parse_and_sort_dates(df, columna_date):
        """Parse `columna_date` as datetime and return the frame sorted by it."""
        df[columna_date] = pd.to_datetime(df[columna_date], yearfirst=True)
        return df.sort_values(columna_date, ascending=True)

    extract_dir = "./walmart_dataset"
    # Order matters: the outer archive contains the inner per-file archives.
    archives = [
        "./walmart-recruiting-store-sales-forecasting.zip",
        "./walmart_dataset/features.csv.zip",
        "./walmart_dataset/sampleSubmission.csv.zip",
        "./walmart_dataset/test.csv.zip",
        "./walmart_dataset/train.csv.zip",
    ]
    try:
        for zip_path in archives:
            extract_dataset(zip_path, extract_dir)

        # Load the CSV files.
        # NOTE(review): date_format=False looks inert because parse_dates is not set
        # (dates are parsed explicitly below) — confirm and drop it.
        features_df = pd.read_csv("./walmart_dataset/features.csv", date_format=False)
        stores_df = pd.read_csv("./walmart_dataset/stores.csv", date_format=False)
        test_df = pd.read_csv("./walmart_dataset/test.csv", date_format=False)
        train_df = pd.read_csv("./walmart_dataset/train.csv", date_format=False)
        print("dataset descargado con éxito :)")

        train_df = _parse_and_sort_dates(train_df, "Date")
        test_df = _parse_and_sort_dates(test_df, "Date")
        features_df = _parse_and_sort_dates(features_df, "Date")
        # Placeholder target so the test frame has the same columns as the train frame.
        test_df["Weekly_Sales"] = 0
        print("dataset formateado :)")
        return features_df, stores_df, test_df, train_df
    except Exception as e:
        # Keep the notebook running, but surface the cause instead of hiding it.
        print(f"no se pudo descargar :( ({type(e).__name__}: {e})")
        return None, None, None, None

In [6]:
# Load the four frames used by the rest of the notebook; download_dataset()
# returns None for each of them when loading fails.
features_df, stores_df, test_df, train_df = download_dataset()

Archivo extraído en: ./walmart_dataset 🫣
Archivo extraído en: ./walmart_dataset 🫣
Archivo extraído en: ./walmart_dataset 🫣
Archivo extraído en: ./walmart_dataset 🫣
Archivo extraído en: ./walmart_dataset 🫣
dataset descargado con éxito :)
dataset formateado :)


In [7]:
def features_info():
    """Show a heading and the ``.info()`` summary for each loaded frame.

    Reads the notebook-level frames ``features_df``, ``stores_df``, ``train_df``
    and ``test_df`` and uses IPython's ``display`` for the headings.

    Returns:
        str: the literal ``"all!"`` once every summary has been printed.
    """
    sections = [
        ("Información de features: (^◕.◕^) 😼", features_df),
        ("Información de stores: ( •̀ ω •́ )✧ 🫡", stores_df),
        ("Información de train: ヾ(•ω•`)o 🥺", train_df),
        ("Información de test: ╰(*°▽°*)╯ 🫣", test_df),
    ]
    for heading, frame in sections:
        display(heading)
        # DataFrame.info() prints its report and returns None, so wrapping it in
        # display() only added a stray "None" after every summary.
        frame.info()
    return "all!"

In [8]:
# Print the .info() summary of the features, stores, train and test frames.
features_info()

'Información de features: (^◕.◕^) 😼'

<class 'pandas.core.frame.DataFrame'>
Index: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         8190 non-null   int64         
 1   Date          8190 non-null   datetime64[ns]
 2   Temperature   8190 non-null   float64       
 3   Fuel_Price    8190 non-null   float64       
 4   MarkDown1     4032 non-null   float64       
 5   MarkDown2     2921 non-null   float64       
 6   MarkDown3     3613 non-null   float64       
 7   MarkDown4     3464 non-null   float64       
 8   MarkDown5     4050 non-null   float64       
 9   CPI           7605 non-null   float64       
 10  Unemployment  7605 non-null   float64       
 11  IsHoliday     8190 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(9), int64(1)
memory usage: 775.8 KB


None

'Información de stores: ( •̀ ω •́ )✧ 🫡'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Store   45 non-null     int64 
 1   Type    45 non-null     object
 2   Size    45 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ KB


None

'Información de train: ヾ(•ω•`)o 🥺'

<class 'pandas.core.frame.DataFrame'>
Index: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  int64         
 1   Dept          421570 non-null  int64         
 2   Date          421570 non-null  datetime64[ns]
 3   Weekly_Sales  421570 non-null  float64       
 4   IsHoliday     421570 non-null  bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2)
memory usage: 16.5 MB


None

'Información de test: ╰(*°▽°*)╯ 🫣'

<class 'pandas.core.frame.DataFrame'>
Index: 115064 entries, 0 to 115063
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         115064 non-null  int64         
 1   Dept          115064 non-null  int64         
 2   Date          115064 non-null  datetime64[ns]
 3   IsHoliday     115064 non-null  bool          
 4   Weekly_Sales  115064 non-null  int64         
dtypes: bool(1), datetime64[ns](1), int64(3)
memory usage: 4.5 MB


None

'all!'

In [9]:
def merge_data():
    """Stack train and test rows and enrich them with store and feature columns.

    Reads the notebook-level frames ``train_df``, ``test_df``, ``stores_df`` and
    ``features_df``. Store attributes are left-joined on ``Store`` and the
    features on ``Store`` and ``Date``.

    Returns:
        pd.DataFrame: the merged frame sorted by ``Store``, ``Dept`` and ``Date``.
    """
    return (
        pd.concat([train_df, test_df])
        .merge(stores_df, how="left", on="Store")
        .merge(features_df, how="left", on=["Store", "Date"])
        .sort_values(by=["Store", "Dept", "Date"], ascending=True)
    )

In [10]:
def view_data_horizon(data_df: pd.DataFrame, store: int= 1, dept: int= 1, target: str="Weekly_Sales", back_horizon: int= 52):
    """Plot one store/department series of `target` against its own lag.

    Args:
        data_df: Frame with at least ``Store``, ``Dept``, ``Date`` and `target` columns.
        store: Store to plot.
        dept: Department to plot. The default is 1 because the function used to
            ignore both `store` and `dept` and always plotted Store 1 / Dept 1;
            calls that rely on the defaults therefore draw the same series as before.
        target: Column to plot.
        back_horizon: Number of rows (one row per date in the selected series)
            the lagged copy is shifted by.

    Returns:
        None. The figure is shown through plotly.
    """
    selected = (data_df["Store"] == store) & (data_df["Dept"] == dept)
    # Sort chronologically *before* shifting so the lag does not depend on input row order.
    plot_df = data_df.loc[selected].sort_values("Date", ascending=True).copy()
    lag_column = f"lag_{back_horizon}_{target}"
    # A single (store, dept) series remains after filtering, so a plain shift is enough.
    plot_df[lag_column] = plot_df[target].shift(back_horizon)
    plot_df = plot_df.set_index("Date")
    fig = px.line(
        plot_df,
        y=[f"{target}", lag_column],
        title=f"{target} and Lag {back_horizon} Weeks",
    )
    fig.show()

In [11]:
# Weekly_Sales from the training frame against its lag (default back_horizon of 52).
view_data_horizon(train_df, target="Weekly_Sales")

In [12]:
# Build the merged train + test frame (with store and feature columns) and display it.
data = merge_data()
data

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y
0,1,1,2010-02-05,24924.50,False,A,151315,42.31,2.572,,,,,,211.096358,8.106,False
5747,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,,,,,211.242170,8.106,True
6829,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,,,,,211.289143,8.106,False
11698,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,,,,,211.319643,8.106,False
12681,1,1,2010-03-05,21827.90,False,A,151315,46.50,2.625,,,,,,211.350143,8.106,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523896,45,98,2013-06-28,0.00,False,B,118221,76.05,3.639,4842.29,975.03,3.00,2449.97,3169.69,,,False
525567,45,98,2013-07-05,0.00,False,B,118221,77.50,3.614,9090.48,2268.58,582.74,5797.47,1514.93,,,False
527978,45,98,2013-07-12,0.00,False,B,118221,79.37,3.614,3789.94,1827.31,85.72,744.84,2150.36,,,False
532235,45,98,2013-07-19,0.00,False,B,118221,82.84,3.737,2961.49,1047.07,204.19,363.00,1059.46,,,False


In [13]:
# Unemployment from the merged frame against its lag (default back_horizon of 52).
view_data_horizon(data, target="Unemployment")

In [14]:
# CPI from the merged frame against its lag (default back_horizon of 52).
view_data_horizon(data, target="CPI")

In [15]:
# Fuel_Price from the merged frame against its 52-row lag.
view_data_horizon(data, target="Fuel_Price", back_horizon=52)

In [16]:
# Weekly_Sales from the merged frame (train rows plus the zero-filled test rows) against its 52-row lag.
view_data_horizon(data, target="Weekly_Sales", back_horizon=52)

In [17]:
# Temperature from the merged frame against its 52-row lag.
view_data_horizon(data, target="Temperature", back_horizon=52)

In [18]:

def add_memory_feature(
        data_df: pd.DataFrame,
        target: str= "Weekly_Sales",
        columns_id: list[str]= ["Store", "Dept", "Date"],
        back_horizon: int= 52,
        lags: list[int]= [1,2,3],
        aggregation_windows: list[int]= [3,4,5]
    ) -> pd.DataFrame:
    """Build lagged and rolling-window features of `target` per (Store, Dept) series.

    For lag 0 and every value in `lags`, the target is shifted by
    ``back_horizon + lag`` rows inside each (Store, Dept) group. For every
    window in `aggregation_windows`, the sum, mean, median, std, max and min of
    that shifted series are added as well.

    Args:
        data_df: Source frame; must contain `columns_id`, `target`, ``Store`` and ``Dept``.
        target: Column the features are derived from.
        columns_id: Identifier columns kept in the result and used to sort the rows.
        back_horizon: Base number of rows every lag is shifted by.
        lags: Extra shifts added on top of `back_horizon` (lag 0 is always included).
        aggregation_windows: Rolling window sizes, in rows.

    Returns:
        pd.DataFrame: a new frame (the input is not modified) sorted by
        `columns_id`, holding `columns_id`, `target`, the ``lag_{lag}_{target}``
        columns and the ``l_{lag}_w_{w}_{stat}_{target}`` columns.
    """
    group_keys = ["Store", "Dept"]
    statistics = ("sum", "mean", "median", "std", "max", "min")

    data = data_df[columns_id + [target,]].copy()
    data = data.sort_values(by=columns_id, ascending=True)

    # 0-based row position inside each (Store, Dept) series. The rolling windows below run
    # over the whole column, so this is used to blank out windows that would start in the
    # previous series (this happens whenever the window exceeds back_horizon + lag + 1).
    # assumes each series is contiguous after the sort, i.e. columns_id starts with Store, Dept — TODO confirm
    position_in_group = data.groupby(group_keys).cumcount().to_numpy()

    for lag in tqdm([0, ] + list(lags), desc="Adding memory variable 😸"):
        memory = back_horizon + lag
        # Shift once per lag and reuse it for every window and statistic.
        shifted = data.groupby(group_keys)[target].shift(memory)
        data[f"lag_{lag}_{target}"] = shifted
        for w in tqdm(aggregation_windows,desc="Adding window aggregation 🫡"):
            rolling = shifted.rolling(window=w)
            window_inside_group = position_in_group >= (w - 1)
            for stat in statistics:
                data[f"l_{lag}_w_{w}_{stat}_{target}"] = getattr(rolling, stat)().where(window_inside_group)
    return data

In [19]:
# data_df["weekday_name"] = data_df.index.day_name()
# data_df["week_day"] = data_df.index.weekday
# data_df["day"] = data_df.index.day
# data_df["iso_week"] = data_df.index.isocalendar().week 
# data_df["hour"] = data_df.index.hour
# data_df["month"] = data_df.index.month
# data_df["month_name"] = data_df.index.month_name()
# data_df["year"] = data_df.index.year

In [20]:
def add_calendar_features(
        data_df: pd.DataFrame,  
        columns_id: list[str]= ["Store", "Dept", "Date"],
        back_horizon: int = 52
    ) -> pd.DataFrame:
    """Derive calendar columns from ``Date`` for the identifier columns.

    Args:
        data_df: Source frame containing `columns_id`.
        columns_id: Identifier columns to keep; ``Date`` must be one of them.
        back_horizon: Accepted but not used by this function.

    Returns:
        pd.DataFrame: a new frame with `columns_id` plus ``month`` and ``week``
        (ISO week number), sorted by `columns_id`.
    """
    identifiers = data_df[columns_id ].copy()
    date_parts = identifiers["Date"].dt
    with_calendar = identifiers.assign(
        month=date_parts.month,
        week=date_parts.isocalendar().week,
    )
    return with_calendar.sort_values(by=columns_id, ascending=True)

In [21]:
# Preview the calendar features for the merged frame; the result is displayed, not assigned.
add_calendar_features(data)

Unnamed: 0,Store,Dept,Date,month,week
0,1,1,2010-02-05,2,5
5747,1,1,2010-02-12,2,6
6829,1,1,2010-02-19,2,7
11698,1,1,2010-02-26,2,8
12681,1,1,2010-03-05,3,9
...,...,...,...,...,...
523896,45,98,2013-06-28,6,26
525567,45,98,2013-07-05,7,27
527978,45,98,2013-07-12,7,28
532235,45,98,2013-07-19,7,29


In [22]:
def add_fuel_feature(
        data_df: pd.DataFrame,
        target: str= "Fuel_Price",
        columns_id: list[str]= ["Store", "Dept", "Date"],
        back_horizon: int= 52,
        lags: list[int]= [1,2,3],
        aggregation_windows: list[int]= [3,4,5]
    ) -> pd.DataFrame:
    """Build lagged copies of `target` per (Store, Dept) series.

    For lag 0 and every value in `lags`, the target is shifted by
    ``back_horizon + lag`` rows inside each (Store, Dept) group.

    Args:
        data_df: Source frame; must contain `columns_id`, `target`, ``Store`` and ``Dept``.
        target: Column the lags are derived from.
        columns_id: Identifier columns kept in the result and used to sort the rows.
        back_horizon: Base number of rows every lag is shifted by.
        lags: Extra shifts added on top of `back_horizon` (lag 0 is always included).
        aggregation_windows: Accepted for signature parity with add_memory_feature;
            no window aggregates are produced yet.

    Returns:
        pd.DataFrame: a new frame sorted by `columns_id` (original index labels
        are kept) with `columns_id`, `target` and one ``lag_{lag}_{target}``
        column per lag.
    """
    data = data_df[columns_id + [target,]].copy()
    # Row-based shifts are only meaningful on ordered rows; sort as add_memory_feature does.
    data = data.sort_values(by=columns_id, ascending=True)
    for lag in [0, ] + list(lags):
        memory = back_horizon + lag
        data[f"lag_{lag}_{target}"] = data.groupby(["Store", "Dept"])[target].shift(memory)
    return data

In [23]:
# Column dtypes and non-null counts of the merged frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 536634 entries, 0 to 536633
Data columns (total 17 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         536634 non-null  int64         
 1   Dept          536634 non-null  int64         
 2   Date          536634 non-null  datetime64[ns]
 3   Weekly_Sales  536634 non-null  float64       
 4   IsHoliday_x   536634 non-null  bool          
 5   Type          536634 non-null  object        
 6   Size          536634 non-null  int64         
 7   Temperature   536634 non-null  float64       
 8   Fuel_Price    536634 non-null  float64       
 9   MarkDown1     265596 non-null  float64       
 10  MarkDown2     197685 non-null  float64       
 11  MarkDown3     242326 non-null  float64       
 12  MarkDown4     237143 non-null  float64       
 13  MarkDown5     266496 non-null  float64       
 14  CPI           498472 non-null  float64       
 15  Unemployment  498472 n

In [24]:
# Preview the lag/rolling features of Weekly_Sales on the merged frame; the result is displayed, not assigned.
add_memory_feature(data)

Adding window aggregation 🫡: 100%|██████████| 3/3 [00:00<00:00,  3.29it/s]
Adding window aggregation 🫡: 100%|██████████| 3/3 [00:00<00:00,  3.32it/s]
Adding window aggregation 🫡: 100%|██████████| 3/3 [00:00<00:00,  3.26it/s]
Adding window aggregation 🫡: 100%|██████████| 3/3 [00:00<00:00,  3.22it/s]
Adding memory variable 😸: 100%|██████████| 4/4 [00:03<00:00,  1.06it/s]


Unnamed: 0,Store,Dept,Date,Weekly_Sales,lag_0_Weekly_Sales,l_0_w_3_sum_Weekly_Sales,l_0_w_3_mean_Weekly_Sales,l_0_w_3_median_Weekly_Sales,l_0_w_3_std_Weekly_Sales,l_0_w_3_max_Weekly_Sales,...,l_3_w_4_median_Weekly_Sales,l_3_w_4_std_Weekly_Sales,l_3_w_4_max_Weekly_Sales,l_3_w_4_min_Weekly_Sales,l_3_w_5_sum_Weekly_Sales,l_3_w_5_mean_Weekly_Sales,l_3_w_5_median_Weekly_Sales,l_3_w_5_std_Weekly_Sales,l_3_w_5_max_Weekly_Sales,l_3_w_5_min_Weekly_Sales
0,1,1,2010-02-05,24924.50,,,,,,,...,,,,,,,,,,
5747,1,1,2010-02-12,46039.49,,,,,,,...,,,,,,,,,,
6829,1,1,2010-02-19,41595.55,,,,,,,...,,,,,,,,,,
11698,1,1,2010-02-26,19403.54,,,,,,,...,,,,,,,,,,
12681,1,1,2010-03-05,21827.90,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523896,45,98,2013-06-28,0.00,690.52,2169.49,723.163333,690.52,120.235680,856.35,...,770.690,70.249079,874.64,713.50,4023.12,804.624,795.94,78.582297,893.60,713.50
525567,45,98,2013-07-05,0.00,659.65,1972.79,657.596667,659.65,33.996539,690.52,...,826.145,72.655671,874.64,713.50,3985.87,797.174,795.94,69.249635,874.64,713.50
527978,45,98,2013-07-12,0.00,695.21,2045.38,681.793333,690.52,19.319535,695.21,...,784.925,120.107491,874.64,622.62,3863.05,772.610,795.94,104.830565,874.64,622.62
532235,45,98,2013-07-19,0.00,845.30,2200.16,733.386667,695.21,98.537171,845.30,...,702.010,98.290848,856.35,622.62,3757.63,751.526,713.50,109.464142,874.64,622.62
