In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt 
import plotly.io as pio
import plotly.express as px
import plotly.graph_objs as go
from tqdm import tqdm

import os 
from pathlib import Path
import zipfile

In [2]:
# Default locations of the competition archive and of the folder it is unpacked into.
zip_path = "./walmart-recruiting-store-sales-forecasting.zip"
extract_dir = "./walmart_dataset"

def extract_dataset(zip_path: str, extract_dir: str):
    """Unpack a zip archive into a target folder.

    Args:
        zip_path: Path of the ``.zip`` file to unpack.
        extract_dir: Folder that receives the archive contents; it is
            created (with a notice) when it does not exist yet.

    Returns:
        None in every case. When ``zip_path`` does not exist a message is
        printed and nothing is extracted.
    """
    if os.path.exists(zip_path):
        target_missing = not os.path.exists(extract_dir)
        if target_missing:
            print(f"La carpeta de destino {extract_dir} no existe. Creando la carpeta... 🫡")
            os.makedirs(extract_dir, exist_ok=True)
        with zipfile.ZipFile(zip_path, mode="r") as archive:
            archive.extractall(path=extract_dir)
        print(f"Archivo extraído en: {extract_dir} 🫣")
    else:
        # Nothing to unpack: report it and leave the filesystem untouched.
        print(f"El archivo {zip_path} no existe.")
        return None

In [3]:
# Unpack the zip archives, then load the CSV files
def download_dataset():
    """Extract the Walmart archives and load the four CSV tables.

    The outer competition archive is unpacked first, followed by the nested
    per-file archives it contains. ``features``, ``test`` and ``train`` get
    their ``Date`` column parsed to datetime and are sorted by it (original
    index labels are kept). ``test`` additionally receives a ``Weekly_Sales``
    column filled with 0.

    Returns:
        Tuple ``(features_df, stores_df, test_df, train_df)``. If any step
        raises, the error is printed and ``(None, None, None, None)`` is
        returned instead.
    """
    dataset_dir = "./walmart_dataset"
    # Order matters: the first archive contains the nested ones.
    archives = (
        "./walmart-recruiting-store-sales-forecasting.zip",
        f"{dataset_dir}/features.csv.zip",
        f"{dataset_dir}/sampleSubmission.csv.zip",
        f"{dataset_dir}/test.csv.zip",
        f"{dataset_dir}/train.csv.zip",
    )

    def _parse_and_sort(df, columna_date):
        # Convert the date column to datetime and return the frame ordered by it.
        df[columna_date] = pd.to_datetime(df[columna_date], yearfirst=True)
        return df.sort_values(columna_date, ascending=True)

    try:
        for archive in archives:
            extract_dataset(archive, dataset_dir)

        # Load the DataFrames
        features_df = pd.read_csv(f"{dataset_dir}/features.csv")
        stores_df = pd.read_csv(f"{dataset_dir}/stores.csv")
        test_df = pd.read_csv(f"{dataset_dir}/test.csv")
        train_df = pd.read_csv(f"{dataset_dir}/train.csv")
        print("dataset descargado con éxito :)")

        train_df = _parse_and_sort(train_df, "Date")
        test_df = _parse_and_sort(test_df, "Date")
        features_df = _parse_and_sort(features_df, "Date")
        # Placeholder target so test has the same columns as train.
        test_df["Weekly_Sales"] = 0
        print("dataset formateado :)")
        return features_df, stores_df, test_df, train_df
    except Exception as e:
        # Best-effort loader: keep the notebook running, but say what went wrong.
        print(f"no se pudo descargar :( ({type(e).__name__}: {e})")
        return None, None, None, None

In [4]:
# Extract the archives and load the four tables; every name is None if loading failed.
features_df, stores_df, test_df, train_df = download_dataset()

Archivo extraído en: ./walmart_dataset 🫣
Archivo extraído en: ./walmart_dataset 🫣
Archivo extraído en: ./walmart_dataset 🫣
Archivo extraído en: ./walmart_dataset 🫣
Archivo extraído en: ./walmart_dataset 🫣
dataset descargado con éxito :)
dataset formateado :)


In [5]:
def features_info():
    """Show the column summary of the four notebook-level DataFrames.

    Reads the globals ``features_df``, ``stores_df``, ``train_df`` and
    ``test_df``. For each one a heading is displayed, followed by the
    report printed by ``DataFrame.info()``.

    Returns:
        The string ``"all!"``.
    """
    sections = (
        ("Información de features: (^◕.◕^) 😼", features_df),
        ("Información de stores: ( •̀ ω •́ )✧ 🫡", stores_df),
        ("Información de train: ヾ(•ω•`)o 🥺", train_df),
        ("Información de test: ╰(*°▽°*)╯ 🫣", test_df),
    )
    for heading, frame in sections:
        display(heading)
        # info() prints its report itself and returns None; wrapping it in
        # display() only added a stray "None" after every report.
        frame.info()
    return "all!"

In [6]:
# Print the column summary of each loaded DataFrame.
features_info()

'Información de features: (^◕.◕^) 😼'

<class 'pandas.core.frame.DataFrame'>
Index: 8190 entries, 0 to 8189
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         8190 non-null   int64         
 1   Date          8190 non-null   datetime64[ns]
 2   Temperature   8190 non-null   float64       
 3   Fuel_Price    8190 non-null   float64       
 4   MarkDown1     4032 non-null   float64       
 5   MarkDown2     2921 non-null   float64       
 6   MarkDown3     3613 non-null   float64       
 7   MarkDown4     3464 non-null   float64       
 8   MarkDown5     4050 non-null   float64       
 9   CPI           7605 non-null   float64       
 10  Unemployment  7605 non-null   float64       
 11  IsHoliday     8190 non-null   bool          
dtypes: bool(1), datetime64[ns](1), float64(9), int64(1)
memory usage: 775.8 KB


None

'Información de stores: ( •̀ ω •́ )✧ 🫡'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Store   45 non-null     int64 
 1   Type    45 non-null     object
 2   Size    45 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.2+ KB


None

'Información de train: ヾ(•ω•`)o 🥺'

<class 'pandas.core.frame.DataFrame'>
Index: 421570 entries, 0 to 421569
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         421570 non-null  int64         
 1   Dept          421570 non-null  int64         
 2   Date          421570 non-null  datetime64[ns]
 3   Weekly_Sales  421570 non-null  float64       
 4   IsHoliday     421570 non-null  bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2)
memory usage: 16.5 MB


None

'Información de test: ╰(*°▽°*)╯ 🫣'

<class 'pandas.core.frame.DataFrame'>
Index: 115064 entries, 0 to 115063
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Store         115064 non-null  int64         
 1   Dept          115064 non-null  int64         
 2   Date          115064 non-null  datetime64[ns]
 3   IsHoliday     115064 non-null  bool          
 4   Weekly_Sales  115064 non-null  int64         
dtypes: bool(1), datetime64[ns](1), int64(3)
memory usage: 4.5 MB


None

'all!'

In [25]:
def view_data_horizon(data_df: pd.DataFrame, store: int = 1, dept: int = 1, target: str = "Weekly_Sales", back_horizon: int = 52):
    """Plot a target series of one store/department next to its lagged copy.

    Args:
        data_df: Frame with at least the columns ``Store``, ``Dept``,
            ``Date`` and ``target``. It is not modified.
        store: Value of ``Store`` to select.
        dept: Value of ``Dept`` to select. The default is 1 because the
            previous implementation always plotted department 1 regardless
            of this argument, so default calls keep showing the same series.
        target: Name of the column to plot.
        back_horizon: Number of rows (one per date after sorting) by which
            the lagged series is shifted.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    lag_column = f"lag_{back_horizon}_{target}"
    selection = (data_df["Store"] == store) & (data_df["Dept"] == dept)
    # Sort by date BEFORE shifting so the lag is chronological even when the
    # caller passes an unsorted frame.
    plot_df = (
        data_df.loc[selection]
        .sort_values("Date", ascending=True)
        .set_index("Date")
    )
    # A single store/department remains, so a plain shift is enough.
    plot_df[lag_column] = plot_df[target].shift(back_horizon)
    fig = px.line(
        plot_df,
        y=[target, lag_column],
        title=f"{target} and Lag {back_horizon} Weeks",
    )
    fig.show()

In [26]:
# Plot Weekly_Sales from the training data next to its lagged copy, leaving store, dept and back_horizon at their defaults.
view_data_horizon(train_df, target="Weekly_Sales")