In [2]:
import numpy as np 
import matplotlib.pyplot as plt 
import plotly.io as pio
import plotly.express as px
import pandas as pd 
from tqdm import tqdm
# sistema operativo
import os 
from pathlib import Path
import zipfile

In [3]:
zip_path = "./walmart-recruiting-store-sales-forecasting.zip"
extract_dir = "./walmart_dataset"

def extract_dataset(zip_path: str, extract_dir: str):
    """Unpack the zip archive at ``zip_path`` into ``extract_dir``.

    When the archive does not exist, a message is printed and ``None`` is
    returned without touching the destination. When the destination folder
    does not exist it is created before extraction. A confirmation message is
    printed once every member has been extracted.
    """
    archive_found = os.path.exists(zip_path)
    if not archive_found:
        print(f"El archivo {zip_path} no existe.")
        return None

    destination_found = os.path.exists(extract_dir)
    if not destination_found:
        print(f"La carpeta de destino {extract_dir} no existe. Creando la carpeta... 🫡")
        os.makedirs(extract_dir, exist_ok=True)

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(extract_dir)

    print(f"Archivo extraído en: {extract_dir} 🫣")

In [4]:
# Para los archivos zip
def download_dataset():
    """Extract the Walmart archives (when present) and load the CSVs as DataFrames.

    Despite its name, nothing is fetched over the network: the function calls
    ``extract_dataset`` on the competition archive and on the per-file
    archives nested inside it, then reads the CSV files found under
    ``./walmart_dataset``.

    Returns:
        ``(features_df, stores_df, test_df, train_df)``. The features, test
        and train frames are indexed by their parsed ``Date`` column and
        sorted by it in ascending order; ``stores_df`` is returned as read.
        If any step fails, the error is printed and
        ``(None, None, None, None)`` is returned.
    """
    data_dir = "./walmart_dataset"
    # Order matters: the outer archive contains the per-file archives below.
    archives = (
        "./walmart-recruiting-store-sales-forecasting.zip",
        f"{data_dir}/features.csv.zip",
        f"{data_dir}/sampleSubmission.csv.zip",
        f"{data_dir}/test.csv.zip",
        f"{data_dir}/train.csv.zip",
    )

    def index_by_date(df, columna_date):
        """Return a copy of ``df`` indexed by the parsed date column, sorted ascending."""
        dates = pd.to_datetime(df[columna_date], yearfirst=True)
        return (
            df.assign(**{columna_date: dates})
            .set_index(columna_date)
            .sort_values(columna_date, ascending=True)
        )

    try:
        for archive in archives:
            extract_dataset(archive, data_dir)

        # Dates are parsed explicitly in index_by_date, so read_csv needs no
        # date-related arguments.
        features_df = pd.read_csv(f"{data_dir}/features.csv")
        stores_df = pd.read_csv(f"{data_dir}/stores.csv")
        test_df = pd.read_csv(f"{data_dir}/test.csv")
        train_df = pd.read_csv(f"{data_dir}/train.csv")
        print("dataset descargado con éxito :)")

        train_df = index_by_date(train_df, "Date")
        test_df = index_by_date(test_df, "Date")
        features_df = index_by_date(features_df, "Date")
        print("dataset formateado :)")
        return features_df, stores_df, test_df, train_df
    except Exception as e:
        # Keep the best-effort contract (four Nones) but report what went
        # wrong so the failure can be diagnosed.
        print(f"no se pudo descargar :( ({type(e).__name__}: {e})")
        return None, None, None, None
    

In [5]:
features_df, stores_df, test_df, train_df = download_dataset()
# download_dataset() signals failure by returning four None values; stop here
# instead of letting a later cell fail with an unrelated AttributeError.
if any(frame is None for frame in (features_df, stores_df, test_df, train_df)):
    raise RuntimeError("No se pudieron cargar los datos; revisa la carpeta ./walmart_dataset.")

El archivo ./walmart-recruiting-store-sales-forecasting.zip no existe.
El archivo ./walmart_dataset/features.csv.zip no existe.
El archivo ./walmart_dataset/sampleSubmission.csv.zip no existe.
El archivo ./walmart_dataset/test.csv.zip no existe.
El archivo ./walmart_dataset/train.csv.zip no existe.
dataset descargado con éxito :)
dataset formateado :)


In [6]:
# Preview the first rows of the stores table.
stores_df.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [7]:
# Preview the first rows of the features table (indexed by Date).
features_df.head()

Unnamed: 0_level_0,Store,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-02-05,1,42.31,2.572,,,,,,211.096358,8.106,False
2010-02-05,16,19.79,2.58,,,,,,189.381697,7.039,False
2010-02-05,31,39.05,2.572,,,,,,210.752605,8.324,False
2010-02-05,21,39.05,2.572,,,,,,210.752605,8.324,False
2010-02-05,26,9.55,2.788,,,,,,131.527903,8.488,False


In [8]:





# Preview the first rows of the test table (indexed by Date).
test_df.head()

Unnamed: 0_level_0,Store,Dept,IsHoliday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-11-02,1,1,False
2012-11-02,19,10,False
2012-11-02,16,55,False
2012-11-02,22,18,False
2012-11-02,25,16,False


In [9]:
# First and last dates covered by test_df (its index holds the parsed Date column).
print(f"Fecha Inicio {test_df.index.min()} y fecha Fin {test_df.index.max()}")

Fecha Inicio 2012-11-02 00:00:00 y fecha Fin 2013-07-26 00:00:00


In [10]:
# Preview the first rows of the train table (indexed by Date).
train_df.head()

Unnamed: 0_level_0,Store,Dept,Weekly_Sales,IsHoliday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-02-05,1,1,24924.5,False
2010-02-05,29,5,15552.08,False
2010-02-05,29,6,3200.22,False
2010-02-05,29,7,10820.05,False
2010-02-05,29,8,20055.64,False


In [11]:
# First and last dates covered by train_df (its index holds the parsed Date column).
print(f"Fecha Inicio {train_df.index.min()} y fecha Fin {train_df.index.max()}")

Fecha Inicio 2010-02-05 00:00:00 y fecha Fin 2012-10-26 00:00:00


In [12]:
# NOTE(review): this repeats the test_df date-range check from an earlier cell.
print(f"Fecha Inicio {test_df.index.min()} y fecha Fin {test_df.index.max()}")

Fecha Inicio 2012-11-02 00:00:00 y fecha Fin 2013-07-26 00:00:00


In [13]:
# data_df is a second name for the same DataFrame object as train_df (no copy
# is made), so in-place changes made through either name affect both.
data_df = train_df

In [14]:
# First and last dates covered by data_df.
print(f"Fecha Inicio {data_df.index.min()} y fecha Fin {data_df.index.max()}")

Fecha Inicio 2010-02-05 00:00:00 y fecha Fin 2012-10-26 00:00:00


In [15]:
# Weekly sales time series for store 1, department 1 only.
# The row mask and the column list are applied in a single .loc selection,
# which keeps the result a one-column DataFrame.
store_1_dept_1 = (data_df["Dept"] == 1) & (data_df["Store"] == 1)
dataset = data_df.loc[store_1_dept_1, ["Weekly_Sales"]]

In [16]:
# Display the selected Weekly_Sales series.
dataset

Unnamed: 0_level_0,Weekly_Sales
Date,Unnamed: 1_level_1
2010-02-05,24924.50
2010-02-12,46039.49
2010-02-19,41595.55
2010-02-26,19403.54
2010-03-05,21827.90
...,...
2012-09-28,18947.81
2012-10-05,21904.47
2012-10-12,22764.01
2012-10-19,24185.27


In [17]:
# Dividir
# train_dataset = Fecha Inicio 2010-02-05 00:00:00 y fecha Fin 2012-09-25 00:00:00 
# test_dataset = Fecha Inicio 2012-09-26 00:00:00 y fecha Fin 2012-10-26 00:00:00 
# submission = Fecha Inicio 2012-11-02 00:00:00 y fecha Fin 2013-07-26 00:00:00

# evaluarlos
# hacer plots
# ver autocorrelacion
# aplicar un test
# aplicar todos los modelos y pruebas
# obtener la matriz final