## Importing Libraries

In [62]:
import os
import pandas as pd

## Initial Exploration

In [60]:
grocery_sales = pd.read_csv("datasets/grocery_sales.csv", index_col="index")
grocery_sales.head()

Unnamed: 0_level_0,Store_ID,Date,Dept,Weekly_Sales
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,2010-02-05,1,24924.5
1,1,2010-02-05,26,11737.12
2,1,2010-02-05,17,13223.76
3,1,2010-02-05,45,37.44
4,1,2010-02-05,28,1085.29


In [50]:
grocery_sales.dtypes

Store_ID          int64
Date             object
Dept              int64
Weekly_Sales    float64
dtype: object

In [51]:
grocery_sales.isna().sum()

Store_ID         0
Date            39
Dept             0
Weekly_Sales    38
dtype: int64

In [61]:
extra_data = pd.read_parquet("datasets/extra_data.parquet")
extra_data.set_index("index", inplace=True)
extra_data.head()

Unnamed: 0_level_0,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3.0,151315.0
1,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3.0,151315.0
2,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3.0,151315.0
3,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,,3.0,151315.0
4,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,,3.0,151315.0


In [52]:
extra_data.dtypes

IsHoliday         int64
Temperature     float64
Fuel_Price      float64
MarkDown1       float64
MarkDown2       float64
MarkDown3       float64
MarkDown4       float64
MarkDown5       float64
CPI             float64
Unemployment    float64
Type            float64
Size            float64
dtype: object

In [53]:
extra_data.isna().sum()

IsHoliday        0
Temperature      0
Fuel_Price       0
MarkDown1        0
MarkDown2        0
MarkDown3        0
MarkDown4        1
MarkDown5        1
CPI             47
Unemployment    37
Type             1
Size             1
dtype: int64

## Data Extraction and Integration

In [57]:
def extract(csv_path, par_path):
    
    csv_df = pd.read_csv(csv_path)
    par_df = pd.read_parquet(par_path)
    merged_df = csv_df.merge(par_df, left_on="index", right_on="index")
    merged_df.set_index("index", inplace=True)
    
    return merged_df

In [58]:
merged_df = extract("datasets/grocery_sales.csv", "datasets/extra_data.parquet")
merged_df.head()

Unnamed: 0_level_0,Store_ID,Date,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1,2010-02-05,1,24924.5,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3.0,151315.0
1,1,2010-02-05,26,11737.12,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3.0,151315.0
2,1,2010-02-05,17,13223.76,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,3.0,151315.0
3,1,2010-02-05,45,37.44,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,,3.0,151315.0
4,1,2010-02-05,28,1085.29,0,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,,3.0,151315.0


## Data Transformation

In [54]:
def transform(df):
    
    df.fillna({
        'Weekly_Sales':df['Weekly_Sales'].mean(),
        'CPI':df['CPI'].mean(),
        'Unemployment':df['Unemployment'].mean(),
        'MarkDown4':df['MarkDown4'].mode(),
        'MarkDown5':df['MarkDown5'].mode(),
        'Type':df['Type'].mode(),
        'Size':df['Size'].mode(),
    }, inplace= True)
    df.fillna(method = 'ffill', inplace=True)

    df['Month'] = pd.to_datetime(df['Date'], yearfirst=True).dt.month
    
    df = df[df['Weekly_Sales'] > 10000]
    
    df = df[["Store_ID","Month","Dept","IsHoliday","Weekly_Sales","CPI","Unemployment"]]

    return df

In [55]:
clean_data = transform(merged_df)
clean_data.head()

Unnamed: 0_level_0,Store_ID,Month,Dept,IsHoliday,Weekly_Sales,CPI,Unemployment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,2,1,0,24924.5,211.096358,8.106
1,1,2,26,0,11737.12,211.096358,8.106
2,1,2,17,0,13223.76,211.096358,8.106
5,1,2,79,0,46729.77,211.096358,7.500052
6,1,2,55,0,21249.31,211.096358,7.500052


In [26]:
def avg_monthly_sales(df):
    df = pd.DataFrame(data=df.groupby("Month").Weekly_Sales.agg("mean"))
    df.reset_index(inplace=True)
    df.columns = ["Month", "Avg_Sales"]
    df["Avg_Sales"] = df["Avg_Sales"].round(decimals=2)
    return df

In [28]:
agg_data = avg_monthly_sales(clean_data)
agg_data.head(13)

Unnamed: 0,Month,Avg_Sales
0,1,33174.18
1,2,34331.35
2,3,33220.89
3,4,33390.86
4,5,33339.89
5,6,34582.47
6,7,33922.76
7,8,33644.79
8,9,33258.05
9,10,32731.06


In [29]:
def load(df1, df2, path1, path2):
    df1.to_csv(path1, index=False)
    df2.to_csv(path2, index=False)

In [30]:
load(clean_data, agg_data, "clean_data.csv", "agg_data.csv")

In [31]:
def validation(path1, path2):
    if os.path.exists(path1) == 0:
        raise Exception("Path 1 does not exists.")
    if os.path.exists(path2) == 0:
        raise Exception("Path 2 does not exists.")

In [39]:
validation("clean_data.csv", "agg_data.csv")

Exception: Path 1 does not exists.

In [38]:
os.remove("agg_data.csv")
os.remove("clean_data.csv")