In [2]:
#Import libraries
import pandas as pd
import numpy as np
import os

In [5]:
#Load data
stores = pd.read_csv("../data/raw_data/stores_data_set.csv")
features = pd.read_csv("../data/raw_data/features_data_set.csv")
sales = pd.read_csv("../data/raw_data/sales_data_set.csv")

In [6]:
#Explore structure
print("Stores:", stores.shape)
print("Features:", features.shape)
print("Sales:", sales.shape)

Stores: (45, 3)
Features: (8190, 12)
Sales: (421570, 5)


In [8]:
#Convert date columns correctly
features['Date'] = pd.to_datetime(features['Date'], dayfirst=True)
sales['Date'] = pd.to_datetime(sales['Date'], dayfirst=True)

In [9]:
#Merge dataframes
df = sales.merge(features, on=['Store', 'Date'], how='left')
df = df.merge(stores, on='Store', how='left')

In [11]:
#Handle missing values
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
for col in markdown_cols:
    if col in df.columns:
        df[col].fillna(0, inplace=True)

df['CPI'] = df['CPI'].fillna(method='ffill')
df['Unemployment'] = df['Unemployment'].fillna(method='ffill')
print("Missing values after cleaning:\n", df.isnull().sum())


Missing values after cleaning:
 Store           0
Dept            0
Date            0
Weekly_Sales    0
IsHoliday_x     0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
IsHoliday_y     0
Type            0
Size            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)
  df['CPI'] = df['CPI'].fillna(method='ffill')
  df['Unemployment'] = df['Unemployment'].fillna(method='ffill')


In [12]:
#Save cleaned and merged data
df.to_csv("../data/processed_data/cleaned_data.csv", index=False)
print("Cleaned data saved to data/processed_data/cleaned_data.csv")


Cleaned data saved to data/processed_data/cleaned_data.csv
