In [2]:
import pandas as pd

# Load the raw weekly-sales dataset (path is relative to the notebook's working directory).
file_path = 'Walmart.csv'
data = pd.read_csv(file_path)

# Column dtypes and non-null counts. info() prints its report and returns None,
# so it is called as a statement; placing it inside the output tuple only added
# a stray `None` to the cell result.
data.info()

# Show the first rows and the summary statistics as two separate rich outputs
# instead of one plain-text tuple repr. `display` is provided by the IPython
# kernel without an import.
display(data.head(), data.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


(   Store        Date  Weekly_Sales  Holiday_Flag  Temperature  Fuel_Price  \
 0      1  05-02-2010    1643690.90             0        42.31       2.572   
 1      1  12-02-2010    1641957.44             1        38.51       2.548   
 2      1  19-02-2010    1611968.17             0        39.93       2.514   
 3      1  26-02-2010    1409727.59             0        46.63       2.561   
 4      1  05-03-2010    1554806.68             0        46.50       2.625   
 
           CPI  Unemployment  
 0  211.096358         8.106  
 1  211.242170         8.106  
 2  211.289143         8.106  
 3  211.319643         8.106  
 4  211.350143         8.106  ,
 None,
              Store  Weekly_Sales  Holiday_Flag  Temperature   Fuel_Price  \
 count  6435.000000  6.435000e+03   6435.000000  6435.000000  6435.000000   
 mean     23.000000  1.046965e+06      0.069930    60.663782     3.358607   
 std      12.988182  5.643666e+05      0.255049    18.444933     0.459020   
 min       1.000000  2.09986

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from datetime import datetime

# Feature engineering: expand 'Date' (day-month-year strings) into Year / Month /
# Week columns and drop the original column.
# The guard keeps this cell re-runnable: without it, a second execution fails with
# KeyError because `data` has already been rebound to a frame without 'Date'.
if 'Date' in data.columns:
    sale_dates = pd.to_datetime(data['Date'], format='%d-%m-%Y')
    data = data.assign(
        Year=sale_dates.dt.year,
        Month=sale_dates.dt.month,
        # isocalendar().week is a nullable UInt32 extension column; cast it to a
        # plain integer so every feature handed to scikit-learn is a NumPy dtype.
        Week=sale_dates.dt.isocalendar().week.astype('int64'),
    ).drop(columns='Date')

# Features are every remaining column except the target; target is Weekly_Sales.
X = data.drop(columns='Weekly_Sales')
y = data['Weekly_Sales']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# NOTE(review): this is a random (shuffled) split of weekly observations, so test
# weeks are interleaved with training weeks — confirm that is intended rather than
# a chronological hold-out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: statistics are fitted on the training rows only and
# then re-used for the test rows, so nothing from the test set leaks into scaling.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the first five scaled rows of each split.
X_train_scaled[:5], X_test_scaled[:5]




(array([[-1.16466588, -0.26842748,  0.8057783 , -1.71088305,  1.10380441,
         -0.8978744 , -1.22576021,  0.7955697 ,  0.79789733],
        [-1.24165623, -0.26842748, -2.15108453,  0.01856676,  0.52254808,
          0.43847483,  0.0325077 , -1.05624317, -1.11197777],
        [ 1.45300579, -0.26842748,  0.04355515, -0.51121407, -1.14613429,
          0.53724615, -1.22576021,  1.41284065,  1.36378625],
        [-0.62573348, -0.26842748,  0.47392973,  1.18758726, -0.89989198,
         -0.10183097,  0.0325077 ,  0.48693422,  0.58568899],
        [ 1.45300579, -0.26842748,  1.45973113,  0.83075141, -1.077504  ,
          0.13895749,  0.0325077 ,  0.48693422,  0.51495287]]),
 array([[-0.39476245, -0.26842748, -0.97689275, -1.22269659, -1.01271986,
          0.64349205, -1.22576021, -1.05624317, -1.11197777],
        [ 0.06717961, -0.26842748,  0.45714783,  1.31455952, -0.92404756,
          0.11493203,  0.0325077 , -0.13033674, -0.05093605],
        [-1.62660794, -0.26842748,  0.29420097

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Build a 100-tree random forest with a fixed seed and fit it on the scaled
# training split (fit() returns the fitted estimator itself).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predict weekly sales for the held-out rows.
y_pred_rf = rf_model.predict(X_test_scaled)

# Score the predictions against the true test targets: MSE, MAE and R^2,
# computed in that order.
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Cell output: the three scores as a tuple.
mse_rf, mae_rf, r2_rf


(13365855228.832611, 63197.56307614608, 0.9585110481236588)