<a href="https://colab.research.google.com/github/tvani2/Walmart-Recruiting---Store-Sales-Forecasting/blob/main/LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install kaggle
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/cs231n/assignments/finalproject/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 373MB/s]


In [None]:
! unzip walmart-recruiting-store-sales-forecasting.zip

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [None]:
!unzip features.csv.zip
!unzip train.csv.zip
!unzip test.csv.zip
!unzip sampleSubmission.csv.zip

Archive:  features.csv.zip
  inflating: features.csv            
Archive:  train.csv.zip
  inflating: train.csv               
Archive:  test.csv.zip
  inflating: test.csv                
Archive:  sampleSubmission.csv.zip
  inflating: sampleSubmission.csv    


In [None]:
import pandas as pd

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
features = pd.read_csv("features.csv")
stores = pd.read_csv("stores.csv")

In [None]:
print("Train:")
print(train.head())

print("\nTest:")
print(test.head())

print("\nFeatures:")
print(features.head())

print("\nStores:")
print(stores.head())
train.info()
features.info()

Train:
   Store  Dept        Date  Weekly_Sales  IsHoliday
0      1     1  2010-02-05      24924.50      False
1      1     1  2010-02-12      46039.49       True
2      1     1  2010-02-19      41595.55      False
3      1     1  2010-02-26      19403.54      False
4      1     1  2010-03-05      21827.90      False

Test:
   Store  Dept        Date  IsHoliday
0      1     1  2012-11-02      False
1      1     1  2012-11-09      False
2      1     1  2012-11-16      False
3      1     1  2012-11-23       True
4      1     1  2012-11-30      False

Features:
   Store        Date  Temperature  Fuel_Price  MarkDown1  MarkDown2  \
0      1  2010-02-05        42.31       2.572        NaN        NaN   
1      1  2010-02-12        38.51       2.548        NaN        NaN   
2      1  2010-02-19        39.93       2.514        NaN        NaN   
3      1  2010-02-26        46.63       2.561        NaN        NaN   
4      1  2010-03-05        46.50       2.625        NaN        NaN   

   MarkD

In [None]:
def missing_percent(df):
    return df.isna().mean().round(4) * 100

print("Features missing %:")
print(missing_percent(features))

Features missing %:
Store            0.00
Date             0.00
Temperature      0.00
Fuel_Price       0.00
MarkDown1       50.77
MarkDown2       64.33
MarkDown3       55.89
MarkDown4       57.70
MarkDown5       50.55
CPI              7.14
Unemployment     7.14
IsHoliday        0.00
dtype: float64


In [None]:
class DataProcessor:
    def merge_data(self, main_df, features_df, stores_df):
        merged = pd.merge(main_df, features_df, on=["Store", "Date", "IsHoliday"], how="left")
        merged = pd.merge(merged, stores_df, on="Store", how="left")
        return merged

    def convert_date(self, df):
        df["Date"] = pd.to_datetime(df["Date"])
        return df

    def extract_date_features(self, df):
        df["Year"] = df["Date"].dt.year
        df["Month"] = df["Date"].dt.month
        df["Week"] = df["Date"].dt.isocalendar().week
        df["Day"] = df["Date"].dt.dayofweek
        return df

    def fill_markdowns(self, df):
        markdown_cols = [f"MarkDown{i}" for i in range(1, 6)]
        df[markdown_cols] = df[markdown_cols].fillna(0)
        return df

    def fill_economics(self, df):
        df["CPI"] = df["CPI"].fillna(df["CPI"].median())
        df["Unemployment"] = df["Unemployment"].fillna(df["Unemployment"].median())
        return df

    def encode_types(self, df):
        df["Type"] = df["Type"].map({"A": 0, "B": 1, "C": 2})
        return df

    def process_data(self, main_df, features_df, stores_df):
        df = self.merge_data(main_df, features_df, stores_df)
        df = self.convert_date(df)
        df = self.extract_date_features(df)
        df = self.fill_markdowns(df)
        df = self.fill_economics(df)
        df = self.encode_types(df)
        return df

In [None]:
processor = DataProcessor()

train_merged = processor.process_data(train, features, stores)
test_merged = processor.process_data(test, features, stores)

In [None]:
train_merged.shape, test_merged.shape

((421570, 20), (115064, 19))

In [None]:
train_merged.isna().sum()

Unnamed: 0,0
Store,0
Dept,0
Date,0
Weekly_Sales,0
IsHoliday,0
Temperature,0
Fuel_Price,0
MarkDown1,0
MarkDown2,0
MarkDown3,0


In [None]:
train_merged.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Year,Month,Week,Day
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,151315,2010,2,5,4
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,0,151315,2010,2,6,4
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,0,151315,2010,2,7,4
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,0,151315,2010,2,8,4
4,1,1,2010-03-05,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,0,151315,2010,3,9,4


In [None]:
train_merged['Weekly_Sales'].sort_values(ascending=True).head()

Unnamed: 0,Weekly_Sales
267730,-4988.94
336495,-3924.0
417801,-1750.0
153916,-1699.0
271300,-1321.48


In [None]:
# Calculate mean and median weekly sales
mean_weekly_sales = train_merged['Weekly_Sales'].mean()
median_weekly_sales = train_merged['Weekly_Sales'].median()

print(f"Mean weekly sales: ${mean_weekly_sales:.2f}")
print(f"Median weekly sales: ${median_weekly_sales:.2f}")

# Compare holiday vs Non-Holiday Sales
holiday_sales = train_merged.groupby('IsHoliday')['Weekly_Sales'].agg(['mean', 'median', 'count'])

print("\nHoliday vs Non-Holiday Sales:")
print(holiday_sales)

# Top 10 departments by average sales
top_10_departments = train_merged.groupby('Dept')['Weekly_Sales'].mean().sort_values(ascending=False).head(10)

print("\nTop 10 departments by average sales:")
print(top_10_departments)

Mean weekly sales: $15981.26
Median weekly sales: $7612.03

Holiday vs Non-Holiday Sales:
                   mean   median   count
IsHoliday                               
False      15901.445069  7589.95  391909
True       17035.823187  7947.74   29661

Top 10 departments by average sales:
Dept
92    75204.870531
95    69824.423080
38    61090.619568
72    50566.515417
65    45441.706224
90    45232.084488
40    44900.702727
2     43607.020113
91    33687.910758
94    33405.883963
Name: Weekly_Sales, dtype: float64


In [None]:
if not train_merged['Date'].is_monotonic_increasing:
    train_merged = train_merged.sort_values(by='Date')

display(train_merged.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Year,Month,Week,Day
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,151315,2010,2,5,4
277665,29,5,2010-02-05,15552.08,False,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,1,93638,2010,2,5,4
277808,29,6,2010-02-05,3200.22,False,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,1,93638,2010,2,5,4
277951,29,7,2010-02-05,10820.05,False,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,1,93638,2010,2,5,4
278094,29,8,2010-02-05,20055.64,False,24.36,2.788,0.0,0.0,0.0,0.0,0.0,131.527903,10.064,1,93638,2010,2,5,4


Create features representing the sales from previous weeks by grouping the dataframe by Store and Dept and applying a shift of 1, 2, and 3 weeks respectively.



In [None]:
train_merged['Weekly_Sales_Lag1'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(1).fillna(0)
train_merged['Weekly_Sales_Lag2'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(2).fillna(0)
train_merged['Weekly_Sales_Lag3'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(3).fillna(0)

display(train_merged.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Unemployment,Type,Size,Year,Month,Week,Day,Weekly_Sales_Lag1,Weekly_Sales_Lag2,Weekly_Sales_Lag3
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,8.106,0,151315,2010,2,5,4,0.0,0.0,0.0
277665,29,5,2010-02-05,15552.08,False,24.36,2.788,0.0,0.0,0.0,...,10.064,1,93638,2010,2,5,4,0.0,0.0,0.0
277808,29,6,2010-02-05,3200.22,False,24.36,2.788,0.0,0.0,0.0,...,10.064,1,93638,2010,2,5,4,0.0,0.0,0.0
277951,29,7,2010-02-05,10820.05,False,24.36,2.788,0.0,0.0,0.0,...,10.064,1,93638,2010,2,5,4,0.0,0.0,0.0
278094,29,8,2010-02-05,20055.64,False,24.36,2.788,0.0,0.0,0.0,...,10.064,1,93638,2010,2,5,4,0.0,0.0,0.0


Calculate moving averages of sales over different time windows to capture trends.

In [None]:
train_merged['Weekly_Sales_MA4'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(lambda x: x.rolling(window=4).mean()).fillna(0)
train_merged['Weekly_Sales_MA12'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(lambda x: x.rolling(window=12).mean()).fillna(0)
train_merged['Weekly_Sales_MA26'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(lambda x: x.rolling(window=26).mean()).fillna(0)

display(train_merged.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Year,Month,Week,Day,Weekly_Sales_Lag1,Weekly_Sales_Lag2,Weekly_Sales_Lag3,Weekly_Sales_MA4,Weekly_Sales_MA12,Weekly_Sales_MA26
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0
277665,29,5,2010-02-05,15552.08,False,24.36,2.788,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0
277808,29,6,2010-02-05,3200.22,False,24.36,2.788,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0
277951,29,7,2010-02-05,10820.05,False,24.36,2.788,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0
278094,29,8,2010-02-05,20055.64,False,24.36,2.788,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0


Create features to capture monthly, weekly, or yearly seasonality.

In [None]:
train_merged["Year"] = train_merged["Date"].dt.year
train_merged["Month"] = train_merged["Date"].dt.month
train_merged["Week"] = train_merged["Date"].dt.isocalendar().week
train_merged["Day"] = train_merged["Date"].dt.dayofweek

display(train_merged.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Year,Month,Week,Day,Weekly_Sales_Lag1,Weekly_Sales_Lag2,Weekly_Sales_Lag3,Weekly_Sales_MA4,Weekly_Sales_MA12,Weekly_Sales_MA26
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0
277665,29,5,2010-02-05,15552.08,False,24.36,2.788,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0
277808,29,6,2010-02-05,3200.22,False,24.36,2.788,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0
277951,29,7,2010-02-05,10820.05,False,24.36,2.788,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0
278094,29,8,2010-02-05,20055.64,False,24.36,2.788,0.0,0.0,0.0,...,2010,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0


Identify the dates of the major holidays based on the dataset's IsHoliday column and create features related to holiday proximity.



In [None]:
major_holiday_dates = train_merged[train_merged['IsHoliday'] == True]['Date'].unique()

def days_to_next_holiday(date, holidays):
    future_holidays = holidays[holidays > date]
    if len(future_holidays) > 0:
        return (future_holidays.min() - date).days
    return -1 # Or some other indicator for no future holidays

def days_since_last_holiday(date, holidays):
    past_holidays = holidays[holidays < date]
    if len(past_holidays) > 0:
        return (date - past_holidays.max()).days
    return -1 # Or some other indicator for no past holidays

# Apply holiday proximity features to both train and test data
train_merged['Days_To_Next_Holiday'] = train_merged['Date'].apply(lambda x: days_to_next_holiday(x, major_holiday_dates))
train_merged['Days_Since_Last_Holiday'] = train_merged['Date'].apply(lambda x: days_since_last_holiday(x, major_holiday_dates))

test_merged['Days_To_Next_Holiday'] = test_merged['Date'].apply(lambda x: days_to_next_holiday(x, major_holiday_dates))
test_merged['Days_Since_Last_Holiday'] = test_merged['Date'].apply(lambda x: days_since_last_holiday(x, major_holiday_dates))

display(train_merged.head())
display(test_merged.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Week,Day,Weekly_Sales_Lag1,Weekly_Sales_Lag2,Weekly_Sales_Lag3,Weekly_Sales_MA4,Weekly_Sales_MA12,Weekly_Sales_MA26,Days_To_Next_Holiday,Days_Since_Last_Holiday
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,...,5,4,0.0,0.0,0.0,0.0,0.0,0.0,7,-1
277665,29,5,2010-02-05,15552.08,False,24.36,2.788,0.0,0.0,0.0,...,5,4,0.0,0.0,0.0,0.0,0.0,0.0,7,-1
277808,29,6,2010-02-05,3200.22,False,24.36,2.788,0.0,0.0,0.0,...,5,4,0.0,0.0,0.0,0.0,0.0,0.0,7,-1
277951,29,7,2010-02-05,10820.05,False,24.36,2.788,0.0,0.0,0.0,...,5,4,0.0,0.0,0.0,0.0,0.0,0.0,7,-1
278094,29,8,2010-02-05,20055.64,False,24.36,2.788,0.0,0.0,0.0,...,5,4,0.0,0.0,0.0,0.0,0.0,0.0,7,-1


Unnamed: 0,Store,Dept,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,...,CPI,Unemployment,Type,Size,Year,Month,Week,Day,Days_To_Next_Holiday,Days_Since_Last_Holiday
0,1,1,2012-11-02,False,55.32,3.386,6766.44,5147.7,50.82,3639.9,...,223.462779,6.573,0,151315,2012,11,44,4,-1,56
1,1,1,2012-11-09,False,61.24,3.314,11421.32,3370.89,40.28,4646.79,...,223.481307,6.573,0,151315,2012,11,45,4,-1,63
2,1,1,2012-11-16,False,52.92,3.252,9696.28,292.1,103.78,1133.15,...,223.512911,6.573,0,151315,2012,11,46,4,-1,70
3,1,1,2012-11-23,True,56.23,3.211,883.59,4.17,74910.32,209.91,...,223.561947,6.573,0,151315,2012,11,47,4,-1,77
4,1,1,2012-11-30,False,52.34,3.207,2460.03,0.0,3838.35,150.57,...,223.610984,6.573,0,151315,2012,11,48,4,-1,84


In [None]:
holiday_dates = {
    "Super_Bowl": ["2010-02-12", "2011-02-11", "2012-02-10", "2013-02-08"],
    "Labor_Day": ["2010-09-10", "2011-09-09", "2012-09-07", "2013-09-06"],
    "Thanksgiving": ["2010-11-26", "2011-11-25", "2012-11-23", "2013-11-29"],
    "Christmas": ["2010-12-31", "2011-12-30", "2012-12-28", "2013-12-27"]
}

for holiday, dates in holiday_dates.items():
    train_merged[holiday] = train_merged["Date"].isin(pd.to_datetime(dates)).astype(int)
    test_merged[holiday] = test_merged["Date"].isin(pd.to_datetime(dates)).astype(int)

train_merged['IsHoliday'] = train_merged['IsHoliday'].astype(int)
test_merged['IsHoliday'] = test_merged['IsHoliday'].astype(int)

display(train_merged.head())
display(test_merged.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Weekly_Sales_Lag3,Weekly_Sales_MA4,Weekly_Sales_MA12,Weekly_Sales_MA26,Days_To_Next_Holiday,Days_Since_Last_Holiday,Super_Bowl,Labor_Day,Thanksgiving,Christmas
0,1,1,2010-02-05,24924.5,0,42.31,2.572,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7,-1,0,0,0,0
277665,29,5,2010-02-05,15552.08,0,24.36,2.788,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7,-1,0,0,0,0
277808,29,6,2010-02-05,3200.22,0,24.36,2.788,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7,-1,0,0,0,0
277951,29,7,2010-02-05,10820.05,0,24.36,2.788,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7,-1,0,0,0,0
278094,29,8,2010-02-05,20055.64,0,24.36,2.788,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7,-1,0,0,0,0


Unnamed: 0,Store,Dept,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,...,Year,Month,Week,Day,Days_To_Next_Holiday,Days_Since_Last_Holiday,Super_Bowl,Labor_Day,Thanksgiving,Christmas
0,1,1,2012-11-02,0,55.32,3.386,6766.44,5147.7,50.82,3639.9,...,2012,11,44,4,-1,56,0,0,0,0
1,1,1,2012-11-09,0,61.24,3.314,11421.32,3370.89,40.28,4646.79,...,2012,11,45,4,-1,63,0,0,0,0
2,1,1,2012-11-16,0,52.92,3.252,9696.28,292.1,103.78,1133.15,...,2012,11,46,4,-1,70,0,0,0,0
3,1,1,2012-11-23,1,56.23,3.211,883.59,4.17,74910.32,209.91,...,2012,11,47,4,-1,77,0,0,1,0
4,1,1,2012-11-30,0,52.34,3.207,2460.03,0.0,3838.35,150.57,...,2012,11,48,4,-1,84,0,0,0,0


In [None]:
# Determine the split points (e.g., 80% for training, 10% for validation, 10% for testing)
train_size = int(len(train_merged) * 0.8)
val_size = int(len(train_merged) * 0.1)
test_size = len(train_merged) - train_size - val_size

# Find the dates at the calculated split points
train_split_date = train_merged.iloc[train_size]['Date']
val_split_date = train_merged.iloc[train_size + val_size]['Date']

# Find the closest Friday to the split dates within the dataset
# We can iterate through the dates in the merged dataframe to find the exact Friday
def find_closest_friday(date, df_dates):
    closest_friday = None
    for d in df_dates:
        if d >= date and d.dayofweek == 4: # Friday is dayofweek 4
            closest_friday = d
            break
    return closest_friday

all_dates = train_merged['Date'].unique()
train_split_date_friday = find_closest_friday(train_split_date, all_dates)
val_split_date_friday = find_closest_friday(val_split_date, all_dates)

print(f"Original train split date: {train_split_date}")
print(f"Adjusted train split date (Friday): {train_split_date_friday}")
print(f"Original validation split date: {val_split_date}")
print(f"Adjusted validation split date (Friday): {val_split_date_friday}")

# Split the data based on the adjusted Friday dates
train_df = train_merged[train_merged['Date'] < train_split_date_friday]
val_df = train_merged[(train_merged['Date'] >= train_split_date_friday) & (train_merged['Date'] < val_split_date_friday)]
test_df = train_merged[train_merged['Date'] >= val_split_date_friday]

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print("Training set shape after resetting index:", train_df.shape)
print("Validation set shape after resetting index:", val_df.shape)
print("Test set shape after resetting index:", test_df.shape)

Original train split date: 2012-04-13 00:00:00
Adjusted train split date (Friday): 2012-04-13 00:00:00
Original validation split date: 2012-07-20 00:00:00
Adjusted validation split date (Friday): 2012-07-20 00:00:00
Training set shape after resetting index: (335761, 32)
Validation set shape after resetting index: (41394, 32)
Test set shape after resetting index: (44415, 32)


In [None]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtvani22[0m ([33mfinal-project-ml[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
TARGET = "Weekly_Sales"
DROP_COLS = ["Date", TARGET]

# Features
X_train = train_df.drop(columns=DROP_COLS)
X_val = val_df.drop(columns=DROP_COLS)
X_test = test_df.drop(columns=DROP_COLS)

# Targets
y_train = train_df[TARGET]
y_val = val_df[TARGET]
y_test = test_df[TARGET]

In [None]:
!pip install lightgbm --quiet
import lightgbm as lgb
import numpy as np
wandb.init(
    project="walmart-forecasting",
    entity="final-project-ml",
    name="lightGBM basic",
    config={
        "model": "LightGBM",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "n_estimators": 1000,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42
    }
)

In [None]:
def WMAE(y_true, y_pred, weights):
    return np.round(np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights), 5)

# Use 5 for holiday weeks, 1 otherwise
train_weights = np.where(train_df["IsHoliday"] == 1, 5, 1)
val_weights = np.where(val_df["IsHoliday"] == 1, 5, 1)
test_weights = np.where(test_df["IsHoliday"] == 1, 5, 1)

In [None]:
def weighted_mean(y_true, weights):
    return (y_true * weights).sum() / weights.sum()

train_mean_weighted = weighted_mean(y_train, train_weights)
val_mean_weighted = weighted_mean(y_val, val_weights)
test_mean_weighted = weighted_mean(y_test, test_weights)

In [None]:
model = lgb.LGBMRegressor(
    objective="regression",
    learning_rate=0.05,
    num_leaves=31,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbosity=-1
)

best_val_wmae = float("inf")
early_stopping_rounds = 50
rounds_no_improve = 0

for i in range(1, 101):
    model.n_estimators = i
    model.fit(X_train, y_train)

    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)
    test_preds = model.predict(X_test)

    train_wmae = WMAE(y_train, train_preds, train_weights)
    val_wmae = WMAE(y_val, val_preds, val_weights)
    test_wmae = WMAE(y_test, test_preds, test_weights)

    train_pct_off = (train_wmae / train_mean_weighted) * 100
    val_pct_off = (val_wmae / val_mean_weighted) * 100
    test_pct_off = (test_wmae / test_mean_weighted) * 100

    wandb.log({
        "epoch": i,
        "train_WMAE": train_wmae,
        "val_WMAE": val_wmae,
        "test_WMAE": test_wmae,
        "train_WMAE_percent": train_pct_off,
        "val_WMAE_percent": val_pct_off,
        "test_WMAE_percent": test_pct_off
    })

    print(f"Epoch {i} - Train: {train_wmae:.2f} ({train_pct_off:.2f}%)"
          f", Val: {val_wmae:.2f} ({val_pct_off:.2f}%)"
          f", Test: {test_wmae:.2f} ({test_pct_off:.2f}%)")

    if val_wmae < best_val_wmae:
        best_val_wmae = val_wmae
        rounds_no_improve = 0
    else:
        rounds_no_improve += 1

    if rounds_no_improve >= early_stopping_rounds:
        print(f"Early stopping at round {i}")
        break

wandb.finish()

Epoch 1 - Train: 14661.85 (89.98%), Val: 14474.29 (90.49%), Test: 14382.53 (91.43%)
Epoch 2 - Train: 13963.25 (85.69%), Val: 13766.05 (86.06%), Test: 13676.35 (86.94%)
Epoch 3 - Train: 13301.82 (81.63%), Val: 13092.38 (81.85%), Test: 13007.92 (82.69%)
Epoch 4 - Train: 12679.89 (77.82%), Val: 12451.16 (77.84%), Test: 12373.68 (78.66%)
Epoch 5 - Train: 12093.63 (74.22%), Val: 11840.16 (74.02%), Test: 11771.62 (74.83%)
Epoch 6 - Train: 11530.75 (70.77%), Val: 11263.97 (70.42%), Test: 11200.79 (71.20%)
Epoch 7 - Train: 11008.28 (67.56%), Val: 10717.45 (67.00%), Test: 10657.18 (67.75%)
Epoch 8 - Train: 10512.70 (64.52%), Val: 10201.11 (63.77%), Test: 10145.43 (64.49%)
Epoch 9 - Train: 10028.86 (61.55%), Val: 9709.15 (60.70%), Test: 9657.50 (61.39%)
Epoch 10 - Train: 9572.01 (58.74%), Val: 9242.33 (57.78%), Test: 9195.80 (58.46%)
Epoch 11 - Train: 9142.01 (56.11%), Val: 8798.64 (55.00%), Test: 8757.82 (55.67%)
Epoch 12 - Train: 8743.34 (53.66%), Val: 8379.66 (52.39%), Test: 8341.99 (53.03%)


0,1
epoch,▁▁▁▂▂▁▁▁▂▂▂▂▃▃▃▄▄▄▄▁▁▁▂▂▂▃▃▃▃▅▅▅▆▆▆▇▇▇██
test_WMAE,▇▆▆▆▆▇▇▆▆▅▄▃▃▂▂▁█▅▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_WMAE_percent,█▇▆▄▃▃▃▂▂▂▁▁█▇▆▆▃▃▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_WMAE,█▇▇▆█▆▆▅▅▄█▇▅▄▄▃▂▂▂▁▆▆▅▅▅▄▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train_WMAE_percent,█▆▆▅▄▃▃▃▂▂▂▂▂▁█▆▅▅▅▄▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_WMAE,▆██▇▇▇▅▄▄█▅▄▄▃▃▂▂▂▁▁▅▅▄▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
val_WMAE_percent,██▇▅▅▄▄▄▄▂▂▂▂█▅▅▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,100.0
test_WMAE,1425.34185
test_WMAE_percent,9.06062
train_WMAE,1737.16161
train_WMAE_percent,10.6611
val_WMAE,1291.13213
val_WMAE_percent,8.07148


In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import itertools
import wandb

# Initialize wandb
wandb.init(
    project="walmart-forecasting",
    entity="final-project-ml",
    name="LightGBM tuning",
    config={
        "model": "LightGBM",
        "search_type": "grid",
        "early_stopping_rounds": 50,
        "n_estimators": 1000,
        "metric": "WMAE"
    }
)

# Define WMAE
def WMAE(y_true, y_pred, weights):
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

# Define weights
train_weights = np.where(train_df['IsHoliday'] == True, 5, 1)
val_weights = np.where(val_df['IsHoliday'] == True, 5, 1)
test_weights = np.where(test_df['IsHoliday'] == True, 5, 1)

# Hyperparameter grid
param_grid = {
    'num_leaves': [31, 50, 70],
    'max_depth': [7, 10, 15],
    'learning_rate': [0.05, 0.1],
    'min_child_samples': [20, 50],
    'subsample': [0.8, 1.0]
}

best_val_wmae = float('inf')
best_params = None

# Grid search
for num_leaves, max_depth, learning_rate, min_child_samples, subsample in itertools.product(
    param_grid['num_leaves'],
    param_grid['max_depth'],
    param_grid['learning_rate'],
    param_grid['min_child_samples'],
    param_grid['subsample']
):
    model = lgb.LGBMRegressor(
        objective="regression",
        num_leaves=num_leaves,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=1000,
        min_child_samples=min_child_samples,
        subsample=subsample,
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    val_preds = model.predict(X_val, num_iteration=model.best_iteration_)
    val_wmae = WMAE(y_val, val_preds, val_weights)

    wandb.log({
        "val_WMAE": val_wmae,
        "params/num_leaves": num_leaves,
        "params/max_depth": max_depth,
        "params/learning_rate": learning_rate,
        "params/min_child_samples": min_child_samples,
        "params/subsample": subsample
    })

    print(f"Params: num_leaves={num_leaves}, max_depth={max_depth}, lr={learning_rate}, "
          f"min_child_samples={min_child_samples}, subsample={subsample} -> Val WMAE: {val_wmae:.4f}")

    if val_wmae < best_val_wmae:
        best_val_wmae = val_wmae
        best_params = {
            'num_leaves': num_leaves,
            'max_depth': max_depth,
            'learning_rate': learning_rate,
            'min_child_samples': min_child_samples,
            'subsample': subsample
        }

# Final training on train+val
print("\nBest params:", best_params)
print("Best validation WMAE:", best_val_wmae)

X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

final_model = lgb.LGBMRegressor(
    objective="regression",
    **best_params,
    n_estimators=1000,
    random_state=42
)

final_model.fit(
    X_train_val, y_train_val,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

test_preds = final_model.predict(X_test, num_iteration=final_model.best_iteration_)
test_wmae = WMAE(y_test, test_preds, test_weights)

print(f"Test WMAE with best params: {test_wmae:.4f}")
wandb.log({"final_test_WMAE": test_wmae})

wandb.finish()

Params: num_leaves=31, max_depth=7, lr=0.05, min_child_samples=20, subsample=0.8 -> Val WMAE: 875.1967
Params: num_leaves=31, max_depth=7, lr=0.05, min_child_samples=20, subsample=1.0 -> Val WMAE: 875.1967
Params: num_leaves=31, max_depth=7, lr=0.05, min_child_samples=50, subsample=0.8 -> Val WMAE: 901.4147
Params: num_leaves=31, max_depth=7, lr=0.05, min_child_samples=50, subsample=1.0 -> Val WMAE: 901.4147
Params: num_leaves=31, max_depth=7, lr=0.1, min_child_samples=20, subsample=0.8 -> Val WMAE: 730.5658
Params: num_leaves=31, max_depth=7, lr=0.1, min_child_samples=20, subsample=1.0 -> Val WMAE: 730.5658
Params: num_leaves=31, max_depth=7, lr=0.1, min_child_samples=50, subsample=0.8 -> Val WMAE: 726.7433
Params: num_leaves=31, max_depth=7, lr=0.1, min_child_samples=50, subsample=1.0 -> Val WMAE: 726.7433
Params: num_leaves=31, max_depth=10, lr=0.05, min_child_samples=20, subsample=0.8 -> Val WMAE: 871.2514
Params: num_leaves=31, max_depth=10, lr=0.05, min_child_samples=20, subsampl

0,1
final_test_WMAE,▁
params/learning_rate,▁▁███▁▁██▁▁██▁▁▁██▁▁██▁██▁▁▁███▁▁██▁▁▁██
params/max_depth,▁▁▁▁▄▄▄▄████▁▁▁▄▄▄▄▄█████▁▁▁▁▁▄▄▄▄▄█████
params/min_child_samples,▁▁██▁█▁█▁▁▁▁▁████▁▁███▁▁█▁▁█▁▁▁██▁▁▁██▁█
params/num_leaves,▁▁▁▁▁▁▁▁▁▁▄▄▄▄▄▄▄▄▄▄▄▄▄▄▄███████████████
params/subsample,▁█▁██▁▁▁█▁▁▁█▁██▁█▁██▁█▁█▁██▁██▁█▁██▁█▁▁
val_WMAE,▇█▄▄▄██▅▅▅▇▃▅▅▅▅▂▂▄▅▅▂▂▂▂▄▂▂▄▄▂▄▅▄▁▂▃▃▃▁

0,1
final_test_WMAE,841.44964
params/learning_rate,0.1
params/max_depth,15.0
params/min_child_samples,50.0
params/num_leaves,70.0
params/subsample,1.0
val_WMAE,627.0611
