## Preprocessing

In [1]:
import pandas as pd 
import numpy as np 
from datetime import timedelta

In [2]:
# Load the sales table in its wide layout: id columns followed by one d_<n> column per day.
df_train = pd.read_csv('sales_train_evaluation.csv')
df_train.head(n=5)

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0


In [3]:
# Load the calendar and parse 'date' into datetimes so it can serve as a join key later.
df_cal = pd.read_csv('calendar.csv').assign(
    date=lambda cal: pd.to_datetime(cal['date'])
)
df_cal.head(n=5)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,,,,,1,0,1


In [4]:
# Load the sell prices (columns: store_id, item_id, wm_yr_wk, sell_price).
df_sell = pd.read_csv('sell_prices.csv')
df_sell.head(n=5)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


### Unpivot from wide format to long

In [5]:
start_date = pd.to_datetime('2011-01-29')  # calendar date represented by column d_1

# Take the day columns from the frame itself instead of hard-coding d_1..d_1941,
# so the cell keeps working if the sales file covers a different number of days.
id_columns = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
day_columns = [
    col for col in df_train.columns
    if col.startswith('d_') and col[2:].isdigit()
]

# Melt the DataFrame from wide (one column per day) to long (one row per id/day pair)
melted_df = pd.melt(df_train,
                    id_vars=id_columns,
                    value_vars=day_columns,
                    var_name='day',
                    value_name='units_sold')

# Build the label -> date lookup once per distinct day column and map it onto the
# long frame. This replaces a row-wise .apply() that re-parsed the label and built
# a timedelta for every single melted row.
day_to_date = {
    col: start_date + timedelta(days=int(col.split('_')[1]) - 1)
    for col in day_columns
}
melted_df['date'] = melted_df['day'].map(day_to_date)

# The 'day' label is redundant once 'date' exists
melted_df = melted_df.drop(columns=['day'])

### Add calendar information

In [6]:
# Attach the calendar columns to every sales row via a left join on 'date'.
# validate='many_to_one' makes pandas raise MergeError if a date appears more than
# once in df_cal, instead of silently duplicating sales rows.
merged_df = pd.merge(melted_df, df_cal, on='date', how='left', validate='many_to_one')
merged_df.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,units_sold,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0
1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0
2,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0
3,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0
4,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,2011-01-29,11101,Saturday,1,1,2011,,,,,0,0,0


### Add price information

In [7]:
# Left join on 'store_id', 'item_id', and 'wm_yr_wk'; sales rows without a matching
# price row keep NaN in 'sell_price'.
# validate='many_to_one' makes pandas raise MergeError if df_sell holds more than one
# row for the same key, instead of silently duplicating sales rows.
final_df = pd.merge(
    merged_df,
    df_sell,
    on=['store_id', 'item_id', 'wm_yr_wk'],
    how='left',
    validate='many_to_one',
)

In [8]:
# Report column dtypes and the memory footprint of the joined frame before any dtype optimisation
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 20 columns):
 #   Column        Dtype         
---  ------        -----         
 0   item_id       object        
 1   dept_id       object        
 2   cat_id        object        
 3   store_id      object        
 4   state_id      object        
 5   units_sold    int64         
 6   date          datetime64[ns]
 7   wm_yr_wk      int64         
 8   weekday       object        
 9   wday          int64         
 10  month         int64         
 11  year          int64         
 12  event_name_1  object        
 13  event_type_1  object        
 14  event_name_2  object        
 15  event_type_2  object        
 16  snap_CA       int64         
 17  snap_TX       int64         
 18  snap_WI       int64         
 19  sell_price    float64       
dtypes: datetime64[ns](1), float64(1), int64(8), object(10)
memory usage: 8.8+ GB


## Reduce memory usage

In [9]:
# Convert object columns with limited unique values to 'category'
categorical_columns = [
    'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'weekday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]
for col in categorical_columns:
    final_df[col] = final_df[col].astype('category')

# Convert integer columns to the smallest integer type that holds their values
integer_columns = [
    'units_sold', 'wm_yr_wk', 'wday', 'month', 'year',
    'snap_CA', 'snap_TX', 'snap_WI',
]
for col in integer_columns:
    final_df[col] = pd.to_numeric(final_df[col], downcast='integer')

# Convert float columns to float32 if precision allows
final_df['sell_price'] = final_df['sell_price'].astype('float32')

# Check memory usage after optimization.
# info() writes its report directly and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
final_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 20 columns):
 #   Column        Dtype         
---  ------        -----         
 0   item_id       category      
 1   dept_id       category      
 2   cat_id        category      
 3   store_id      category      
 4   state_id      category      
 5   units_sold    int16         
 6   date          datetime64[ns]
 7   wm_yr_wk      int16         
 8   weekday       category      
 9   wday          int8          
 10  month         int8          
 11  year          int16         
 12  event_name_1  category      
 13  event_type_1  category      
 14  event_name_2  category      
 15  event_type_2  category      
 16  snap_CA       int8          
 17  snap_TX       int8          
 18  snap_WI       int8          
 19  sell_price    float32       
dtypes: category(10), datetime64[ns](1), float32(1), int16(3), int8(5)
memory usage: 1.9 GB
None


### Save to parquet

In [11]:
# Persist the memory-optimised frame as a snappy-compressed parquet file written with pyarrow
final_df.to_parquet(
    'sales_train_eval_processed.parquet',
    engine='pyarrow',
    compression='snappy',
)

  if _pandas_api.is_sparse(col):
