In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

##### Data Preparation

In [None]:
# Read and process the CSV file
def process_data(file_path):
    """Load the order-volume CSV and one-hot encode its categorical columns.

    The function prints a preview of the raw data, a summary of the
    transformation and per-column encoding statistics as a side effect.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. It must contain the target column
        ``ORDERVOLUME`` and the categorical columns ``CUSTOMER_NAME``,
        ``ORDERTYPE``, ``WAREHOUSE``, ``CITY`` and ``ORDERWEEK``.

    Returns
    -------
    pandas.DataFrame
        One indicator column per (categorical column, category) pair, named
        ``"<column>_<category>"``, followed by the unchanged ``ORDERVOLUME``
        column. The row index is the one produced by ``pd.read_csv``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from the file.
    """
    target_column = 'ORDERVOLUME'
    categorical_columns = ['CUSTOMER_NAME', 'ORDERTYPE', 'WAREHOUSE', 'CITY', 'ORDERWEEK']

    # quotechar='"' keeps quoted values containing commas in a single field.
    # NOTE(review): encoding_errors='ignore' silently drops bytes that are not
    # valid UTF-8, so category labels can be altered without warning -- confirm
    # this is intended for the source export.
    df = pd.read_csv(file_path,
                     encoding='utf-8',
                     quotechar='"',
                     encoding_errors='ignore')

    print("Initial data preview:")
    print(df.head())
    print("\nInitial data info:")
    # DataFrame.info() writes its report to stdout and returns None, so it
    # must not be wrapped in print() (that would emit a stray "None" line).
    df.info()

    # Fail early with one message that lists every missing column instead of
    # a bare KeyError on whichever column happens to be accessed first.
    missing_columns = [name for name in [target_column, *categorical_columns]
                       if name not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{file_path!r} is missing required column(s): {missing_columns}"
        )

    # Separate the target variable from the features to be encoded.
    target = df[target_column]
    categorical_data = df[categorical_columns]

    # sparse_output=False makes the encoder return a dense numpy array, which
    # is what the DataFrame constructor below expects.
    encoder = OneHotEncoder(sparse_output=False,
                            handle_unknown='ignore')
    encoded_data = encoder.fit_transform(categorical_data)

    # encoder.categories_ is ordered like categorical_columns; build the
    # feature names and remember how many categories each column produced.
    feature_names = []
    categories_per_column = {}
    for column, categories in zip(categorical_columns, encoder.categories_):
        feature_names.extend(f"{column}_{str(cat).strip()}" for cat in categories)
        categories_per_column[column] = len(categories)

    # Reuse the original index so the encoded rows align with the target.
    encoded_df = pd.DataFrame(data=encoded_data,
                              columns=feature_names,
                              index=df.index)

    # Encoded features first, target as the last column.
    final_df = pd.concat([encoded_df, target], axis=1)

    print("\nTransformation Summary:")
    print(f"Original shape: {df.shape}")
    print(f"Transformed shape: {final_df.shape}")
    print(f"Number of encoded features: {len(feature_names)}")

    print("\nSample of encoded data (first 5 rows, first 10 columns):")
    print(final_df.iloc[:5, :10])

    # Counts come straight from the fitted encoder rather than from
    # prefix-matching the generated names, which would miscount if one column
    # name were a prefix of another.
    print("\nEncoding statistics:")
    for col, n_categories in categories_per_column.items():
        print(f"{col}: {n_categories} unique categories")

    return final_df

In [18]:
# Build the one-hot encoded table (features + ORDERVOLUME) from the raw export.
encoded_df = process_data(file_path='2024_OrderVolume_AsofNow.csv')


Initial data preview:
   ORDERVOLUME      CUSTOMER_NAME     ORDERTYPE          WAREHOUSE  \
0            1         HOME DEPOT  CollectOrder  DAL BHI DALLAS IO   
1            1         HOME DEPOT  CollectOrder  DAL BHI DALLAS IO   
2            1         HOME DEPOT  CollectOrder  DAL BHI DALLAS IO   
3            3  EDMAR CORPORATION  CollectOrder  ELW BHI ELWOOD IO   
4           19          MARSHALLS  CollectOrder  ELW BHI ELWOOD IO   

         CITY ORDERWEEK  
0      DALLAS   2024 01  
1     HOUSTON   2024 01  
2      TOPEKA   2024 01  
3  COOKEVILLE   2024 01  
4     DECATUR   2024 01  

Initial data info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13966 entries, 0 to 13965
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ORDERVOLUME    13966 non-null  int64 
 1   CUSTOMER_NAME  13966 non-null  object
 2   ORDERTYPE      13966 non-null  object
 3   WAREHOUSE      13966 non-null  object
 4   CITY      

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [20]:
# Preview the first five encoded rows, then persist the whole table;
# index=False keeps the row index out of the written CSV.
print(encoded_df.head(n=5))
encoded_df.to_csv(path_or_buf='encoded_data.csv', index=False)

   CUSTOMER_NAME_35TH AVENUE SEW & VAC  CUSTOMER_NAME_A CLEANER PLACE  \
0                                  0.0                            0.0   
1                                  0.0                            0.0   
2                                  0.0                            0.0   
3                                  0.0                            0.0   
4                                  0.0                            0.0   

   CUSTOMER_NAME_A&L SALES INC  CUSTOMER_NAME_A-1 JANITORIAL SUPPLY INC.  \
0                          0.0                                       0.0   
1                          0.0                                       0.0   
2                          0.0                                       0.0   
3                          0.0                                       0.0   
4                          0.0                                       0.0   

   CUSTOMER_NAME_A-1 VACUUM CLEANER COMPANY  \
0                                       0.0   
1         

##### Model class that will inherit from nn.Module

In [None]:
class Model(nn.Module):
    # Input layer for features for OrderVolume - Customer Name, Order Type, Warehouse, City, Order Week
    