# Store Sales Time Series Forecasting

## Business Context 

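Using 4 years of store order data (loaded below from `train.csv`), this notebook prepares the sales records for time series forecasting.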

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import datetime as dt


pd.set_option('display.max_columns', None)

In [11]:
# Load the raw order-level data; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")

In [12]:
# Inspect column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9800 non-null   int64  
 1   Order ID       9800 non-null   object 
 2   Order Date     9800 non-null   object 
 3   Ship Date      9800 non-null   object 
 4   Ship Mode      9800 non-null   object 
 5   Customer ID    9800 non-null   object 
 6   Customer Name  9800 non-null   object 
 7   Segment        9800 non-null   object 
 8   Country        9800 non-null   object 
 9   City           9800 non-null   object 
 10  State          9800 non-null   object 
 11  Postal Code    9789 non-null   float64
 12  Region         9800 non-null   object 
 13  Product ID     9800 non-null   object 
 14  Category       9800 non-null   object 
 15  Sub-Category   9800 non-null   object 
 16  Product Name   9800 non-null   object 
 17  Sales          9800 non-null   float64
dtypes: float

In [13]:
# Handle missing values: fill the missing Postal Code entries with the most
# frequent postal code.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# emits a FutureWarning on pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled (the default in pandas 3).
df['Postal Code'] = df['Postal Code'].fillna(df['Postal Code'].mode()[0])

In [14]:
# Re-check the non-null counts after filling Postal Code.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9800 non-null   int64  
 1   Order ID       9800 non-null   object 
 2   Order Date     9800 non-null   object 
 3   Ship Date      9800 non-null   object 
 4   Ship Mode      9800 non-null   object 
 5   Customer ID    9800 non-null   object 
 6   Customer Name  9800 non-null   object 
 7   Segment        9800 non-null   object 
 8   Country        9800 non-null   object 
 9   City           9800 non-null   object 
 10  State          9800 non-null   object 
 11  Postal Code    9800 non-null   float64
 12  Region         9800 non-null   object 
 13  Product ID     9800 non-null   object 
 14  Category       9800 non-null   object 
 15  Sub-Category   9800 non-null   object 
 16  Product Name   9800 non-null   object 
 17  Sales          9800 non-null   float64
dtypes: float

In [15]:
# Preview the first 8 rows.
df.head(8)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales
0,1,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96
1,2,CA-2017-152156,08/11/2017,11/11/2017,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,Kentucky,42420.0,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94
2,3,CA-2017-138688,12/06/2017,16/06/2017,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,California,90036.0,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62
3,4,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775
4,5,US-2016-108966,11/10/2016,18/10/2016,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,Florida,33311.0,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368
5,6,CA-2015-115812,09/06/2015,14/06/2015,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,FUR-FU-10001487,Furniture,Furnishings,Eldon Expressions Wood and Plastic Desk Access...,48.86
6,7,CA-2015-115812,09/06/2015,14/06/2015,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,OFF-AR-10002833,Office Supplies,Art,Newell 322,7.28
7,8,CA-2015-115812,09/06/2015,14/06/2015,Standard Class,BH-11710,Brosina Hoffman,Consumer,United States,Los Angeles,California,90032.0,West,TEC-PH-10002275,Technology,Phones,Mitel 5320 IP Phone VoIP phone,907.152


In [16]:
def clean_data(data):
    """Parse ``Order Date``, derive calendar features and drop unused columns.

    The frame passed in is modified in place and also returned, so a bare
    ``clean_data(df)`` call updates ``df``. Because columns are dropped in
    place, calling this twice on the same frame raises ``KeyError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Order Date`` as ``dd/mm/YYYY`` strings and every
        column named in the drop list below.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with ``Order Date`` converted to
        datetime, the new columns ``Day of Week``, ``Day``, ``Year``,
        ``Month`` and ``Season`` appended, and the identifier, shipping,
        customer, location and product columns removed.

    Raises
    ------
    KeyError
        If ``Order Date`` or any column in the drop list is missing.
    ValueError
        If an ``Order Date`` value does not match ``%d/%m/%Y``.
    """
    # Month number -> season name (meteorological seasons, northern hemisphere).
    season_by_month = {
        12: "Winter", 1: "Winter", 2: "Winter",
        3: "Spring", 4: "Spring", 5: "Spring",
        6: "Summer", 7: "Summer", 8: "Summer",
        9: "Autumn", 10: "Autumn", 11: "Autumn",
    }

    # Convert Order Date (day-first strings) to datetime
    data['Order Date'] = pd.to_datetime(data['Order Date'], format="%d/%m/%Y")
    order_date = data['Order Date'].dt

    # Create Day of Week (Monday=0 ... Sunday=6)
    data['Day of Week'] = order_date.dayofweek

    # Create Day
    data['Day'] = order_date.day

    # Create Year
    data['Year'] = order_date.year

    # Create Month
    data['Month'] = order_date.month

    # Create Season with a vectorized lookup instead of a row-wise apply;
    # a missing date yields a missing Season rather than a made-up label.
    data['Season'] = data['Month'].map(season_by_month)

    # Drop columns that are not used downstream. Postal Code is dropped as
    # well, so it is not cast to int first.
    data.drop(columns=['Row ID', 'Order ID', 'Ship Date', 'Ship Mode',
                       'Customer ID', 'Customer Name', 'Country', 'City',
                       'State', 'Postal Code', 'Product ID', 'Product Name'],
              inplace=True)

    return data

# Apply the cleaning step to df; the returned frame is displayed as the cell output.
clean_data(df)

Unnamed: 0,Order Date,Segment,Region,Category,Sub-Category,Sales,Day of Week,Day,Year,Month,Season
0,2017-11-08,Consumer,South,Furniture,Bookcases,261.9600,2,8,2017,11,Autumn
1,2017-11-08,Consumer,South,Furniture,Chairs,731.9400,2,8,2017,11,Autumn
2,2017-06-12,Corporate,West,Office Supplies,Labels,14.6200,0,12,2017,6,Summer
3,2016-10-11,Consumer,South,Furniture,Tables,957.5775,1,11,2016,10,Autumn
4,2016-10-11,Consumer,South,Office Supplies,Storage,22.3680,1,11,2016,10,Autumn
...,...,...,...,...,...,...,...,...,...,...,...
9795,2017-05-21,Corporate,Central,Office Supplies,Binders,3.7980,6,21,2017,5,Spring
9796,2016-01-12,Corporate,East,Office Supplies,Art,10.3680,1,12,2016,1,Winter
9797,2016-01-12,Corporate,East,Technology,Phones,235.1880,1,12,2016,1,Winter
9798,2016-01-12,Corporate,East,Technology,Phones,26.3760,1,12,2016,1,Winter


In [49]:
# Inspect the frame after clean_data.
# NOTE(review): the saved output below still lists Row ID and Order ID, which
# clean_data drops, and the execution count jumps from 16 to 49 — it appears
# to come from an earlier run; refresh with Restart Kernel -> Run All.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9800 entries, 0 to 9799
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Row ID        9800 non-null   int64         
 1   Order ID      9800 non-null   object        
 2   Order Date    9800 non-null   datetime64[ns]
 3   Segment       9800 non-null   object        
 4   Region        9800 non-null   object        
 5   Category      9800 non-null   object        
 6   Sub-Category  9800 non-null   object        
 7   Sales         9800 non-null   float64       
 8   Day of Week   9800 non-null   int32         
 9   Day           9800 non-null   int32         
 10  Year          9800 non-null   int32         
 11  Month         9800 non-null   int32         
 12  Season        9800 non-null   object        
dtypes: datetime64[ns](1), float64(1), int32(4), int64(1), object(6)
memory usage: 842.3+ KB


In [None]:
df.