## Importing Required Modules 

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from pandas import DataFrame
from functools import reduce
from enum import Enum

warnings.filterwarnings("ignore")

In [2]:
# Show the absolute path of the current working directory; the relative
# "Sales Data/..." paths used below are resolved against it.
os.path.abspath(os.getcwd())

'/Users/alexandermendelsohn/Documents/Sales Analysis'

### Merging 12 Months of Sales Data into a Single CSV

In [None]:
# Stack every monthly CSV found in "Sales Data" into a single frame.
# Only *.csv entries are read so stray directory entries (e.g. .DS_Store)
# are skipped, and the names are sorted so the row order is reproducible
# (os.listdir returns entries in arbitrary order).
file = sorted(i for i in os.listdir("Sales Data") if i.endswith(".csv"))
frames = []
for j in file:
    print("Sales Data/"+j)
    frames.append(pd.read_csv("Sales Data/"+j))

# One concat at the end avoids re-copying the accumulated rows on every
# iteration; fall back to an empty frame when no CSV was found.
df = pd.concat(frames) if frames else pd.DataFrame()

df.head()

In [None]:
# (rows, columns) of the merged frame.
df.shape

### Saving the Data into new csv file

In [None]:
# Persist the merged frame; index=False keeps the row index out of the file.
df.to_csv("Updated_sales.csv",index=False)

# Data Preprocessing

Inspect the raw data, then define a function that reads it in chunks and cleans each chunk.

In [None]:
# (rows, columns) of the merged frame before any cleaning.
df.shape

In [None]:
# Shape once exact duplicate rows are removed; drop_duplicates returns a new
# frame here, so df itself is left untouched.
df.drop_duplicates().shape

In [None]:
# Column labels of the merged frame as a plain Python list.
all_features = list(df.columns)
all_features

In [None]:
# Missing-value count per column, largest first.
missing_per_column = df[all_features].isnull().sum(axis=0)
missing_per_column.sort_values(ascending=False)

In [None]:
# Show every row that has at least one missing value. The "Unnamed: 0"
# column is dropped only when it exists: errors="ignore" prevents the
# KeyError a plain drop raises if the merged frame has no such column.
df_tmp = df.drop(columns=['Unnamed: 0'], errors="ignore")
df_tmp[df_tmp.isnull().any(axis=1)]

In [None]:
# Data type of each column in the merged frame.
df.dtypes

In [None]:
# Boolean array with one entry per column: True where the column is float64.
float64_dtype = np.dtype('float64')
float_vars = df.dtypes.to_numpy() == float64_dtype

In [None]:
# Path of the merged CSV that read_raw_data is called with below.
FILE_PATH = "Updated_sales.csv"
# Rows-per-chunk setting; same value as read_raw_data's default chunk_size.
CHUNK_SIZE = 1000

def read_raw_data(file_path: str, chunk_size: int=1000) -> DataFrame:
    """Read a sales CSV in chunks and return the cleaned rows as one frame.

    Every chunk has the ``Unnamed: 0`` column removed (when the file has
    one), exact duplicate rows dropped, repeated header rows filtered out
    (rows whose ``Order ID`` equals the text ``"Order ID"``) and rows with
    any missing value discarded. Duplicates are also removed across chunk
    boundaries after the chunks are combined.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Number of rows parsed per chunk.

    Returns:
        The cleaned rows of all chunks in file order, or an empty
        DataFrame when the reader produced no chunk.
    """
    processed_chunks = []

    # The context manager closes the underlying file handle even if a chunk
    # fails to process.
    with pd.read_csv(file_path, chunksize=chunk_size) as csv_reader:
        for chunk in csv_reader:
            # errors="ignore": a file written with index=False has no
            # "Unnamed: 0" column, and a plain drop would raise KeyError.
            chunk = chunk.drop(columns=['Unnamed: 0'], errors="ignore")
            chunk = chunk.drop_duplicates()
            chunk = chunk.loc[chunk["Order ID"] != "Order ID"].dropna()
            processed_chunks.append(chunk)

    # pd.concat raises ValueError when given an empty list.
    if not processed_chunks:
        return pd.DataFrame()

    # Per-chunk de-duplication cannot see duplicates that fall into different
    # chunks, so de-duplicate once more on the combined frame.
    return pd.concat(processed_chunks, axis=0).drop_duplicates()

# Pass CHUNK_SIZE explicitly so the constant defined above actually controls
# the read instead of the function's default being used silently.
df_prepro = read_raw_data(file_path=FILE_PATH, chunk_size=CHUNK_SIZE)

In [None]:
# (rows, columns) of the frame returned by read_raw_data.
df_prepro.shape

In [None]:
# Preview the first rows of the frame returned by read_raw_data.
df_prepro.head()

### New features

In [None]:
def split_purchase_address(df_to_process: DataFrame) -> DataFrame:
    """Append address-part columns parsed from ``Purchase Address``.

    The address is split on its last two commas into street, city and a
    combined state/postal-code part, so a comma inside the street part
    does not produce an extra column. The combined part is then split on
    its first space into state code and postal code. All parts are
    stripped of surrounding whitespace.

    Args:
        df_to_process: Frame with a string ``Purchase Address`` column.

    Returns:
        A new frame holding the input columns followed by ``Street Name``,
        ``City``, ``State and Postal Code``, ``State Code`` and
        ``Postal Code``. The input frame is not modified.
    """
    # n=2 caps the result at three columns, matching the three names below;
    # splitting from the right keeps any extra commas inside the street part.
    df_address_split = (
        df_to_process["Purchase Address"]
        .str.rsplit(",", n=2, expand=True)
        .apply(lambda part: part.str.strip())
    )
    df_address_split.columns = ["Street Name", "City", "State and Postal Code"]

    # n=1 caps the result at two columns: state code, then everything else.
    df_state_postal_split = (
        df_address_split["State and Postal Code"]
        .str.split(" ", n=1, expand=True)
    )
    df_state_postal_split.columns = ["State Code", "Postal Code"]

    return pd.concat([df_to_process, df_address_split, df_state_postal_split], axis=1)

def split_order_date(df_to_process: DataFrame) -> DataFrame:
    """Add month, day, hour and year columns derived from ``Order Date``.

    Args:
        df_to_process: Frame with an ``Order Date`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        The same frame object, with ``Order Month``, ``Order Day``,
        ``Order Hour`` and ``Order Year`` added in place.
    """
    # Parse the column once and reuse the result for all four components.
    order_timestamps = pd.to_datetime(df_to_process['Order Date'])

    df_to_process['Order Month'] = order_timestamps.dt.month
    df_to_process['Order Day'] = order_timestamps.dt.day
    df_to_process['Order Hour'] = order_timestamps.dt.hour
    df_to_process['Order Year'] = order_timestamps.dt.year

    return df_to_process

def convert_numerical_column_types(df_to_process: DataFrame) -> DataFrame:
    """Cast the quantity, price and order-id columns to numeric types.

    ``Quantity Ordered`` and ``Order ID`` become ``int`` and ``Price Each``
    becomes ``float``. The columns are replaced on the frame that was
    passed in, and that same frame is returned.
    """
    target_types = {
        "Quantity Ordered": int,
        "Price Each": float,
        "Order ID": int,
    }
    for column_name, target_type in target_types.items():
        df_to_process[column_name] = df_to_process[column_name].astype(target_type)

    return df_to_process


def calculate_total_order_cost(df_to_process: DataFrame) -> DataFrame:
    """Add ``Total Cost`` as ``Quantity Ordered`` times ``Price Each``.

    The new column is written onto the frame that was passed in, and that
    same frame is returned.
    """
    quantity = df_to_process["Quantity Ordered"]
    unit_price = df_to_process["Price Each"]
    df_to_process["Total Cost"] = quantity * unit_price
    return df_to_process

In [None]:
# Run the feature pipeline on the cleaned frame (df_prepro), not the raw
# merged df: the raw frame has not had read_raw_data's header-row and
# missing-value filtering applied, which the date parsing and integer casts
# in the steps below depend on.
processed_df = (
    df_prepro
    .pipe(split_purchase_address)
    .pipe(split_order_date)
    .pipe(convert_numerical_column_types)
    .pipe(calculate_total_order_cost)
)

In [None]:
# Column names of the frame produced by the pipeline above.
processed_df.columns.tolist()

In [None]:
# Preview the first rows of the frame produced by the pipeline above.
processed_df.head()

### Grouping features sets as ENUM

An enum, short for enumeration, is a “set of symbolic names (members) bound to unique values”. Enums have a few key benefits:

- Defining enums lets you have related constants organized in one (or many) classes that can act as a source of truth for dimensions, measures, and other constants you need to call in your pipelines;
- Using enums will allow you to avoid passing invalid values in your data pipelines, assuming you correctly define and maintain the enum class;
- Enums allow users to work with a standardized set of data points and constants, which is helpful when multiple people are aggregating or creating models based on one main source of data (to help avoid having multiple definitions or aliases for the same column in the raw data source).

In [None]:
class CategoricalColumns(Enum):
    """Column names treated as categorical dimensions.

    Each member's value is the exact column label to use when selecting
    or grouping. Member names are valid identifiers, and the order
    month/day members map to their own columns.
    """
    PRODUCT = "Product"
    QUANTITY_ORDERED = "Quantity Ordered"
    CITY = "City"
    STATE_CODE = "State Code"
    POSTAL_CODE = "Postal Code"
    # NOTE(review): none of the feature functions in this notebook create a
    # "House Number" column - confirm it exists before grouping on it.
    HOUSE_NUMBER = "House Number"
    ORDER_MONTH = "Order Month"
    ORDER_DAY = "Order Day"
    ORDER_HOUR = "Order Hour"
    ORDER_YEAR = "Order Year"
    
class AddressColumns(Enum):
    """Labels of the four columns that split_purchase_address derives."""
    STREET_NAME = "Street Name"
    CITY = "City"
    STATE_CODE = "State Code"
    POSTAL_CODE = "Postal Code"
    
class NumericalColumns(Enum):
    """Column names of the numeric measures used in aggregations.

    Each member's value is the exact column label.
    """
    TOTAL_COST = "Total Cost"
    QUANTITY_ORDERED = "Quantity Ordered"
    PRICE_EACH = "Price Each"

In [None]:
# Look up a column label through the enum. CategoricalColumns is the enum
# in this notebook that defines PRODUCT; SalesGroupByColumns is not defined.
CategoricalColumns.PRODUCT.value

In [None]:
# All categorical column labels, in declaration order.
[column.value for column in CategoricalColumns]

In [None]:
# Count order rows for every combination of the categorical columns.
# Only labels that actually exist in processed_df are used as keys, because
# groupby raises KeyError for a column name the frame does not have.
groupby_columns = [
    column.value
    for column in CategoricalColumns
    if column.value in processed_df.columns
]

grouped_df = (
    processed_df
    .groupby(groupby_columns)
    ["Order ID"]
    .count()
    .reset_index()
    .sort_values("Order ID", ascending=False)
    .rename({"Order ID": "Count of Order IDs"}, axis=1)
)

grouped_df.head()

In [None]:
# Separately, aggregate cost and quantity per address, highest total cost first.
groupby_columns = [column.value for column in AddressColumns]

grouped_df = (
    processed_df
    .groupby(groupby_columns)
    .agg(
        # The measure labels come from NumericalColumns, the enum that defines
        # TOTAL_COST and QUANTITY_ORDERED. The string "sum" is used instead of
        # np.sum, which pandas deprecates as an agg argument.
        Total_Cost=(NumericalColumns.TOTAL_COST.value, "sum"),
        Total_Quantity_Ordered=(NumericalColumns.QUANTITY_ORDERED.value, "sum")
    )
    .reset_index()
    .sort_values("Total_Cost", ascending=False)
)

### Filtering

In [None]:
# One boolean mask per condition, each aligned with grouped_df's rows.
street_has_north = grouped_df["Street Name"].str.contains("North")
postal_has_940 = grouped_df["Postal Code"].str.contains("940")
cost_below_1000 = grouped_df["Total_Cost"] < 1000

filter_conditions = [street_has_north, postal_has_940, cost_below_1000]

The `reduce` function from `functools` takes a function and an iterable as arguments and applies the function to the elements of the iterable cumulatively: it combines the first two elements, then combines that result with the third element, and so on, until a single value remains. Here it combines all the filter conditions into one boolean mask with `&`.

Defining column names and filters in a centralized place means that everyone can refer back to a single source of truth and avoid having different names and logic to refer to ultimately the same thing.

In [None]:
# Fold the masks pairwise with "&" so that a row is kept only when every
# condition in filter_conditions holds for it.
rows_matching_all = reduce(lambda left, right: left & right, filter_conditions)
grouped_df.loc[rows_matching_all]