In [15]:
# ==== Setup & Imports ====
from pathlib import Path
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose

pd.set_option('display.float_format', lambda x: f'{x:,.2f}')

CSV_PATH = 'data/superstore.csv'  

df = pd.read_csv(CSV_PATH, encoding='ISO-8859-1')
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.91
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.58
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.87
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.58,5,0.45,-383.03
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.37,2,0.2,2.52


## Data Card
Dataset containing Information related to Sales, Profits and other interesting facts of a Superstore giant. Can you tell about the region. Their is no north region available in this column.

## Metadata
```
Row ID => Unique ID for each row.
Order ID => Unique Order ID for each Customer.
Order Date => Order Date of the product.
Ship Date => Shipping Date of the Product.
Ship Mode=> Shipping Mode specified by the Customer.
Customer ID => Unique ID to identify each Customer.
Customer Name => Name of the Customer.
Segment => The segment where the Customer belongs.
Country => Country of residence of the Customer.
City => City of residence of of the Customer.
State => State of residence of the Customer.
Postal Code => Postal Code of every Customer.
Region => Region where the Customer belong.
Product ID => Unique ID of the Product.
Category => Category of the product ordered.
Sub-Category => Sub-Category of the product ordered.
Product Name => Name of the Product
Sales => Sales of the Product.
Quantity => Quantity of the Product.
Discount => Discount provided.
Profit => Profit/Loss incurred.
```

In [16]:
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns.str.strip().str.lower()
        .str.replace(r'[^0-9a-zA-Z]+', '_', regex=True)
        .str.strip('_')
    )
    return df


df = standardize_columns(df)
df.head().T


Unnamed: 0,0,1,2,3,4
row_id,1,2,3,4,5
order_id,CA-2016-152156,CA-2016-152156,CA-2016-138688,US-2015-108966,US-2015-108966
order_date,11/8/2016,11/8/2016,6/12/2016,10/11/2015,10/11/2015
ship_date,11/11/2016,11/11/2016,6/16/2016,10/18/2015,10/18/2015
ship_mode,Second Class,Second Class,Second Class,Standard Class,Standard Class
customer_id,CG-12520,CG-12520,DV-13045,SO-20335,SO-20335
customer_name,Claire Gute,Claire Gute,Darrin Van Huff,Sean O'Donnell,Sean O'Donnell
segment,Consumer,Consumer,Corporate,Consumer,Consumer
country,United States,United States,United States,United States,United States
city,Henderson,Henderson,Los Angeles,Fort Lauderdale,Fort Lauderdale


In [17]:
print(df.isnull().sum())
print(df.duplicated().sum())


row_id           0
order_id         0
order_date       0
ship_date        0
ship_mode        0
customer_id      0
customer_name    0
segment          0
country          0
city             0
state            0
postal_code      0
region           0
product_id       0
category         0
sub_category     0
product_name     0
sales            0
quantity         0
discount         0
profit           0
dtype: int64
0


## 1. How much money did we make in total, then year by year, month by month.
## 2. Are we getting better or worse at our job
## 3. Who is our most profitable employee (what does profitable mean)?
## 4. Who is our lest profitable employee (by how much?)

## 5. What sells the most. 
## 6. What makes us the most money. 
## 7. Are we losting money.  If so, where and how? 


## 0. Distribution of sales quantities, profits, and discounts

In [21]:
## 1. How much money did we make in total, then year by year, month by month.
total_profit = df.profit.sum().round(2)
formatted_currency_format = "${:,.2f}".format(total_profit)
print(formatted_currency_format)



$286,397.02


In [23]:
## 1.2 How much money did we make in total, then year by year, month by month.
# df['order_date']
df.head()

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.91
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.58
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.87
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.58,5,0.45,-383.03
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.37,2,0.2,2.52


In [26]:
df['order_date'] = pd.to_datetime(df['order_date'])
df.head()

Unnamed: 0,row_id,order_id,order_date,ship_date,ship_mode,customer_id,customer_name,segment,country,city,...,postal_code,region,product_id,category,sub_category,product_name,sales,quantity,discount,profit
0,1,CA-2016-152156,2016-11-08,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.91
1,2,CA-2016-152156,2016-11-08,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.58
2,3,CA-2016-138688,2016-06-12,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.87
3,4,US-2015-108966,2015-10-11,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.58,5,0.45,-383.03
4,5,US-2015-108966,2015-10-11,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.37,2,0.2,2.52


In [29]:
df.groupby(df['order_date'].dt.year)['profit'].sum()
# df.groupby(df['order_date'].dt.month)['profit'].sum()

order_date
2014   49,543.97
2015   61,618.60
2016   81,795.17
2017   93,439.27
Name: profit, dtype: float64

In [56]:
profit_over_time = df.groupby(
    [df['order_date'].dt.year,
     df['order_date'].dt.month])['profit'].sum().to_frame()



In [63]:
daily_profit = df.groupby('order_date')['profit'].sum().reset_index()
daily_profit



Unnamed: 0,order_date,profit
0,2014-01-03,5.55
1,2014-01-04,-65.99
2,2014-01-05,4.88
3,2014-01-06,1358.05
4,2014-01-07,-71.96
...,...,...
1232,2017-12-26,61.12
1233,2017-12-27,-31.97
1234,2017-12-28,253.12
1235,2017-12-29,644.43


In [70]:
px.bar(daily_profit, x='order_date', y='profit')

In [76]:
cum_profit = df.groupby('order_date')['profit'].cumsum()
cum_profit

0        41.91
1       261.50
2         6.87
3      -383.03
4      -380.51
         ...  
9989      4.10
9990    414.64
9991    434.03
9992    447.35
9993    292.33
Name: profit, Length: 9994, dtype: float64

In [None]:
cum_profit = daily_profit.profit.cumsum()
px.line(cum_profit)

In [None]:
import plotly.express as px
fig = px.histogram(df, x='profit')
fig

