<a href="https://colab.research.google.com/github/taggartpatrick/basic_analytics/blob/main/Bike_Order_Lines_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Load libraries


In [36]:
import pandas as pd
import numpy as np
import altair as alt

### 2. Import data


In [4]:
# Location of the bike order-lines CSV read by this notebook
BIKE_ORDERLINES_URL = "https://raw.githubusercontent.com/taggartpatrick/basic_analytics/main/bike_orderlines.csv"

# Load the order lines into a DataFrame (one row per order line)
bike_sales = pd.read_csv(BIKE_ORDERLINES_URL)

In [5]:
# Preview the first five order lines
bike_sales.head(n=5)

Unnamed: 0,order_date,order_id,order_line,quantity,price,total_price,model,category_1,category_2,frame_material,bikeshop_name,city,state
0,2011-01-07T00:00:00Z,1.0,1,1,6070.0,6070.0,Jekyll Carbon 2,Mountain,Over Mountain,Carbon,Ithaca Mountain Climbers,Ithaca,NY
1,2011-01-07T00:00:00Z,1.0,2,1,5970.0,5970.0,Trigger Carbon 2,Mountain,Over Mountain,Carbon,Ithaca Mountain Climbers,Ithaca,NY
2,2011-01-10T00:00:00Z,2.0,1,1,2770.0,2770.0,Beast of the East 1,Mountain,Trail,Aluminum,Kansas City 29ers,Kansas City,KS
3,2011-01-10T00:00:00Z,2.0,2,1,5970.0,5970.0,Trigger Carbon 2,Mountain,Over Mountain,Carbon,Kansas City 29ers,Kansas City,KS
4,2011-01-10T00:00:00Z,3.0,1,1,10660.0,10660.0,Supersix Evo Hi-Mod Team,Road,Elite Road,Carbon,Louisville Race Equipment,Louisville,KY


### 3. Tidy Data

#### Tidy Bike Data

In [12]:
# Inspect column names, dtypes and non-null counts of the loaded frame.
# NOTE(review): the saved output of this cell already lists Order_Month,
# Order_Day, Order_YM and Order_Weekday, which are only created in a later
# cell -- the cell was last run out of order; re-run from a fresh kernel.
bike_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15644 entries, 0 to 15643
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype    
---  ------          --------------  -----    
 0   order_date      15644 non-null  object   
 1   order_id        15644 non-null  float64  
 2   order_line      15644 non-null  int64    
 3   quantity        15644 non-null  int64    
 4   price           15644 non-null  float64  
 5   total_price     15644 non-null  float64  
 6   model           15644 non-null  object   
 7   category_1      15644 non-null  object   
 8   category_2      15644 non-null  object   
 9   frame_material  15644 non-null  object   
 10  bikeshop_name   15644 non-null  object   
 11  city            15644 non-null  object   
 12  state           15644 non-null  object   
 13  Order_Month     15644 non-null  int64    
 14  Order_Day       15644 non-null  int64    
 15  Order_YM        15644 non-null  period[M]
 16  Order_Weekday   15644 non-null  object  

In [13]:
# Parse the order timestamps, then keep only the calendar-date part
# (the column ends up holding Python date objects, i.e. dtype "object")
order_timestamps = pd.to_datetime(bike_sales['order_date'])
bike_sales['order_date'] = order_timestamps.dt.date

In [73]:
# Add fields for month, day, year, year-month period and weekday name.
# order_date is parsed once and reused for every derived column; the
# original re-parsed it on each line and assigned Order_YM twice.
order_dt = pd.to_datetime(bike_sales['order_date'])

bike_sales['Order_Month'] = order_dt.dt.month
bike_sales['Order_Day'] = order_dt.dt.day
bike_sales['Order_Year'] = order_dt.dt.year
bike_sales['Order_YM'] = order_dt.dt.to_period('M')    # monthly period, e.g. 2011-01
bike_sales['Order_Weekday'] = order_dt.dt.day_name()   # e.g. "Friday"

In [74]:
# Re-inspect the frame to confirm the derived Order_* columns are present
bike_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15644 entries, 0 to 15643
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype    
---  ------             --------------  -----    
 0   order_date         15644 non-null  object   
 1   order_id           15644 non-null  float64  
 2   order_line         15644 non-null  int64    
 3   quantity           15644 non-null  int64    
 4   price              15644 non-null  float64  
 5   total_price        15644 non-null  float64  
 6   model              15644 non-null  object   
 7   category_1         15644 non-null  object   
 8   category_2         15644 non-null  object   
 9   frame_material     15644 non-null  object   
 10  bikeshop_name      15644 non-null  object   
 11  city               15644 non-null  object   
 12  state              15644 non-null  object   
 13  Order_Month        15644 non-null  int64    
 14  Order_Day          15644 non-null  int64    
 15  Order_YM           15644 non-null  p

### 4. Transform Data

#### Group Sales by Shop

In [75]:
# Check the available columns before aggregating.
# NOTE(review): this repeats the info() call that ends the tidy section;
# nothing modifies bike_sales in between, so it could be dropped.
bike_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15644 entries, 0 to 15643
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype    
---  ------             --------------  -----    
 0   order_date         15644 non-null  object   
 1   order_id           15644 non-null  float64  
 2   order_line         15644 non-null  int64    
 3   quantity           15644 non-null  int64    
 4   price              15644 non-null  float64  
 5   total_price        15644 non-null  float64  
 6   model              15644 non-null  object   
 7   category_1         15644 non-null  object   
 8   category_2         15644 non-null  object   
 9   frame_material     15644 non-null  object   
 10  bikeshop_name      15644 non-null  object   
 11  city               15644 non-null  object   
 12  state              15644 non-null  object   
 13  Order_Month        15644 non-null  int64    
 14  Order_Day          15644 non-null  int64    
 15  Order_YM           15644 non-null  p

In [35]:
# Get total sales per bikeshop, sorted descending.
# Built as a single method chain (no inplace=True) so the cell yields the
# same frame however many times it is re-run and never leaves a
# half-transformed shop_sales behind.
shop_sales = (
    bike_sales
    .groupby('bikeshop_name')['total_price']
    .sum()
    .to_frame(name="Total Sales")
    .reset_index()
    .sort_values(by=['Total Sales'], ascending=False)
)

# Show the five highest-grossing shops
shop_sales.head()

Unnamed: 0,bikeshop_name,Total Sales
10,Kansas City 29ers,11535455.0
6,Denver Bike Shop,7697670.0
9,Ithaca Mountain Climbers,6299335.0
21,Phoenix Bi-peds,4168535.0
19,Oklahoma City Race Equipment,3450040.0


In [None]:
# Horizontal bar chart of total sales per bikeshop
source = shop_sales

# Bars: sales on the x axis, one row per shop; sort='-x' orders the shops
# by descending x value
bars = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        x=alt.X("Total Sales:Q"),
        y=alt.Y("bikeshop_name:O", sort='-x'),
    )
)

# Value labels: reuse the bar encodings and draw the total as text,
# left-aligned and nudged right (dx=3) so it doesn't sit on top of the bar
text = bars.mark_text(align='left', baseline='middle', dx=3).encode(
    text=alt.Text('Total Sales:Q')
)

# Layer the labels over the bars and make the chart tall enough for all shops
alt.layer(bars, text).properties(height=900)

In [76]:
# Total sales per frame material per order year, as a flat table with
# columns frame_material, Order_Year and "Total Sales"
material_year_groups = bike_sales.groupby(['frame_material', 'Order_Year'])
sales_mat_tbl = (
    material_year_groups['total_price']
    .sum()
    .to_frame(name="Total Sales")
    .reset_index()
)

In [84]:
# Produce a faceted line plot of yearly sales, one panel per frame material
source = sales_mat_tbl

line = alt.Chart(source).mark_line().encode(
    # Order_Year is a numeric year; without a type shorthand altair treats it
    # as quantitative and draws a continuous axis with labels like "2,011".
    # Marking it ordinal (:O) gives one discrete tick per year.
    x = 'Order_Year:O',
    y = 'Total Sales',
    color = 'frame_material'
).properties(
    width = 300,
    height = 300
).facet(column = 'frame_material')

line