# ST590 - Project 2

Authored by Group 13 - Dave Bergeron, Xavier Genelin, and Maksim Nikiforov

## Part 2

In [1]:
# setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import sqlite3

In [2]:
# Open the Northwind SQLite database file and list every table recorded in its schema
con = sqlite3.connect("northwind.sqlite")
table_listing_sql = "SELECT * FROM sqlite_schema WHERE type = 'table'"
pd.read_sql(table_listing_sql, con)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,Categories,Categories,2,CREATE TABLE [Categories]\n( [CategoryID]...
1,table,sqlite_sequence,sqlite_sequence,3,"CREATE TABLE sqlite_sequence(name,seq)"
2,table,CustomerCustomerDemo,CustomerCustomerDemo,28,CREATE TABLE [CustomerCustomerDemo](\n [Cust...
3,table,CustomerDemographics,CustomerDemographics,30,CREATE TABLE [CustomerDemographics](\n [Cust...
4,table,Customers,Customers,32,CREATE TABLE [Customers]\n( [CustomerID] ...
5,table,Employees,Employees,38,CREATE TABLE [Employees]\n( [EmployeeID] ...
6,table,EmployeeTerritories,EmployeeTerritories,68,CREATE TABLE [EmployeeTerritories](\n [Emplo...
7,table,Order Details,Order Details,70,CREATE TABLE [Order Details](\n [OrderID]INT...
8,table,Orders,Orders,92,CREATE TABLE [Orders](\n [OrderID]INTEGER NO...
9,table,Products,Products,128,CREATE TABLE [Products](\n [ProductID]INTEGE...


The _Employees_ table contains information on nine individuals whose employee identifications range from 1 to 9. One employee is a vice president, one employee is a sales manager, and six employees are sales representatives. The six representatives have access to one inside sales coordinator who presumably helps fulfill orders, ensures timely delivery, and assists with other logistics.

In [3]:
# Display the ID, name, and job title stored for each row of the Employees table
employee_columns_sql = 'SELECT EmployeeID, LastName, FirstName, Title FROM Employees'
pd.read_sql(employee_columns_sql, con)

Unnamed: 0,EmployeeID,LastName,FirstName,Title
0,1,Davolio,Nancy,Sales Representative
1,2,Fuller,Andrew,"Vice President, Sales"
2,3,Leverling,Janet,Sales Representative
3,4,Peacock,Margaret,Sales Representative
4,5,Buchanan,Steven,Sales Manager
5,6,Suyama,Michael,Sales Representative
6,7,King,Robert,Sales Representative
7,8,Callahan,Laura,Inside Sales Coordinator
8,9,Dodsworth,Anne,Sales Representative


We can assess the performance of each individual by understanding how much they sell. To do this, we can combine information from the _Orders_ table with additional details from the _Order Details_ table.  

In [40]:
# Merge the "Orders" and "Order Details" tables to reveal the quantities sold by each employee
# as well as the price and discount per unit.
# The left join keeps every row of Orders; an order with no matching detail rows would
# carry NaN in the UnitPrice, Quantity, and Discount columns.
orders_per_employee = pd.merge(
    left = pd.read_sql("SELECT EmployeeID, OrderID, OrderDate FROM Orders", con),
    right = pd.read_sql("SELECT OrderID,  UnitPrice, Quantity, Discount FROM [Order Details]", con),
    how = "left",
    on = "OrderID")

# Create a new column with the sale amount of each merged row (one row per order line),
# inclusive of any discount. The formula treats Discount as a fraction of UnitPrice.
orders_per_employee["TotalSale"] = (
    orders_per_employee["UnitPrice"]
    - orders_per_employee["UnitPrice"] * orders_per_employee["Discount"]
) * orders_per_employee["Quantity"]

# Group by EmployeeID and sum both Quantity and TotalSale for each employee.
# NOTE: sum() takes no column name -- its first positional parameter is the boolean
# `numeric_only` -- so the columns to aggregate are chosen by the selection below.
# Reset the index to allow the EmployeeID column to be merged with other tables.
summed_sales = (
    orders_per_employee[["EmployeeID", "Quantity", "TotalSale"]]
    .groupby("EmployeeID")
    .sum()
    .reset_index()
)
summed_sales

Unnamed: 0,EmployeeID,Quantity,TotalSale
0,1,7812,192107.6045
1,2,6055,166537.755
2,3,7852,202812.843
3,4,9798,232890.846
4,5,3036,68792.2825
5,6,3527,73913.1295
6,7,4654,124568.235
7,8,5913,126862.2775
8,9,2670,77308.0665


By associating each employee ID with the employee's name, we see that Margaret Peacock sold the most product, bringing in $\$232,890.85$ in sales (assuming a US Dollar currency). Although they are also sales representatives, Michael Suyama and Anne Dodsworth each sold roughly one-third as much, suggesting that there is room for improvement.

In [13]:
# Attach each employee's name and title to their summed sales figures.
# Employees is the left side of the left join, so every employee row is kept.
named_employee_sales = (
    pd.read_sql("SELECT EmployeeID,  LastName, FirstName, Title FROM Employees", con)
    .merge(summed_sales, how="left", on="EmployeeID")
)

# Show the best sellers first (descending TotalSale)
named_employee_sales.sort_values("TotalSale", ascending=False)

Unnamed: 0,EmployeeID,LastName,FirstName,Title,Quantity,TotalSale
3,4,Peacock,Margaret,Sales Representative,9798,232890.846
2,3,Leverling,Janet,Sales Representative,7852,202812.843
0,1,Davolio,Nancy,Sales Representative,7812,192107.6045
1,2,Fuller,Andrew,"Vice President, Sales",6055,166537.755
7,8,Callahan,Laura,Inside Sales Coordinator,5913,126862.2775
6,7,King,Robert,Sales Representative,4654,124568.235
8,9,Dodsworth,Anne,Sales Representative,2670,77308.0665
5,6,Suyama,Michael,Sales Representative,3527,73913.1295
4,5,Buchanan,Steven,Sales Manager,3036,68792.2825


We can also assess annual sales for each employee.

In [53]:
# Convert the "OrderDate" column to pandas datetimes so the .dt accessor can be used
orders_per_employee["OrderDate"] = pd.to_datetime(orders_per_employee["OrderDate"])

# Extract the year from the "OrderDate" datetime column and save it in a new column
orders_per_employee["OrderYear"] = orders_per_employee["OrderDate"].dt.year

# Sum TotalSale for every (EmployeeID, OrderYear) pair.
# NOTE: sum() takes no column name -- its first positional parameter is the boolean
# `numeric_only` -- so the column to aggregate is selected explicitly before summing.
orders_per_employee.groupby(["EmployeeID", "OrderYear"])[["TotalSale"]].sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,TotalSale
EmployeeID,OrderYear,Unnamed: 2_level_1
1,1996,35764.515
1,1997,93148.0775
1,1998,63195.012
2,1996,21757.06
2,1997,70444.14
2,1998,74336.555
3,1996,18223.96
3,1997,108026.1555
3,1998,76562.7275
4,1996,49945.115
