In [1]:
from sqlalchemy import create_engine

In [2]:
conn_string = 'postgresql://jeffreykatz@localhost/ecommerce'

conn = create_engine(conn_string)

In [59]:
import pandas as pd

df = pd.read_csv('./ecommerce-dataset.csv')

In [60]:
columns = ['transaction_id', 'customer_id', 'date', 'product', 'gender',
       'device_type', 'country', 'state', 'city', 'category',
       'customer_login_type', 'delivery_type', 'quantity',
       'transaction_start', 'transaction_result', 'total_amount',
       'individual_price', 'year_month', 'time']

In [61]:
df.columns = columns

In [62]:
df[:1]

Unnamed: 0,transaction_id,customer_id,date,product,gender,device_type,country,state,city,category,customer_login_type,delivery_type,quantity,transaction_start,transaction_result,total_amount,individual_price,year_month,time
0,40170,1348959766,14/11/2013,Hair Band,Female,Web,United States,New York,New York City,Accessories,Member,one-day deliver,12,1,0,6910,576,13-Nov,22:35:51


In [67]:
individual_price = pd.to_numeric(df['individual_price'].str.replace(',', ''), errors = 'coerce')

In [63]:
total_amount = pd.to_numeric(df['total_amount'].str.replace(',', ''), errors = 'coerce')

In [56]:
date = pd.to_datetime(updated_df['date'], infer_datetime_format=True)

In [71]:
time = pd.to_datetime(updated_df['time'], infer_datetime_format=True)

In [72]:
updated_df = df.assign(total_amount = numeric_price, date = date, individual_price = individual_price, time = time)

In [74]:
updated_df[:1]

Unnamed: 0,transaction_id,customer_id,date,product,gender,device_type,country,state,city,category,customer_login_type,delivery_type,quantity,transaction_start,transaction_result,total_amount,individual_price,year_month,time
0,40170,1348959766,2013-11-14,Hair Band,Female,Web,United States,New York,New York City,Accessories,Member,one-day deliver,12,1,0,576.0,576.0,13-Nov,2022-11-03 22:35:51


In [80]:
updated_df.to_sql('transactions', conn, if_exists = 'replace')

535

In [81]:
pd.read_sql("select total_amount from transactions limit 3", conn)

Unnamed: 0,total_amount
0,576.0
1,100.0
2,217.0


In [85]:
overall_year_month = pd.read_sql("""select sum(total_amount), EXTRACT(YEAR FROM date) as year, EXTRACT(MONTH FROM date) as month 
from transactions group by year, month order by year, month""", conn)

overall_year_month

Unnamed: 0,sum,year,month
0,989610.0,2013.0,9.0
1,4073720.0,2013.0,10.0
2,5707065.0,2013.0,11.0
3,4725306.0,2013.0,12.0
4,742064.0,2014.0,1.0


Now one limitation of the above data is that we only have data from months September through January 2014.  Normally, we would like to compare year over year sales from the same month or even day.  For example, January 2014 should be compared against January 2013 -- not December.

Another thing to notice is that our numbers for September 2013 and January 2014 are a lot smaller than the other months.  Perhaps we have missing data.  One first check is to see the number of days accounted for in both of these months.

In [98]:
query = """select EXTRACT(month FROM date) as month, count(distinct(EXTRACT(day FROM date))) from transactions group by month"""
overall_year_month = pd.read_sql(query, conn)

In [99]:
overall_year_month

Unnamed: 0,month,count
0,1.0,13
1,9.0,11
2,10.0,31
3,11.0,29
4,12.0,31


So it looks like our months of October through December are essentially complete while September and January has only 1/3 of the days.  Let's see if the days are at least contiguous.  To do so, let's see the min and maximum days in the dataset.

In [100]:
query = """select min(date), max(date) from transactions"""
pd.read_sql(query, conn)

Unnamed: 0,min,max
0,2013-09-20,2014-01-13


Ok so it does look like there are not gaps in our dates for these months.  This means grouping by something like weeks (instead of months) may be more appropriate.

> Skipping first and last week.

In [1]:
query = """select count(distinct(extract(day from date))), EXTRACT(week FROM date) as week where week = 38 group by week"""
# overall_year_month = pd.read_sql(query, conn)

In [110]:
query = """select EXTRACT(week FROM date) as week, EXTRACT(year FROM date) as year, SUM(total_amount) 
from transactions group by week, year order by year, week OFFSET 2 limit 16"""
overall_year_month = pd.read_sql(query, conn)
overall_year_month

Unnamed: 0,week,year,sum
0,39.0,2013.0,595164.0
1,40.0,2013.0,738260.0
2,41.0,2013.0,754522.0
3,42.0,2013.0,982342.0
4,43.0,2013.0,992948.0
5,44.0,2013.0,1089473.0
6,45.0,2013.0,1339593.0
7,46.0,2013.0,1366328.0
8,47.0,2013.0,1248567.0
9,48.0,2013.0,1667409.0


In [None]:
from transactions

In [88]:
# !pip3 install plotly

In [91]:
# import plotly.graph_objects as go
# go.Figure(go.Scatter(y = overall_year_month['sum']))

In [2]:
# overall_year_month['sum'].plot()

### Resources

[Crosstab](https://stackoverflow.com/questions/3002499/postgresql-crosstab-query/11751905#11751905)