## SQL queries example

This notebook demonstrates SQL skills: it loads order and customer data from Excel into a SQLite database and runs a cohort-retention query against it.

1. Get data

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Thu Jun  2 21:34:41 2022

@author: Yury
"""

import pandas as pd
# Folder that holds the two Excel exports; change this to where your copy of the data lives.
path=r'C:\a_job\2022\smalls (cat food)\task\data'

# Raw f-strings (rf"...") keep the Windows path separator a literal backslash.
# In a plain f-string "\A" is an invalid escape sequence, which Python reports
# as a DeprecationWarning (SyntaxWarning from 3.12) and will eventually reject.
df_o=pd.read_excel(rf"{path}\Analyst Exercise - FACT_ORDERS.xlsx")
df_c=pd.read_excel(rf"{path}\Analyst Exercise - DIM_CUSTOMERS.xlsx")

2. Create database tables

In [5]:
import sqlite3
# Open (or create) the SQLite database file used by the rest of the notebook.
connection_obj = sqlite3.connect('smalls_database_jp.db')

# Cursor used for the DDL below and for the queries in the later cells.
cursor_obj = connection_obj.cursor()

# Start from empty tables on every run. The database file persists between
# kernel restarts and the load step appends rows, so tables left over from a
# previous run would make the insert fail on the PRIMARY KEY constraints.
# NOTE: re-running this cell alone empties both tables; re-run the insert cell afterwards.
cursor_obj.execute('DROP TABLE IF EXISTS FACT_ORDERS')
cursor_obj.execute('DROP TABLE IF EXISTS DIM_CUSTOMERS')

# Orders table, keyed by ORDER_ID. ORDER_DATE is intentionally left without a
# declared type, as in the original schema.
cursor_obj.execute('''
          CREATE TABLE IF NOT EXISTS FACT_ORDERS
          ([ORDER_ID] INTEGER PRIMARY KEY, 
          [CUSTOMER_ID] INTEGER, 
          [ORDER_DATE] , 
          [ORDER_SEQUENCE] INTEGER, 
          [ORDER_TYPE] TEXT,
          [REVENUE] REAL, 
          [MARGIN] REAL
          )
          ''')

# Customers table, keyed by CUSTOMER_ID. TRIAL_WEEK and CANCELLATION_DATE are
# likewise declared without a type.
cursor_obj.execute('''
          CREATE TABLE IF NOT EXISTS DIM_CUSTOMERS
          ([CUSTOMER_ID] INTEGER PRIMARY KEY,
          [TRIAL_WEEK], 
          [TRIAL_PLAN] TEXT,
          [GA_SOURCE] TEXT,
          [GA_MEDIUM] TEXT,
          [CANCELLATION_FLAG] TEXT,
          [CANCELLATION_DATE], 
          [CANCELLATION_REASON] TEXT,
          [CAT_COUNT] INTEGER
          )
          ''')

connection_obj.commit()

3. DIM_CUSTOMERS contains a CUSTOMER_ID that is not unique -> remove the duplicated customers (and their orders)

In [3]:
# Count the DIM_CUSTOMERS rows recorded for each CUSTOMER_ID and keep the ids
# that occur more than once.
rows_per_customer = df_c.groupby('CUSTOMER_ID').size()
duplicate_id = rows_per_customer[rows_per_customer > 1].index.tolist()
print("Duplicates in Customer", duplicate_id)

# Show the conflicting customer rows before they are discarded.
is_duplicated_customer = df_c['CUSTOMER_ID'].isin(duplicate_id)
print(df_c.loc[is_duplicated_customer])

# Drop every row of the affected customers, together with all of their orders.
df_c = df_c.loc[~is_duplicated_customer].copy()
df_o = df_o.loc[~df_o['CUSTOMER_ID'].isin(duplicate_id)].copy()

Duplicates in Customer [5359139848291]
         CUSTOMER_ID TRIAL_WEEK TRIAL_PLAN GA_SOURCE GA_MEDIUM  \
35982  5359139848291 2021-08-02      FRESH       NaN       NaN   
36089  5359139848291 2021-08-02      FRESH       NaN       NaN   

      CANCELLATION_FLAG   CANCELLATION_DATE CANCELLATION_REASON  CAT_COUNT  
35982          INACTIVE 2021-08-31 12:53:17               Other        1.0  
36089          INACTIVE 2021-08-05 09:25:37               Other        1.0  


4. Insert data

In [6]:
# Columns to load, listed in the same order as in the CREATE TABLE statements.
customer_columns = [
    'CUSTOMER_ID', 'TRIAL_WEEK', 'TRIAL_PLAN', 'GA_SOURCE', 'GA_MEDIUM',
    'CANCELLATION_FLAG', 'CANCELLATION_DATE', 'CANCELLATION_REASON',
    'CAT_COUNT',
]
order_columns = [
    'ORDER_ID', 'CUSTOMER_ID', 'ORDER_DATE', 'ORDER_SEQUENCE', 'ORDER_TYPE',
    'REVENUE', 'MARGIN',
]

# Append into the existing tables; the DataFrame index is not written.
df_c[customer_columns].to_sql(
    name='DIM_CUSTOMERS', con=connection_obj, if_exists='append', index=False
)
df_o[order_columns].to_sql(
    name='FACT_ORDERS', con=connection_obj, if_exists='append', index=False
)

connection_obj.commit()

5. Check data

In [7]:
# Preview the first rows of DIM_CUSTOMERS to confirm the load worked.
cursor_obj.execute('''select * from DIM_CUSTOMERS limit 10''')
# cursor.description has one entry per result column; item 0 is the column name.
# Without it the frame would get integer column labels (0, 1, 2, ...).
df = pd.DataFrame(cursor_obj.fetchall(),
                  columns=[column[0] for column in cursor_obj.description])
# Last expression of the cell, so the preview is actually displayed.
df

In [8]:
# Preview the first rows of FACT_ORDERS to confirm the load worked.
cursor_obj.execute('''select * from FACT_ORDERS limit 10''')
# cursor.description has one entry per result column; item 0 is the column name.
# Without it the frame would get integer column labels (0, 1, 2, ...).
df = pd.DataFrame(cursor_obj.fetchall(),
                  columns=[column[0] for column in cursor_obj.description])
# Last expression of the cell, so the preview is actually displayed.
df

6. Example of Query:
    
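    **Retention by order sequence for customers acquired in the same month (TRIAL_WEEK aggregated to month): for each acquisition month, the number of subscription orders at each ORDER_SEQUENCE, and that number as a percentage of the orders at the previous ORDER_SEQUENCE**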
    
    
    

In [9]:
# Retention by acquisition month and order sequence.
#   one   - subscription orders tagged with the customer's trial month
#   two   - number of orders per (trial month, order sequence)
#   three - the same counts joined to the count of the previous order sequence
# retention = 100 * num_orders / prev_num_orders. A row whose previous sequence
# number has no orders in that month gets NULL for prev_num_orders and retention
# (LEFT JOIN finds no match).
cursor_obj.execute('''
        WITH one AS (
            -- Only the columns used downstream are selected; SELECT * over the
            -- join would carry CUSTOMER_ID twice.
            -- MySQL equivalent of the month truncation: DATE_FORMAT(TRIAL_WEEK, '%Y-%m-01')
            SELECT o.ORDER_ID,
                   o.ORDER_SEQUENCE,
                   strftime('%Y-%m-01', c.TRIAL_WEEK) AS TRIAL_MONTH
            FROM FACT_ORDERS AS o
            INNER JOIN DIM_CUSTOMERS AS c ON c.CUSTOMER_ID = o.CUSTOMER_ID
            WHERE o.ORDER_TYPE = 'SUBSCRIPTION'
            ),
            two AS (
            SELECT TRIAL_MONTH, ORDER_SEQUENCE, count(ORDER_ID) AS num_orders
            FROM one
            GROUP BY 1, 2
            ),
            three AS (
            SELECT t.*, tt.num_orders AS prev_num_orders
            FROM two t
            LEFT JOIN two tt ON (t.TRIAL_MONTH = tt.TRIAL_MONTH AND t.ORDER_SEQUENCE = tt.ORDER_SEQUENCE + 1)
            )

        SELECT *, 100. * num_orders / prev_num_orders AS retention
        FROM three
        -- Sort by month AND sequence: with ORDER BY 1 alone the order of rows
        -- inside a month is not guaranteed.
        ORDER BY 1, 2

''')

# Column names follow the order of the final SELECT (three.* then retention).
df = pd.DataFrame(cursor_obj.fetchall()
                  , columns=['TRIAL_MONTH','ORDER_SEQUENCE', 'num_orders','prev_num_orders', 'retention'])

df

Unnamed: 0,TRIAL_MONTH,ORDER_SEQUENCE,num_orders,prev_num_orders,retention
0,2020-01-01,2,652,,
1,2020-01-01,3,475,652.0,72.852761
2,2020-01-01,4,400,475.0,84.210526
3,2020-01-01,5,350,400.0,87.500000
4,2020-01-01,6,298,350.0,85.142857
...,...,...,...,...,...
697,2022-03-01,5,3,23.0,13.043478
698,2022-04-01,2,513,,
699,2022-04-01,3,20,513.0,3.898635
700,2022-04-01,4,2,20.0,10.000000


7. Close connection

In [10]:
# Close the SQLite connection opened with sqlite3.connect(...) earlier in the
# notebook, now that all queries have run. Re-run the connection cell before
# issuing any further queries.
connection_obj.close()