# Data Querying

The notebook runs some basic and advanced SQL queries to perform some bivariate analyses to understand customer spending behavior before training and evaluating prediction models. Amazon Athena was used to read SQL queries. 

## Importing libraries and initiating sagemaker session

In [2]:
#!pip install awswrangler

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import sagemaker
import boto3
import botocore
import awswrangler as wr

config = botocore.config.Config()
sm = boto3.client(service_name="sagemaker", config=config)
sess = sagemaker.Session(sagemaker_client=sm)

bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

In [5]:
wr.catalog.create_database(
    name='UK Online Retail Store Database',
    exist_ok=True
)


In [10]:
wr.catalog.create_csv_table(database = "UK Online Retail Store Database",
                           path = f"s3://{bucket}/data/customers/",
                           table = "df_customers",
                           columns_types = {"CustomerID": "float",
                                            "Country": "string",
                                            "Recency": "int",
                                            "Frequency": "int",
                                            "DailySpending": "float",
                                            "DailyTransCount":"float",
                                            "MonetaryValue_x": "float",
                                            "MonetaryValue_y": "float"},
                           mode = "overwrite",
                           skip_header_line_count = 1,
                           sep = ",")

In [11]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="top" href="https://console.aws.amazon.com/glue/home?region={}#">AWS Glue Catalog</a></b>'.format(region)))

In [12]:
wr.athena.create_athena_bucket()

's3://aws-athena-query-results-397738742408-us-east-2/'

## SQL Queries

#### Reviewing the data

In [13]:
sql_statment = """
SELECT *
FROM df_customers
"""

df = wr.athena.read_sql_query(sql=sql_statment, database = "UK Online Retail Store Database")
df.head()

Unnamed: 0,customerid,country,recency,frequency,dailyspending,dailytranscount,monetaryvalue_x,monetaryvalue_y
0,13313.0,United Kingdom,53,31,304.869995,16.0,609.73999,945.580017
1,18097.0,United Kingdom,43,49,637.02002,24.0,1274.040039,1241.23999
2,16656.0,United Kingdom,30,27,625.744019,5.0,3128.719971,2638.552002
3,16875.0,United Kingdom,134,46,402.545013,23.0,805.090027,1290.439941
4,13094.0,United Kingdom,29,12,124.199997,2.0,869.400024,834.23999


#### What are the top 5 countries (with at least 5 customers) in terms of average monetary value for the first 6 months?

In [16]:
sql_statement = """
SELECT country, AVG(monetaryvalue_x) AS avg_monetary_value
FROM df_customers
GROUP BY country
HAVING COUNT(*)>5
ORDER BY avg_monetary_value DESC
LIMIT 5
"""

wr.athena.read_sql_query(sql=sql_statement, database = "UK Online Retail Store Database")

Unnamed: 0,country,avg_monetary_value
0,Portugal,1771.39563
1,Switzerland,1383.050049
2,Germany,1359.877319
3,France,1225.043091
4,Australia,1041.546021


#### What are the average monetary values of the most recent (95th percentile) & least recent(5th percentile) customers?

In [17]:
sql_statement = """
WITH CTE1 AS (
    SELECT customerid, recency, monetaryvalue_x, monetaryvalue_y, NTILE(20) OVER(ORDER BY recency ASC) AS pct
    FROM df_customers
    ),
CTE2 AS (
    SELECT 'Most Recent' AS customer_group, AVG(monetaryvalue_x) AS avg_monetary_value_1, AVG(monetaryvalue_y) AS avg_monetary_value_2
    FROM CTE1
    WHERE pct<=1
    ),
CTE3 AS (
    SELECT 'Least Recent' AS customer_group, AVG(monetaryvalue_x) AS avg_monetary_value_1, AVG(monetaryvalue_y) AS avg_monetary_value_2
    FROM CTE1
    WHERE pct>=19
    )
SELECT *
FROM CTE2
UNION ALL
SELECT *
FROM CTE3
"""

wr.athena.read_sql_query(sql=sql_statement, database = "UK Online Retail Store Database")

Unnamed: 0,customer_group,avg_monetary_value_1,avg_monetary_value_2
0,Least Recent,405.59082,680.242981
1,Most Recent,1384.545166,1494.465454


#### What are the average monetary values of the most frequent (95th percentile) & least frequent (5th percentile) customers?

In [18]:
sql_statement = """
WITH CTE1 AS (
    SELECT customerid, frequency, monetaryvalue_x, monetaryvalue_y, NTILE(20) OVER(ORDER BY frequency DESC) AS pct
    FROM df_customers
    ),
CTE2 AS (
    SELECT 'Most Frequent' AS customer_group, AVG(monetaryvalue_x) AS avg_monetary_value_1, AVG(monetaryvalue_y) AS avg_monetary_value_2
    FROM CTE1
    WHERE pct<=1
    ),
CTE3 AS (
    SELECT 'Least Frequent' AS customer_group, AVG(monetaryvalue_x) AS avg_monetary_value_1, AVG(monetaryvalue_y) AS avg_monetary_value_2
    FROM CTE1
    WHERE pct>=19
    )
SELECT *
FROM CTE2
UNION ALL
SELECT *
FROM CTE3
"""

wr.athena.read_sql_query(sql=sql_statement, database = "UK Online Retail Store Database")

Unnamed: 0,customer_group,avg_monetary_value_1,avg_monetary_value_2
0,Least Frequent,312.254364,584.938477
1,Most Frequent,1880.893433,1937.292114


#### What are the average frequency & recency of the most valueable (95th percentile) & least (5th percentile) valueable customers?

In [19]:
sql_statement = """
WITH CTE1 AS (
    SELECT customerid, frequency, recency, monetaryvalue_x, NTILE(20) OVER(ORDER BY monetaryvalue_x DESC) AS pct
    FROM df_customers
),
CTE2 AS (
    SELECT 'Most Valueable' AS customer_group, AVG(frequency) AS frequency, AVG(recency) AS recency
    FROM CTE1
    WHERE pct<=1
    ),
CTE3 AS (
    SELECT 'Least Valueable' AS customer_group, AVG(frequency) AS frequency, AVG(recency) AS recency
    FROM CTE1
    WHERE pct>=19
)
SELECT * 
FROM CTE2
UNION ALL
SELECT *
FROM CTE3
"""

wr.athena.read_sql_query(sql=sql_statement, database = "UK Online Retail Store Database")

Unnamed: 0,customer_group,frequency,recency
0,Least Valueable,11.72043,76.709677
1,Most Valueable,186.734043,23.510638


#### What are the average daily spending & transaction counts of the most valueable & least valueable customers?

In [20]:
sql_statement = """
WITH CTE1 AS (
    SELECT customerid, dailyspending, dailytranscount, monetaryvalue_x, NTILE(20) OVER(ORDER BY monetaryvalue_x DESC) AS pct
    FROM df_customers
),
CTE2 AS (
    SELECT 'Most Valueable' AS customer_group, AVG(dailyspending) AS dailyspending, AVG(dailytranscount) AS dailytranscount
    FROM CTE1
    WHERE pct<=1
    ),
CTE3 AS (
    SELECT 'Least Valueable' AS customer_group, AVG(dailyspending) AS dailyspending, AVG(dailytranscount) AS dailytranscount
    FROM CTE1
    WHERE pct>=19
)
SELECT * 
FROM CTE2
UNION ALL
SELECT *
FROM CTE3
"""

wr.athena.read_sql_query(sql=sql_statement, database = "UK Online Retail Store Database")

Unnamed: 0,customer_group,dailyspending,dailytranscount
0,Most Valueable,642.046143,28.159575
1,Least Valueable,110.993523,11.548388


#### What is the average difference in monetary values for the first and last 6 months of 2011?

In [21]:
sql_statement = """
SELECT AVG(ABS(monetaryvalue_x - monetaryvalue_y)) AS avg_monetary_value_diff
FROM df_customers
"""

wr.athena.read_sql_query(sql=sql_statement, database = "UK Online Retail Store Database")

Unnamed: 0,avg_monetary_value_diff
0,505.760468
