In [1]:
import pandas as pd
import psycopg2

def execute_query(sql_query, dbname='temp', user='postgres', password='postgres', port='5432'):
    """Run a SQL query against PostgreSQL and return the result as a DataFrame.

    A new psycopg2 connection is opened for every call and is always closed
    before the function returns, including when the query raises. No commit
    is issued on the connection.

    Parameters
    ----------
    sql_query : str
        SQL text handed to ``pandas.read_sql``.
    dbname, user, password, port : str
        Connection settings forwarded to ``psycopg2.connect``.

    Returns
    -------
    pandas.DataFrame
        The rows produced by ``pandas.read_sql`` for ``sql_query``.

    Raises
    ------
    Exception
        Anything raised by ``psycopg2.connect`` or ``pandas.read_sql`` is
        propagated unchanged.
    """
    # NOTE(review): the default credentials are hard-coded in the signature;
    # consider supplying them from the environment before sharing this notebook.
    conn = psycopg2.connect(dbname=dbname, user=user, password=password, port=port)
    try:
        # Use read_sql to execute the query and load the results into a DataFrame
        return pd.read_sql(sql_query, conn)
    finally:
        # Explicit close: psycopg2's `with conn:` only ends the transaction,
        # it does not close the connection, so try/finally is used instead.
        conn.close()


# Division

Compute the average revenue per employee for Fortune 500 companies by sector.

In [2]:

# Average revenue per employee within each sector, ordered by that average.
revenue_per_employee_sql = """
-- Select average revenue per employee by sector
SELECT sector, 
       AVG(revenues/employees::NUMERIC) AS avg_rev_employee
  FROM fortune500
 GROUP BY sector
 -- Use the column alias to order the results
 ORDER BY avg_rev_employee;                         
"""
query_result = execute_query(revenue_per_employee_sql)
query_result

Unnamed: 0,sector,avg_rev_employee
0,"Hotels, Restaurants & Leisure",0.094987
1,Apparel,0.278659
2,Food & Drug Stores,0.308
3,Motor Vehicles & Parts,0.342527
4,Household Products,0.355573
5,Retailing,0.360195
6,Industrials,0.361485
7,Aerospace & Defense,0.366715
8,Transportation,0.403654
9,Business Services,0.42011


# Explore with division

In exploring a new database, it can be unclear what the data means and how columns are related to each other.

What information does the unanswered_pct column in the stackoverflow table contain? Is it the percent of questions with the tag that are unanswered (unanswered questions with tag / all questions with tag)? Or is it something else, such as the percent of all unanswered questions on the site with the tag (unanswered questions with tag / all unanswered questions)?

In [3]:

# Put the hand-computed unanswered share next to the stored unanswered_pct
# for 10 rows, skipping rows whose question_count is 0.
unanswered_pct_check_sql = """
-- Divide unanswered_count by question_count
SELECT unanswered_count/question_count::NUMERIC AS computed_pct, 
       -- What are you comparing the above quantity to?
       unanswered_pct
  FROM stackoverflow
 -- Select rows where question_count is not 0
 WHERE question_count <> 0
 LIMIT 10;                       
"""
query_result = execute_query(unanswered_pct_check_sql)
query_result

Unnamed: 0,computed_pct,unanswered_pct
0,0.465485,0.001752
1,0.386364,0.000117
2,0.393768,5.8e-05
3,0.331897,1.6e-05
4,0.429286,0.000125
5,0.34799,0.012886
6,0.350839,0.007619
7,0.307292,1.2e-05
8,0.354281,8.1e-05
9,0.380658,0.000244


# Summarize numeric columns

Summarize the `profits` column in the `fortune500` table using the functions you've learned.

In [4]:

# Minimum, mean, maximum and standard deviation of profits over the whole table.
profits_summary_sql = """
-- Select min, avg, max, and stddev of fortune500 profits
SELECT min(profits),
       avg(profits),
       max(profits),
       stddev(profits)
  FROM fortune500;               
"""
query_result = execute_query(profits_summary_sql)
query_result

Unnamed: 0,min,avg,max,stddev
0,-6177.0,1783.475351,45687.0,3940.495363


In [5]:

# The same four profit statistics, now computed per sector and
# ordered by the per-sector average.
profits_by_sector_sql = """
-- Select sector and summary measures of fortune500 profits
SELECT min(profits),
       avg(profits),
       max(profits),
       stddev(profits),
       sector
  FROM fortune500
 -- What to group by?
 GROUP BY sector
 -- Order by the average profits
 ORDER BY avg;
            
"""
query_result = execute_query(profits_by_sector_sql)
query_result

Unnamed: 0,min,avg,max,stddev,sector
0,-6177.0,10.444643,7840.0,2264.572143,Energy
1,-440.0,272.468421,1027.0,406.632781,Materials
2,15.0,390.169231,911.8,277.66512,Engineering & Construction
3,-199.4,391.27931,2258.0,532.171184,Wholesalers
4,-2221.0,991.785106,13643.0,2348.342559,Retailing
5,-3.9,1137.021429,4318.0,1129.752304,Chemicals
6,57.2,1155.355,5991.0,1454.360687,Business Services
7,-502.2,1217.428571,4173.0,1613.041449,Food & Drug Stores
8,396.0,1263.7,3760.0,1419.134571,Apparel
9,348.0,1451.06,4686.5,1372.975733,"Hotels, Restaurants & Leisure"


# Summarize group statistics

Sometimes you want to understand how a value varies across groups. For example, how does the maximum value per group vary across groups?

To find out, first summarize by group, and then compute summary statistics of the group results. One way to do this is to compute group values in a subquery, and then summarize the results of the subquery.

In [6]:

# Two-level summary: the subquery takes max(question_count) per tag,
# the outer query summarizes those per-tag maxima.
max_question_count_stats_sql = """
-- Compute standard deviation of maximum values
SELECT stddev(maxval),
	   -- min
       min(maxval),
       -- max
       max(maxval),
       -- avg
       avg(maxval)
  -- Subquery to compute max of question_count by tag
  FROM (SELECT max(question_count) AS maxval
          FROM stackoverflow
         -- Compute max by...
         GROUP BY tag) AS max_results; -- alias for subquery            
"""
query_result = execute_query(max_question_count_stats_sql)
query_result

Unnamed: 0,stddev,min,max,avg
0,176458.379527,30,1138658,52652.433962


# Truncate

Use `trunc()` to examine the distributions of attributes of the Fortune 500 companies.


In [7]:

# Count companies per employee bin, where the bin is employees
# truncated with TRUNC(employees, -5).
employee_bins_100k_sql = """
-- Truncate employees
SELECT TRUNC(employees, -5) AS employee_bin,
       -- Count number of companies with each truncated value
       COUNT(*)
  FROM fortune500
 -- Use alias to group
 GROUP BY employee_bin
 -- Use alias to order
 ORDER BY employee_bin;          
"""
query_result = execute_query(employee_bins_100k_sql)
query_result

Unnamed: 0,employee_bin,count
0,0.0,433
1,100000.0,35
2,200000.0,20
3,300000.0,7
4,400000.0,4
5,2300000.0,1


In [8]:

# Finer bins (TRUNC(employees, -4)) restricted to companies
# with fewer than 100000 employees.
employee_bins_10k_sql = """
-- Truncate employees
SELECT TRUNC(employees, -4) AS employee_bin,
       -- Count number of companies with each truncated value
       COUNT(*)
  FROM fortune500
 -- Limit to which companies?
 WHERE employees < 100000
 -- Use alias to group
 GROUP BY employee_bin
 -- Use alias to order
 ORDER BY employee_bin;     
"""
query_result = execute_query(employee_bins_10k_sql)
query_result

Unnamed: 0,employee_bin,count
0,0.0,102
1,10000.0,108
2,20000.0,63
3,30000.0,42
4,40000.0,35
5,50000.0,31
6,60000.0,18
7,70000.0,18
8,80000.0,6
9,90000.0,10


# Generate series

Summarize the distribution of the number of questions with the tag "dropbox" on Stack Overflow per day by binning the data.

In [9]:

# Smallest and largest question_count recorded for the 'dropbox' tag;
# these bounds inform the bin edges used in the following cells.
dropbox_range_sql = """
-- Select the min and max of question_count
SELECT min(question_count), 
       max(question_count)
  -- From what table?
  FROM stackoverflow
 -- For tag dropbox
 WHERE tag = 'dropbox'; 
"""
query_result = execute_query(dropbox_range_sql)
query_result

Unnamed: 0,min,max
0,2315,3072


In [10]:

# Build paired lower/upper bin edges in steps of 50 with generate_series.
bin_bounds_sql = """
-- Create lower and upper bounds of bins
SELECT generate_series(2200 , 3050, 50) AS lower,
       generate_series(2250, 3100, 50) AS upper;
"""
query_result = execute_query(bin_bounds_sql)
query_result

Unnamed: 0,lower,upper
0,2200,2250
1,2250,2300
2,2300,2350
3,2350,2400
4,2400,2450
5,2450,2500
6,2500,2550
7,2550,2600
8,2600,2650
9,2650,2700


In [11]:

# Histogram of dropbox question_count values: each value is assigned to the
# bin with lower <= value < upper, and the LEFT JOIN keeps bins with no matches.
dropbox_histogram_sql = """
-- Bins created in Step 2
WITH bins AS (
      SELECT generate_series(2200, 3050, 50) AS lower,
             generate_series(2250, 3100, 50) AS upper),
     -- Subset stackoverflow to just tag dropbox (Step 1)
     dropbox AS (
      SELECT question_count 
        FROM stackoverflow
       WHERE tag='dropbox') 
-- Select columns for result
-- What column are you counting to summarize?
SELECT lower, upper, count(question_count) 
  FROM bins  -- Created above
       -- Join to dropbox (created above), 
       -- keeping all rows from the bins table in the join
       LEFT JOIN dropbox
       -- Compare question_count to lower and upper
         ON question_count >= lower 
        AND question_count < upper
 -- Group by lower and upper to count values in each bin
 GROUP BY lower, upper
 -- Order by lower to put bins in order
 ORDER BY lower;
"""
query_result = execute_query(dropbox_histogram_sql)
query_result

Unnamed: 0,lower,upper,count
0,2200,2250,0
1,2250,2300,0
2,2300,2350,22
3,2350,2400,39
4,2400,2450,54
5,2450,2500,53
6,2500,2550,45
7,2550,2600,41
8,2600,2650,46
9,2650,2700,57


# Correlation

What's the relationship between a company's revenue and its other financial attributes? Compute the correlation between revenues and other financial variables with the `corr()` function.

In [12]:

# Correlation of revenues with profits, assets and equity, one column each.
revenue_correlations_sql = """
-- Correlation between revenues and profit
SELECT corr(revenues , profits) AS rev_profits,
	   -- Correlation between revenues and assets
       corr(revenues , assets) AS rev_assets,
       -- Correlation between revenues and equity
       corr(revenues , equity) AS rev_equity 
  FROM fortune500;
"""
query_result = execute_query(revenue_correlations_sql)
query_result

Unnamed: 0,rev_profits,rev_assets,rev_equity
0,0.599994,0.3295,0.546571


# Mean and Median

Compute the mean (avg()) and median assets of Fortune 500 companies by sector.

In [13]:

# Per-sector mean of assets alongside the median, taken as
# PERCENTILE_DISC(0.5); rows are ordered by the mean.
assets_mean_median_sql = """
-- What groups are you computing statistics by?
SELECT sector,
       -- Select the mean of assets with the avg function
       AVG(assets) AS mean,
       -- Select the median
       PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY assets) AS median
  FROM fortune500
 -- Computing statistics for each what?
 GROUP BY sector
 -- Order results by a value of interest
 ORDER BY mean;
"""
query_result = execute_query(assets_mean_median_sql)
query_result

Unnamed: 0,sector,mean,median
0,Engineering & Construction,8199.230769,8709.0
1,Wholesalers,9362.586207,5390.0
2,Materials,10833.263158,7741.0
3,Apparel,11064.8,9739.0
4,Retailing,14473.148936,7858.0
5,"Hotels, Restaurants & Leisure",16795.4,14330.0
6,Business Services,19626.1,12485.0
7,Chemicals,20151.214286,15769.0
8,Household Products,23179.083333,10231.0
9,Food & Drug Stores,24630.714286,17464.0


# Create a temp table

Find the Fortune 500 companies that have profits in the top 20% for their sector (compared to other Fortune 500 companies).

To do this, first compute the 80th percentile of profits for each sector and save the results in a temporary table.

In [14]:

# (Re)create the temp table profit80 holding each sector's 80th-percentile
# profits, then select everything from it for display.
profit80_table_sql = """
-- To clear table if it already exists;
-- fill in name of temp table
DROP TABLE IF EXISTS profit80;

-- Create the temporary table
CREATE TEMP TABLE profit80 AS 
  -- Select the two columns you need; alias as needed
  SELECT sector, 
         PERCENTILE_DISC(.8) WITHIN GROUP (ORDER BY profits ) AS pct80
    -- What table are you getting the data from?
    FROM fortune500
   -- What do you need to group by?
   GROUP BY sector;
   
-- See what you created: select all columns and rows 
-- from the table you created
SELECT * 
  FROM profit80;
"""
query_result = execute_query(profit80_table_sql)
query_result

Unnamed: 0,sector,pct80
0,Aerospace & Defense,4895.0
1,Apparel,1074.1
2,Business Services,1401.0
3,Chemicals,1500.0
4,Energy,1311.0
5,Engineering & Construction,602.7
6,Financials,3014.0
7,Food & Drug Stores,2025.7
8,"Food, Beverages & Tobacco",6073.0
9,Health Care,4965.0


In [15]:

# Companies whose profits exceed their sector's 80th-percentile profits.
# The filter compares profits to pct80 directly instead of testing
# profits/pct80 > 1: the ratio test raises a division-by-zero error when a
# sector's pct80 is 0 and selects the wrong rows when pct80 is negative.
# For positive pct80 both filters return the same rows.
query_result = execute_query("""
-- Code from previous step
DROP TABLE IF EXISTS profit80;

CREATE TEMP TABLE profit80 AS
  SELECT sector,
         percentile_disc(0.8) WITHIN GROUP (ORDER BY profits) AS pct80
    FROM fortune500
   GROUP BY sector;

-- Select columns, aliasing as needed
SELECT title, fortune500.sector,
       profits,
       -- NULLIF yields a NULL ratio instead of an error when pct80 is 0
       profits / NULLIF(pct80, 0) AS ratio
-- What tables do you need to join?
  FROM fortune500
       LEFT JOIN profit80
-- How are the tables joined?
       ON fortune500.sector = profit80.sector
-- What rows do you want to select?
-- Direct comparison is safe for zero or negative pct80
 WHERE profits > pct80;
""")
query_result

Unnamed: 0,title,sector,profits,ratio
0,Walmart,Retailing,13643.0,11.109935
1,Berkshire Hathaway,Financials,24074.0,7.987392
2,Apple,Technology,45687.0,6.287779
3,Exxon Mobil,Energy,7840.0,5.980168
4,McKesson,Wholesalers,2258.0,3.726688
...,...,...,...,...
85,AutoZone,Retailing,1241.0,1.010586
86,Sempra Energy,Energy,1370.0,1.045004
87,Weyerhaeuser,Materials,1027.0,2.070565
88,PPL,Energy,1902.0,1.450801


# Create a temp table to simplify a query

The Stack Overflow data contains daily question counts through 2018-09-25 for all tags, but each tag has a different starting date in the data.

Find out how many questions had each tag on the first date for which data for the tag is available, as well as how many questions had the tag on the last day. Also, compute the difference between these two values.

In [16]:

# (Re)create the temp table startdates with the earliest date per tag,
# then select everything from it for display.
startdates_table_sql = """
-- To clear table if it already exists
DROP TABLE IF EXISTS startdates;

-- Create temp table syntax
CREATE TEMP TABLE startdates AS
-- Compute the minimum date for each what?
SELECT tag,
       MIN(date) AS mindate
  FROM stackoverflow
 -- What do you need to compute the min date for each tag?
 GROUP BY tag;
 
 -- Look at the table you created
 SELECT * 
   FROM startdates;
"""
query_result = execute_query(startdates_table_sql)
query_result

Unnamed: 0,tag,mindate
0,amazon-route53,2016-01-01
1,google-spreadsheet,2016-01-01
2,dropbox,2016-01-01
3,amazon-data-pipeline,2016-09-01
4,amazon,2016-01-01
5,amazon-sns,2016-09-01
6,ios,2016-01-01
7,amazon-web-services,2016-01-01
8,amazon-cloudsearch,2016-01-01
9,amazon-ses,2016-09-01


In [26]:

# For every tag: question_count on its first recorded date (via startdates),
# question_count on 2018-09-25, and the difference between the two.
# stackoverflow is joined twice, once per date.
question_count_change_sql = """
-- To clear table if it already exists
DROP TABLE IF EXISTS startdates;

CREATE TEMP TABLE startdates AS
SELECT tag, min(date) AS mindate
  FROM stackoverflow
 GROUP BY tag;
 
-- Select tag (Remember the table name!) and mindate
SELECT startdates.tag, 
       mindate, 
       -- Select question count on the min and max days
	   so_min.question_count  AS min_date_question_count,
       so_max.question_count  AS max_date_question_count,
       -- Compute the change in question_count (max- min)
       so_max.question_count - so_min.question_count AS change
  FROM startdates
       -- Join startdates to stackoverflow with alias so_min
       INNER JOIN stackoverflow AS so_min
          -- What needs to match between tables?
          ON startdates.tag = so_min.tag
         AND startdates.mindate = so_min.date
       -- Join to stackoverflow again with alias so_max
       INNER JOIN stackoverflow AS so_max
       	  -- Again, what needs to match between tables?
          ON startdates.tag = so_max.tag
         AND so_max.date = '2018-09-25';
"""
query_result = execute_query(question_count_change_sql)
query_result

Unnamed: 0,tag,mindate,min_date_question_count,max_date_question_count,change
0,paypal,2016-01-01,13296,18050,4754
1,amazon-elb,2016-09-01,576,1452,876
2,amazon-mws,2016-09-01,367,706,339
3,amazon-swf,2016-09-01,167,232,65
4,amazon-sns,2016-09-01,690,1400,710
5,excel,2016-01-01,81384,177603,96219
6,mongodb,2016-01-01,55510,104159,48649
7,amazon-glacier,2016-09-01,118,192,74
8,amazon-route53,2016-01-01,369,1098,729
9,dropbox,2016-01-01,2319,3071,752


# Insert into a temp table

While you can join the results of multiple similar queries together with UNION, sometimes it's easier to break a query down into steps. You can do this by creating a temporary table and inserting rows into it.

In [38]:

# Step 1: create the temp table correlations with a single row holding the
# correlations of profits with profits, profits_change and revenues_change.
correlations_step1_sql = """
DROP TABLE IF EXISTS correlations;

-- Create temp table 
CREATE TEMP TABLE correlations AS
-- Select each correlation
SELECT 'profits'::varchar AS measure,
       -- Compute correlations
       CORR(profits, profits) AS profits,
       CORR(profits, profits_change) AS profits_change,
       CORR(profits, revenues_change) AS revenues_change
  FROM fortune500;
  
SELECT * FROM correlations;
  
"""
query_result = execute_query(correlations_step1_sql)
query_result

Unnamed: 0,measure,profits,profits_change,revenues_change
0,profits,1.0,0.015356,0.01792


In [37]:

# Step 2: rebuild correlations, then insert one more row each for
# profits_change and revenues_change so the table forms a 3x3 matrix.
correlations_step2_sql = """
DROP TABLE IF EXISTS correlations;

CREATE TEMP TABLE correlations AS
SELECT 'profits'::varchar AS measure,
       corr(profits, profits) AS profits,
       corr(profits, profits_change) AS profits_change,
       corr(profits, revenues_change) AS revenues_change
  FROM fortune500;

-- Add a row for profits_change
-- Insert into what table?
INSERT INTO correlations
-- Follow the pattern of the select statement above
-- Using profits_change instead of profits
SELECT 'profits_change'::varchar AS measure,
       corr(profits_change, profits) AS profits,
       corr(profits_change, profits_change) AS profits_change,
       corr(profits_change, revenues_change) AS revenues_change
  FROM fortune500;

-- Repeat the above, but for revenues_change
INSERT INTO correlations
SELECT 'revenues_change'::varchar AS measure,
       corr(revenues_change, profits) AS profits,
       corr(revenues_change, profits_change) AS profits_change,
       corr(revenues_change, revenues_change) AS revenues_change
  FROM fortune500;
  
SELECT * FROM correlations;
"""
query_result = execute_query(correlations_step2_sql)
query_result

Unnamed: 0,measure,profits,profits_change,revenues_change
0,profits,1.0,0.015356,0.01792
1,profits_change,0.015356,1.0,-0.091687
2,revenues_change,0.01792,-0.091687,1.0


In [35]:

# Step 3: rebuild the three-row correlations table and display it with
# every correlation rounded to 2 decimal places.
correlations_rounded_sql = """
DROP TABLE IF EXISTS correlations;

CREATE TEMP TABLE correlations AS
SELECT 'profits'::varchar AS measure,
       corr(profits, profits) AS profits,
       corr(profits, profits_change) AS profits_change,
       corr(profits, revenues_change) AS revenues_change
  FROM fortune500;

INSERT INTO correlations
SELECT 'profits_change'::varchar AS measure,
       corr(profits_change, profits) AS profits,
       corr(profits_change, profits_change) AS profits_change,
       corr(profits_change, revenues_change) AS revenues_change
  FROM fortune500;

INSERT INTO correlations
SELECT 'revenues_change'::varchar AS measure,
       corr(revenues_change, profits) AS profits,
       corr(revenues_change, profits_change) AS profits_change,
       corr(revenues_change, revenues_change) AS revenues_change
  FROM fortune500;

-- Select each column, rounding the correlations
SELECT measure, 
       ROUND(profits::NUMERIC,2) AS profits,
       ROUND(profits_change::NUMERIC,2) AS profits_change,
       ROUND(revenues_change::NUMERIC,2) AS revenues_change
  FROM correlations;
"""
query_result = execute_query(correlations_rounded_sql)
query_result

Unnamed: 0,measure,profits,profits_change,revenues_change
0,profits,1.0,0.02,0.02
1,profits_change,0.02,1.0,-0.09
2,revenues_change,0.02,-0.09,1.0
