## Exercise 02 - Join

## Imports

In [1]:
import pandas as pd
import sqlite3

## 1. Connect to database

In [2]:
# Location of the SQLite file, relative to the notebook's working directory.
DB_PATH = '../data/checking-logs.sqlite'

# One shared connection, used by every cell below.
conn = sqlite3.connect(DB_PATH)

## 2. Build the datamart via a single join query

In [3]:
# Build the `datamart` table with one SELECT:
#   * checker_filtered - checker rows with status 'ready' and numTrials = 1
#     for the listed labs, restricted to uids that start with 'user_';
#   * first_views      - the earliest pageviews.datetime per such uid.
# The LEFT JOIN keeps checker rows whose uid has no page views; for those
# rows first_view_ts is NULL.
#
# In a LIKE pattern `_` is itself a wildcard for "any single character", so a
# plain 'user_%' would also match uids such as 'users1'. The underscore is
# therefore escaped ('!_' together with ESCAPE '!') to match it literally.
datamart_query = """
    WITH checker_filtered AS (
        SELECT uid, labname, timestamp AS first_commit_ts
        FROM checker
        WHERE status = 'ready'
          AND numTrials = 1
          AND labname IN ('laba04', 'laba04s', 'laba05', 'laba06', 'laba06s', 'project1')
          AND uid LIKE 'user!_%' ESCAPE '!'
    ),
    first_views AS (
        SELECT uid, MIN(datetime) AS first_view_ts
        FROM pageviews
        WHERE uid LIKE 'user!_%' ESCAPE '!'
        GROUP BY uid
    )
    SELECT
        cf.uid,
        cf.labname,
        cf.first_commit_ts,
        fv.first_view_ts
    FROM checker_filtered cf
    LEFT JOIN first_views fv ON cf.uid = fv.uid
"""

# Drop any existing copy first so this cell can be re-run without failing on
# "table datamart already exists".
conn.execute('DROP TABLE IF EXISTS datamart;')
conn.execute(f"CREATE TABLE datamart AS {datamart_query}")


<sqlite3.Cursor at 0x7fe5e24644c0>

## 3. Load datamart for analysis

In [6]:
# Columns that should come back from SQLite as pandas datetimes.
timestamp_columns = ['first_commit_ts', 'first_view_ts']

# Pull the whole datamart table into a DataFrame via the public pandas entry point.
datamart = pd.read_sql('SELECT * FROM datamart;', conn, parse_dates=timestamp_columns)

# Preview the first rows.
datamart.head()


Unnamed: 0,uid,labname,first_commit_ts,first_view_ts
0,user_4,project1,2020-04-17 05:19:02.744528,NaT
1,user_4,laba04,2020-04-17 11:33:17.366400,NaT
2,user_4,laba04s,2020-04-17 11:48:41.992466,NaT
3,user_17,project1,2020-04-18 07:56:45.408648,2020-04-18 10:56:55.833899
4,user_30,laba04,2020-04-18 13:36:53.971502,2020-04-17 22:46:26.785035


## 4. Create test and control dataframes

In [7]:
# Rows with a recorded first page view go to `test`; the rest go to `control`.
has_first_view = datamart['first_view_ts'].notna()
test = datamart[has_first_view].copy()
control = datamart[~has_first_view].copy()

# Control rows have no first_view_ts of their own, so fill the gap with the
# mean first-view timestamp observed in the test group.
avg_first_view = test['first_view_ts'].mean()
control['first_view_ts'] = control['first_view_ts'].fillna(avg_first_view)


## 5. Persist datamart, test, and control tables

In [8]:
# Table name -> DataFrame, written in this order; an existing table with the
# same name is replaced and the DataFrame index is not stored.
frames_by_table = {
    'datamart': datamart,
    'test': test,
    'control': control,
}

for table_name, frame in frames_by_table.items():
    last_write_result = frame.to_sql(table_name, conn, if_exists='replace', index=False)

# Show the return value of the final write (the `control` table).
last_write_result


81

## 6. Close connection

In [9]:
# Close the SQLite connection opened in section 1; `conn` cannot be used after this.
conn.close()