In [89]:
import pandas as pd
import sqlite3

## Create a connection to the database

In [90]:
con = sqlite3.connect('../data/checking-logs.sqlite')

## Get the schema of the table test

In [91]:
pd.read_sql('PRAGMA table_info(test)', con)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,uid,TEXT,0,,0
2,2,labname,TEXT,0,,0
3,3,first_commit_ts,TIMESTAMP,0,,0
4,4,first_view_ts,TIMESTAMP,0,,0


In [92]:
pd.read_sql('PRAGMA table_info(deadlines)', con)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,labs,TEXT,0,,0
2,2,deadlines,INTEGER,0,,0


## Get the first 10 rows

In [93]:
pd.read_sql('SELECT * FROM test LIMIT 10', con)

Unnamed: 0,index,uid,labname,first_commit_ts,first_view_ts
0,0,user_17,project1,2020-04-18 07:56:45.408648,2020-04-18 10:56:55.833899
1,1,user_30,laba04,2020-04-18 13:36:53.971502,2020-04-17 22:46:26.785035
2,2,user_30,laba04s,2020-04-18 14:51:37.498399,2020-04-17 22:46:26.785035
3,3,user_14,laba04,2020-04-18 15:14:00.312338,2020-04-18 10:53:52.623447
4,4,user_14,laba04s,2020-04-18 22:30:30.247628,2020-04-18 10:53:52.623447
5,5,user_19,laba04,2020-04-20 19:05:01.297780,2020-04-21 20:30:38.034966
6,6,user_25,laba04,2020-04-20 19:16:50.673054,2020-05-09 23:54:54.260791
7,7,user_21,laba04,2020-04-21 17:48:00.487806,2020-04-22 22:40:36.824081
8,8,user_30,project1,2020-04-22 12:36:24.053518,2020-04-17 22:46:26.785035
9,9,user_21,laba04s,2020-04-22 20:09:21.857747,2020-04-22 22:40:36.824081


In [94]:
pd.read_sql('SELECT * FROM deadlines LIMIT 10', con)

Unnamed: 0,index,labs,deadlines
0,0,laba04,1587945599
1,1,laba04s,1587945599
2,2,laba05,1588550399
3,4,laba06,1590364799
4,5,laba06s,1590364799
5,3,project1,1589673599


## Find among all the users the `minimum value` of the `delta` between the `first commit` of the user and the `deadline` of the corresponding lab using only one query

- do this by ***joining*** the table with the table `deadlines`
- the ***difference*** should be displayed in ***hours***
- do not take the ***lab ’project1’*** into account, it has longer deadlines and will be an outlier
- the value should be stored in the dataframe `df_min` with the corresponding `uid`

> julianday - julianday = days \
> SELECT CAST((julianday('2020-04-18 07:56:45') - julianday('1587945599', 'unixepoch')) * 24 AS INTEGER) 

In [95]:
query = """
SELECT
    t.uid,
    CAST(MIN(julianday(t.first_commit_ts) - julianday(d.deadlines, 'unixepoch')) * 24 AS INTEGER) AS min_delta
FROM (
    SELECT
        uid,
        labname,
        MIN(first_commit_ts) AS first_commit_ts
    FROM
        test
    GROUP BY
        labname
) t
INNER JOIN
    deadlines d
ON
    t.labname = d.labs
    AND
    d.labs != 'project1'
"""
df_min = pd.read_sql(query, con)
df_min

Unnamed: 0,uid,min_delta
0,user_30,-202


## `max_delta`, the dataframe name is `df_max`

In [96]:
query = """
SELECT
    t.uid,
    CAST(MAX(julianday(t.first_commit_ts) - julianday(d.deadlines, 'unixepoch')) * 24 AS INTEGER) AS max_delta
FROM (
    SELECT
        uid,
        labname,
        MAX(first_commit_ts) AS first_commit_ts
    FROM
        test
    GROUP BY
        labname
) t
INNER JOIN
    deadlines d
ON
    t.labname = d.labs
    AND
    d.labs != 'project1'
"""
df_max = pd.read_sql(query, con)
df_max

Unnamed: 0,uid,max_delta
0,user_25,-2


## `average_delta`, without uid column, and the dataframe name is `df_avg`

In [97]:
query = """
SELECT
    AVG(delta) as avg_delta
FROM (
    SELECT
        uid,
        labname,
        CAST((julianday(t.first_commit_ts) - julianday(d.deadlines, 'unixepoch')) * 24 AS INTEGER) AS delta
    FROM
        test t
    INNER JOIN
        deadlines d
    ON
        t.labname == d.labs
        AND
        d.labs != 'project1'
)
"""
df_avg = pd.read_sql(query, con)
df_avg

Unnamed: 0,avg_delta
0,-89.125


## Calculate the correlation coefficient between the number of pageviews and the difference

- using only one query, create a table with the columns: `uid`, `avg_diff`, `pageviews`
- `uid` is the uids that ***exist*** in the `test`
- `avg_diff` is the average delta between the first commit and the lab deadline per user
- `pageviews` is the number of Newsfeed visits per user
- do not take the lab ’`project1`’ into account
- store it to the dataframe `views_diff`
- use the Pandas method `corr()` to calculate the correlation coefficient between the number of pageviews and the difference

### looking up the tables we have

In [98]:
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('pageviews',), ('checker',), ('deadlines',), ('datamart',), ('test',), ('control',)]


### checking tha data stored in the `test` table

In [99]:
pd.read_sql('SELECT * FROM test LIMIT 2', con)

Unnamed: 0,index,uid,labname,first_commit_ts,first_view_ts
0,0,user_17,project1,2020-04-18 07:56:45.408648,2020-04-18 10:56:55.833899
1,1,user_30,laba04,2020-04-18 13:36:53.971502,2020-04-17 22:46:26.785035


### checking tha data stored in the `pageviews` table

In [100]:
pd.read_sql('SELECT * FROM pageviews LIMIT 2', con)

Unnamed: 0,index,uid,datetime
0,0,admin_1,2020-04-17 12:01:08.463179
1,1,admin_1,2020-04-17 12:01:23.743946


### creating new table and calculating correlation

In [101]:
query = """
SELECT
    diffs.uid,
    diffs.delta,
    pv.pageviews
FROM
    (SELECT
        uid,
        CAST((julianday(t.first_commit_ts) - julianday(d.deadlines, 'unixepoch')) * 24 AS INTEGER) AS delta
    FROM
        test t
    LEFT JOIN
        deadlines d
    ON
        t.labname = d.labs
        WHERE
        t.labname != 'project1'
    GROUP BY
        uid) diffs
    LEFT JOIN
        (SELECT
            uid,
            COUNT(uid) as pageviews
        FROM
            pageviews
        GROUP BY
            uid
        ) pv
    ON diffs.uid = pv.uid
"""
views_diff = pd.read_sql(query, con)
print(views_diff)
views_diff.corr()

        uid  delta  pageviews
0    user_1     -6         28
1   user_10    -39         89
2   user_14   -200        143
3   user_17    -81         47
4   user_18     -4          3
5   user_19   -148         16
6   user_21   -126         10
7   user_25   -148        179
8   user_28    -98        149
9    user_3    -75        317
10  user_30   -202          3


Unnamed: 0,delta,pageviews
delta,1.0,-0.062967
pageviews,-0.062967,1.0


## Closing the connection

In [102]:
con.close()