In [1]:
import pandas as pd
import sqlite3

In [2]:
conn = sqlite3.connect('../data/checking-logs.sqlite')

## using only one query for each of the groups, create two dataframes: test_results and control_results with the columns time and avg_diff and only two rows

* time should have the values: after and before
* avg_diff contains the average delta among all the users for the time period
* before each of them made their first visit to the page and afterward
* only take into account the users that have observations before and after

In [9]:
avg_first_view = pd.read_sql("""
    SELECT AVG(CAST(strftime('%s', first_view_ts) AS INT)) as avg_first_view 
    FROM test
""", conn).iloc[0,0]

test_query = """
  WITH before_or_after AS (
        SELECT 
            uid,
            'before' AS time,
            AVG((julianday(first_commit_ts) - julianday(datetime(deadlines, 'unixepoch'))) * 24) AS avg_diff
        FROM test t
        JOIN deadlines d ON t.labname = d.labs
        WHERE t.labname != 'project1'
        AND t.first_commit_ts < t.first_view_ts
        GROUP BY uid
        
        UNION ALL
        
        SELECT 
            uid,
            'after' AS time,
            AVG((julianday(first_commit_ts) - julianday(datetime(deadlines, 'unixepoch'))) * 24) AS avg_diff
        FROM test t
        JOIN deadlines d ON t.labname = d.labs
        WHERE t.labname != 'project1'
        AND t.first_commit_ts >= t.first_view_ts
        GROUP BY uid
    ),
    before_and_after AS (
        SELECT uid
        FROM before_or_after
        GROUP BY uid
        HAVING COUNT(DISTINCT time) = 2
    )
    SELECT 
        time,
        AVG(avg_diff) AS avg_diff
    FROM before_or_after
    WHERE uid IN (SELECT uid FROM before_and_after)
    GROUP BY time
    ORDER BY time
"""
test_results = pd.io.sql.read_sql(test_query, conn)
test_results

Unnamed: 0,time,avg_diff
0,after,-100.178032
1,before,-66.679398


In [67]:
control_query = f"""
    WITH before_or_after AS (
        SELECT 
            uid,
            'before' AS time,
            AVG((julianday(first_commit_ts) - julianday(datetime(deadlines, 'unixepoch'))) * 24) AS avg_diff
        FROM test t
        JOIN deadlines d ON t.labname = d.labs
        WHERE t.labname != 'project1'
        AND CAST(strftime('%s', first_commit_ts) AS INT) < {avg_first_view}
        GROUP BY uid
        
        UNION ALL
        
        SELECT 
            uid,
            'after' AS time,
            AVG((julianday(first_commit_ts) - julianday(datetime(deadlines, 'unixepoch'))) * 24) AS avg_diff
        FROM test t
        JOIN deadlines d ON t.labname = d.labs
        WHERE t.labname != 'project1'
        AND CAST(strftime('%s', first_commit_ts) AS INT) >= {avg_first_view}
        GROUP BY uid
    ),
    before_and_after AS (
        SELECT uid
        FROM before_or_after
        GROUP BY uid
        HAVING COUNT(DISTINCT time) = 2
    )
    SELECT 
        time,
        AVG(avg_diff) AS avg_diff
    FROM before_or_after
    WHERE uid IN (SELECT uid FROM before_and_after)
    GROUP BY time
    ORDER BY time
"""

control_results = pd.io.sql.read_sql(control_query, conn)
control_results

Unnamed: 0,time,avg_diff
0,after,-86.808635
1,before,-88.571782


In [68]:
test_before = test_results[test_results['time'] == 'before']['avg_diff'].values[0]
test_after = test_results[test_results['time'] == 'after']['avg_diff'].values[0]
control_before = control_results[control_results['time'] == 'before']['avg_diff'].values[0]
control_after = control_results[control_results['time'] == 'after']['avg_diff'].values[0]

if (test_before > test_after) and not (control_before > control_after):
    print("\nГипотеза верна - страница в ленте новостей положительно влияет на поведение студентов")
else:
    print("\nГипотеза ложна - никакого существенного эффекта не наблюдалось")


Гипотеза верна - страница в ленте новостей положительно влияет на поведение студентов


In [69]:
conn.close()