# Debugging why CIS-TTC comparison results in ~2x processed data #134

In [1]:
from psycopg2 import connect
import configparser
%matplotlib inline
import numpy as np
import pandas as pd
import pandas.io.sql as pandasql
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import matplotlib.ticker as ticker
import folium

# Load the database connection settings from the [DBSETTINGS] section of db.cfg.
CONFIG = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list to fail with a clear message
# instead of an opaque KeyError on the section lookup below.
if not CONFIG.read('db.cfg'):
    raise FileNotFoundError("Could not read database settings from 'db.cfg'")
dbset = CONFIG['DBSETTINGS']
# Every key of the section is passed through as a keyword argument to
# psycopg2's connect().
con = connect(**dbset)

from IPython.display import HTML

def print_table(sql, con):
    """Run `sql` on connection `con` and return the result as an HTML table.

    The query result is read into a DataFrame, rendered to HTML without its
    index column, and wrapped in an IPython `HTML` object.
    """
    frame = pandasql.read_sql(sql, con)
    markup = frame.to_html(index=False)
    return HTML(markup)

## Looking at GTFS Data 

For a single day of route 514, the count returned by the query below is never more than 102, so the maximum count over 7 days for any segment of the route should be 714 (7 × 102). Comparing the TTC's data with our analysis, the TTC's counts seem fairly accurate (they may undercount because some trips are filtered out). Some of our counts are very high, with some even about 1100 for the week, so it seems that our data processing is overcounting.

The highest count for the 504 is 259 per day, so the total count for 7 days should not exceed 1813 (7 × 259). The TTC processed data does not exceed 1813, but our processed data has counts of over 4500.

```sql
-- GTFS stop_times rows for route 514 at stops that appear as a to_stop_id in
-- crosic.section_runs.
-- NOTE(review): the section_runs subquery is not restricted to routenumber = 514.
WITH to_stop_table AS (
SELECT stop_id to_stop, t.trip_id, direction_id, arrival_time
FROM crosic.stop_times_20171119_20171125 s LEFT JOIN crosic.trips_20171119_20171125 t ON s.trip_id = t.trip_id
INNER JOIN crosic.routes_20171119_20171125 r ON t.route_id = r.route_id
WHERE route_short_name = '514' AND s.stop_id IN (SELECT to_stop_id FROM crosic.section_runs)), 

-- Same selection, for stops that appear as a from_stop_id.
from_stop_table AS (
SELECT stop_id from_stop, t.trip_id, direction_id, departure_time
FROM crosic.stop_times_20171119_20171125 s LEFT JOIN crosic.trips_20171119_20171125 t ON s.trip_id = t.trip_id
INNER JOIN crosic.routes_20171119_20171125 r ON t.route_id = r.route_id
WHERE route_short_name = '514' AND s.stop_id IN (SELECT from_stop_id FROM crosic.section_runs)
), 

-- Pair every from-stop row with every to-stop row of the same trip and direction.
stops AS (
SELECT to_stop, from_stop, t.direction_id, t.trip_id, arrival_time, departure_time 
FROM from_stop_table f JOIN to_stop_table t ON f.trip_id = t.trip_id and f.direction_id = t.direction_id
)

-- Per (to_stop, from_stop, direction_id): average time difference in minutes
-- and number of paired rows, keeping only pairs whose arrival is after the
-- departure.
SELECT to_stop, from_stop, direction_id, 
AVG(EXTRACT(EPOCH FROM arrival_time - departure_time)) /60 AS time_diff_minutes_gtfs, COUNT(*) cnt
FROM stops
WHERE arrival_time > departure_time
GROUP BY to_stop, from_stop, direction_id; 
```

## Compare trip counts of temporary tables from count queries

There might be an issue with how the data is being counted, since the `trip_id` counts between the TTC processed CIS data and our processed CIS data are very different (our data has up to 4x more trips than the TTC data). 

The first step in checking whether my queries that count the number of trips in our processed CIS data are correct is to look at the first part of those queries and inspect the temporary tables they create. 

An example of a count query (found in the `comparing_ttc_data` notebook) that counts the overall number of trips between the segments is: 
```sql 
-- Rows of our processed CIS data (route 504 table) at stops that appear as a
-- to_stop_id in crosic.section_runs.
WITH to_stop_table AS (
SELECT t.stop_id to_stop, trip_id, direction_id, arrival_time
FROM crosic.cis_504_11192017_11252017_tripids t
WHERE t.stop_id IN (SELECT to_stop_id FROM crosic.section_runs)
),

-- Same selection, for stops that appear as a from_stop_id.
from_stop_table AS (
SELECT t.stop_id from_stop, trip_id, direction_id, departure_time
FROM crosic.cis_504_11192017_11252017_tripids t 
WHERE t.stop_id IN (SELECT from_stop_id FROM crosic.section_runs)
), 

-- Pair every from-stop row with every to-stop row of the same trip and
-- direction, keeping pairs with distinct stops whose arrival is after the
-- departure.
stops AS (
SELECT to_stop, from_stop, t.direction_id, t.trip_id, arrival_time, departure_time -- , 
-- AVG(EXTRACT(EPOCH FROM arrival_time - departure_time)) /60 AS time_diff_minutes_our_cis
FROM from_stop_table f JOIN to_stop_table t ON f.trip_id = t.trip_id and f.direction_id = t.direction_id
WHERE arrival_time > departure_time and from_stop <> to_stop
-- GROUP BY to_stop, from_stop, t.direction_id
),

-- TTC-processed data: per route 504 segment, the row count and the average
-- time difference in minutes. Both times are rebuilt as timestamps on
-- journeydate before being subtracted.
-- NOTE(review): DISTINCT looks redundant because GROUP BY already covers every
-- non-aggregated column, and the ORDER BY inside this CTE is superseded by the
-- ORDER BY of the final SELECT.
-- NOTE(review): section_runs is unqualified here but crosic.section_runs
-- above; presumably the same table via the search path -- confirm.
ttc_cis AS (
SELECT DISTINCT fromstopname, tostopname, directionid, COUNT(*) cnt, to_stop_id, from_stop_id, routenumber,
AVG(EXTRACT(EPOCH FROM (to_char(journeydate, 'YYYY-MM-DD') || ' ' || to_char(toarrstoptime, 'HH24:MI:SS'))::timestamp
- (to_char(journeydate, 'YYYY-MM-DD') || ' ' || to_char(fromstopdepaturetime, 'HH24:MI:SS'))::timestamp)  ) /60
AS time_diff_minutes_ttc
FROM section_runs
WHERE routenumber = 504 and toarrstoptime > fromstopdepaturetime
GROUP BY fromstopname, tostopname, to_stop_id, from_stop_id, directionid, routenumber
ORDER BY time_diff_minutes_ttc
)

-- Match our stop pairs to the TTC segments on stop pair and direction, then
-- report the TTC count and average next to our own count and average.
SELECT routenumber, to_stop_id, from_stop_id,  fromstopname, tostopname, directionid, cnt cnt_ttc, time_diff_minutes_ttc, 
COUNT(*) cnt_our_cis , AVG(EXTRACT(EPOCH FROM arrival_time - departure_time)) /60 AS time_diff_minutes_our_cis
FROM stops s JOIN ttc_cis ttc ON s.to_stop = ttc.to_stop_id AND s.from_stop = ttc.from_stop_id 
WHERE s.direction_id = ttc.directionid and ttc.routenumber = 504 -- and arrival_time > departure_time
GROUP BY to_stop_id, from_stop_id,  fromstopname, tostopname, directionid, ttc.cnt, time_diff_minutes_ttc, routenumber
ORDER BY time_diff_minutes_ttc, time_diff_minutes_our_cis; 
```

To investigate this possibility further, the following queries count how many trips are being examined from both the TTC and the CIS datasets. My queries that count the number of trips use multiple temporary tables to get their counts: some temporary tables are created from our processed CIS data, and some are created from the TTC's processed CIS data. The queries below look at the number of trips that feed into the overall count query from each group of temporary tables — those that pull from our processed data and those that pull from the TTC's processed data.

A relatively similar number of trips is pulled from both sets of temporary tables, as the counts below show. 

#### Route 504

In [2]:
# Our processed CIS data, route 504.
# Rebuilds the to_stop_table and from_stop_table temporary tables of the
# overall count query (here restricted to section_runs rows of route 504),
# joins them on trip_id and direction_id as its `stops` table does, and counts
# the distinct trip_ids left after dropping pairs with identical stops.
# NOTE(review): unlike `stops` in the overall count query, this does not apply
# the `arrival_time > departure_time` filter -- confirm that is intended.
sql_debug1_504 = '''

WITH to_stop_table AS (
SELECT t.stop_id to_stop, trip_id, direction_id, arrival_time
FROM crosic.cis_504_11192017_11252017_tripids t
WHERE t.stop_id IN (SELECT to_stop_id FROM crosic.section_runs WHERE routenumber = 504)
),

from_stop_table AS (
SELECT t.stop_id from_stop, trip_id, direction_id, departure_time
FROM crosic.cis_504_11192017_11252017_tripids t 
WHERE t.stop_id IN (SELECT from_stop_id FROM crosic.section_runs WHERE routenumber = 504)
)

SELECT COUNT(DISTINCT t.trip_id) 
FROM from_stop_table f INNER JOIN to_stop_table t ON f.trip_id = t.trip_id and f.direction_id = t.direction_id
WHERE from_stop <> to_stop; 
'''

# Run the query and show the single-row result.
pandasql.read_sql(sql_debug1_504, con)

Unnamed: 0,count
0,3551


In [3]:
# TTC processed data, route 504.
# Counterpart of the ttc_cis temporary table: counts the distinct ah_tripid
# values in section_runs for this route, ignoring rows whose from and to stops
# are the same.
# NOTE(review): ttc_cis in the overall count query also filters on
# `toarrstoptime > fromstopdepaturetime`, which is not applied here.
sql_debug2_504 = '''

SELECT COUNT(DISTINCT ah_tripid)
FROM section_runs
WHERE routenumber = 504 AND from_stop_id <> to_stop_id; 
'''

# Run the query and show the single-row result.
pandasql.read_sql(sql_debug2_504, con)

Unnamed: 0,count
0,3724


#### Route 514

In [9]:
# Our processed CIS data, route 514.
# Builds to_stop_table and from_stop_table from the route 514 CIS table
# (restricted to stops used by section_runs rows of route 514), joins them on
# trip_id and direction_id, and counts the distinct trip_ids left after
# dropping pairs with identical stops.
# NOTE(review): no `arrival_time > departure_time` filter is applied here --
# confirm that is intended.
sql_debug1_514 = '''

WITH to_stop_table AS (
SELECT t.stop_id to_stop, trip_id, direction_id, arrival_time
FROM crosic.cis_514_11192017_11252017_tripids t
WHERE t.stop_id IN (SELECT to_stop_id FROM crosic.section_runs WHERE routenumber = 514)
),

from_stop_table AS (
SELECT t.stop_id from_stop, trip_id, direction_id, departure_time
FROM crosic.cis_514_11192017_11252017_tripids t 
WHERE t.stop_id IN (SELECT from_stop_id FROM crosic.section_runs WHERE routenumber = 514)
)

SELECT COUNT(DISTINCT t.trip_id) 
FROM from_stop_table f INNER JOIN to_stop_table t ON f.trip_id = t.trip_id and f.direction_id = t.direction_id
WHERE from_stop <> to_stop; 
'''

# Run the query and show the single-row result.
pandasql.read_sql(sql_debug1_514, con)

Unnamed: 0,count
0,1275


In [8]:
# TTC processed data, route 514.
# Counts the distinct ah_tripid values in section_runs for this route,
# ignoring rows whose from and to stops are the same.
# NOTE(review): no `toarrstoptime > fromstopdepaturetime` filter is applied
# here -- confirm that is intended.
sql_debug2_514 = '''

SELECT COUNT(DISTINCT ah_tripid)
FROM section_runs
WHERE routenumber = 514 AND from_stop_id <> to_stop_id; 
'''

# Run the query and show the single-row result.
pandasql.read_sql(sql_debug2_514, con)

Unnamed: 0,count
0,1351


In [10]:
# Close the database connection opened at the top of the notebook.
con.close()