In [3]:
import pandas as pd
import sqlite3
import time
import numpy as np

path = '/home/td/Documents'


In [4]:
url = 'https://s3-us-west-2.amazonaws.com/pcadsassessment/parking_citations.corrupted.csv'
df = pd.read_csv(url)
df.to_csv('{path}/tickets.csv'.format(path = path), index = False)
    

  interactivity=interactivity, compiler=compiler, result=result)


In [100]:
def pandas_most_common_makes(df):
    return df['Make'].value_counts()[:25].index.tolist()


def sqlite_most_common_names(conn):
    res = conn.execute('''Select Make, COUNT(Make) As count_col
                    from tickets
                    group by Make
                    order by count_col DESC
                    LIMIT 25;''')
    
    res = [i[0] for i in res]
    print('here', res)
    return res


def pandas_most_common_color_per_make(df):
    result_series =  df.groupby('Make')['Color'].agg(lambda x: x.value_counts(dropna = False).index[0])
    return result_series.to_dict()


def sqlites_most_common_color_per_make(conn):
    res = conn.execute('''SELECT FinalTable.Make, FinalTable.Color    
                            FROM    
                             ((SELECT tickets.Make, tickets.Color, Count(tickets.Color) AS color_count
                              FROM tickets
                              GROUP BY tickets.Make, tickets.Color) As CountMakeColor
                            JOIN
                             (SELECT dT.Make, Max(dT.color_count) As max_color_count
                              FROM
                                   (SELECT tickets.Make, tickets.Color, Count(tickets.Color) AS color_count
                                    FROM tickets
                                    GROUP BY tickets.Make, tickets.Color) As dT
                              GROUP BY dT.Make) As MaxColorCount
                              
                        ON CountMakeColor.Make = MaxColorCount.Make 
                        AND CountMakeColor.color_count = MaxColorCount.max_color_count) as FinalTable''')

    res_dict = dict()
    for i in res:
        res_dict[i[0]] = i[1]
    return res_dict


def pandas_first_ticket_per_make(df):
    df_sorted = df.sort_values(by = ['Issue Date', "Issue time"])
    result_series = df_sorted.groupby('Make')['Ticket number'].agg(lambda x: x.tolist()[0])
    return result_series.to_dict()


def sqlite_first_ticket_per_make(conn):
    #TODO: add time to comparison
    res = conn.execute('''select Make, "Ticket number"
                                from tickets JOIN 
                                (select Make as group_make, min("Issue Date") as earliest_date 
                                from tickets 
                                group by group_make) 
                                on group_make = make and earliest_date = "Issue Date";''')

    res_dict = dict()
    for i in res:
        res_dict[i[0]] = i[1]
    return res_dict

In [101]:
def run_timings(df, n):
    if n:
        df = df.sample(n=n)
        
    results = dict()
    results['number_of_records'] = df.shape[0]
    
    
    with sqlite3.connect(':memory:') as conn_mem, sqlite3.connect('tickets.db') as conn_disk:
        df.to_sql('tickets', conn_mem, if_exists='replace')
        df.to_sql('tickets', conn_disk, if_exists='replace')
        
        pandas_q1_start = time.time()
        res_pd = pandas_most_common_makes(df)
        pandas_q1_end = time.time()
        
        sql_q1_disk_start = time.time()
        res_sql_disk = sqlite_most_common_names(conn_disk)
        sql_q1_disk_end = time.time()
        
        sql_q1_mem_start = time.time()
        res_sql_mem = sqlite_most_common_names(conn_mem)
        sql_q1_mem_end = time.time()
        
        results['q1_pandas_time'] = pandas_q1_end - pandas_q1_start
        results['q1_sql_disk_time'] = sql_q1_disk_end - sql_q1_disk_start
        results['q1_sql_mem_time'] = sql_q1_mem_end - sql_q1_mem_start
        print(results)
        
        try:
            assert sorted(res_pd) == sorted(res_sql_disk)
        except:
            print(sorted(res_pd))
            print(sorted(res_sql_mem))
            print('difference between pandas and sql, could still be valid in cases of ties: {}'.format(set(res_pd) ^ set(res_sql_disk)))

            
        # q2
        print()
        print('q2')
        pandas_q2_start = time.time()
        res_pd = pandas_most_common_color_per_make(df)
        pandas_q2_end = time.time()
        print(pandas_q2_end - pandas_q2_start)

        sql_q2_disk_start = time.time()
        res_sql_disk = sqlites_most_common_color_per_make(conn_disk)
        sql_q2_disk_end = time.time()
        print(sql_q2_disk_end - sql_q2_disk_start)

        sql_q2_mem_start = time.time()
        res_sql_mem = sqlites_most_common_color_per_make(conn_mem)
        sql_q2_mem_end = time.time()
        print(sql_q2_mem_end - sql_q2_mem_start)
        
        results['q2_pandas_time'] = pandas_q2_end - pandas_q2_start
        results['q2_sql_disk_time'] = sql_q2_disk_end - sql_q2_disk_start
        results['q2_sql_mem_time'] = sql_q2_mem_end - sql_q2_mem_start
        
        
        try:
            assert res_pd == res_sql_disk
        except:
            set1 = set(res_pd.items())
            set2 = set(res_sql_disk.items())
            print(set1 ^ set2)
            print('difference between pandas and sql, could still be valid in cases of ties: {}'.format(set1 ^ set2))
            

        # q3
        print()
        print('q3')
        pandas_q3_start = time.time()
        res_pd = pandas_first_ticket_per_make(df)
        pandas_q3_end = time.time()
        print(pandas_q2_end - pandas_q2_start)

        sql_q3_disk_start = time.time()
        res_sql_disk = sqlite_first_ticket_per_make(conn_disk)
        sql_q3_disk_end = time.time()
        print(sql_q3_disk_end - sql_q3_disk_start)

        sql_q3_mem_start = time.time()
        res_sql_mem = sqlite_first_ticket_per_make(conn_mem)
        sql_q3_mem_end = time.time()
        print(sql_q3_mem_end - sql_q3_mem_start)
        
        results['q3_pandas_time'] = pandas_q3_end - pandas_q3_start
        results['q3_sql_disk_time'] = sql_q3_disk_end - sql_q3_disk_start
        results['q3_sql_mem_time'] = sql_q3_mem_end - sql_q3_mem_start

        try:
            assert sorted(res_pd) == sorted(res_sql_disk)
            assert sorted(res_sql_disk) == sorted(res_sql_mem)
        except:
            print(res_pd)
            print(res_sql_disk)
            print(res_sql_mem)
            raise AssertionError
        
        return [results]
            
            
        

In [102]:
results = []

df_with_make = df.dropna(subset= ['Make'])

for i in [1000, 2000, 4000, 8000, 16000, 32000, 64000, 128000, 256000, None]:
    results.extend(run_timings(df_with_make, i))
    
res_df = pd.DataFrame.from_dict(results)

    

here ['TOYT', 'HOND', 'FORD', 'NISS', 'CHEV', 'BMW', 'MERZ', 'KIA', 'VOLK', 'HYUN', 'DODG', 'LEXS', 'JEEP', 'AUDI', 'MAZD', 'GMC', 'OTHR', 'ACUR', 'TOYO', 'CHRY', 'INFI', 'SUBA', 'LNDR', 'MITS', 'MNNI']
here ['TOYT', 'HOND', 'FORD', 'NISS', 'CHEV', 'BMW', 'MERZ', 'KIA', 'VOLK', 'HYUN', 'DODG', 'LEXS', 'JEEP', 'AUDI', 'MAZD', 'GMC', 'OTHR', 'ACUR', 'TOYO', 'CHRY', 'INFI', 'SUBA', 'LNDR', 'MITS', 'MNNI']
{'number_of_records': 1000, 'q1_pandas_time': 0.0011854171752929688, 'q1_sql_disk_time': 0.0004982948303222656, 'q1_sql_mem_time': 0.0006983280181884766}
['ACUR', 'AUDI', 'BMW', 'CHEV', 'CHRY', 'DODG', 'FORD', 'GMC', 'HOND', 'HYUN', 'INFI', 'JEEP', 'KIA', 'LEXS', 'LNDR', 'MAZD', 'MERZ', 'MITS', 'NISS', 'OTHR', 'SUBA', 'TOYO', 'TOYT', 'VOLK', 'VOLV']
['ACUR', 'AUDI', 'BMW', 'CHEV', 'CHRY', 'DODG', 'FORD', 'GMC', 'HOND', 'HYUN', 'INFI', 'JEEP', 'KIA', 'LEXS', 'LNDR', 'MAZD', 'MERZ', 'MITS', 'MNNI', 'NISS', 'OTHR', 'SUBA', 'TOYO', 'TOYT', 'VOLK']
difference between pandas and sql, could sti

OperationalError: near ")": syntax error

In [97]:
res_df['q1_sql_speed_difference'] = res_df['q1_sql_mem_time']/res_df['q1_pandas_time']
res_df['q2_sql_speed_difference'] = res_df['q2_sql_mem_time']/res_df['q2_pandas_time']
res_df['q3_sql_speed_difference'] = res_df['q3_sql_mem_time']/res_df['q3_pandas_time']

In [98]:
res_df

Unnamed: 0,number_of_records,q1_pandas_time,q1_sql_disk_time,q1_sql_mem_time,q2_pandas_time,q2_sql_disk_time,q2_sql_mem_time,q3_pandas_time,q3_sql_disk_time,q3_sql_mem_time,q1_sql_speed_difference,q2_sql_speed_difference,q3_sql_speed_difference
0,1000,0.001292,0.000437,0.000542,0.032973,0.001357,0.001264,0.004226,0.000614,0.000536,0.419819,0.038323,0.126869
1,2000,0.001033,0.000596,0.000681,0.031511,0.001774,0.001703,0.004261,0.00101,0.000974,0.658897,0.054037,0.228488
2,4000,0.001161,0.001065,0.001147,0.035756,0.00342,0.003305,0.006408,0.002155,0.00204,0.987677,0.092443,0.318327
3,8000,0.002164,0.004309,0.002671,0.059387,0.013836,0.008451,0.007716,0.007383,0.007966,1.234575,0.142299,1.032381
4,16000,0.001744,0.00418,0.004746,0.060485,0.017065,0.014569,0.016195,0.009354,0.008144,2.720749,0.240862,0.502885
5,32000,0.002737,0.008286,0.008099,0.067023,0.028697,0.028175,0.020543,0.017242,0.016897,2.959317,0.42038,0.822522
6,64000,0.004294,0.021107,0.019279,0.099973,0.071681,0.073004,0.038789,0.044285,0.041662,4.489534,0.730242,1.07406
7,128000,0.007845,0.039917,0.039057,0.131376,0.146188,0.134304,0.071325,0.091121,0.075312,4.978786,1.022287,1.0559
8,256000,0.014963,0.079597,0.08012,0.178373,0.286304,0.284016,0.134186,0.159091,0.155036,5.354424,1.592257,1.155381
9,4357544,0.197369,1.541899,1.499423,1.127066,6.160068,5.95527,1.578308,3.006889,3.095549,7.597057,5.283869,1.961309


My metrics show that my pandas code is faster at large data sizes and scales better. I find the pandas much simpler to write and understand, it is objectively much shorter.

I was surprized that sqlite was slower, it is possible that my sql code is not optimal but the benchmarks linked below show that pandas can outperform sqlite for joining and grouping operations. My code uses both.

https://blog.thedataincubator.com/2018/05/sqlite-vs-pandas-performance-benchmarks/


In [99]:
df.head()

Unnamed: 0,Ticket number,Issue Date,Issue time,Meter Id,Marked Time,RP State Plate,Plate Expiry Date,VIN,Make,Body Style,Color,Location,Route,Agency,Violation code,Violation Description,Fine amount,Latitude,Longitude
0,1103341116,2015-12-21T00:00:00,1251.0,,,CA,200304.0,,,PA,GY,13147 WELBY WAY,01521,1.0,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0
1,1103700150,2015-12-21T00:00:00,1435.0,,,CA,201512.0,,,VN,WH,525 S MAIN ST,1C51,1.0,4000A1,NO EVIDENCE OF REG,50.0,99999.0,99999.0
2,1104803000,2015-12-21T00:00:00,2055.0,,,CA,201503.0,,,PA,BK,200 WORLD WAY,2R2,2.0,8939,WHITE CURB,58.0,6439997.9,1802686.4
3,1104820732,2015-12-26T00:00:00,1515.0,,,CA,,,,PA,WH,100 WORLD WAY,2F11,2.0,000,17104h,,6440041.1,1802686.2
4,1105461453,2015-09-15T00:00:00,115.0,,,CA,200316.0,,CHEV,PA,BK,GEORGIA ST/OLYMPIC,1FB70,1.0,8069A,NO STOPPING/STANDING,93.0,99999.0,99999.0
