In [None]:
import duckdb
import time

import os

# Source database for the benchmark; it is copied over DB_PATH before each
# percentage run and is never opened directly by this script.
DEFAULT_DB_PATH = "dbs/job_imdb-parachute-4-0-1.duckdb"

# Scratch copy that the benchmark opens read-write and mutates.
DB_PATH = "dbs/test-insert.duckdb"

# Thread count handed to utils.open_duckdb and embedded in the result file
# name. NOTE(review): os.cpu_count() may return None when the CPU count cannot
# be determined -- confirm utils.open_duckdb accepts threads=None.
NUM_THREADS = os.cpu_count()

# Inclusive upper bound of each production-year bucket. The position of a
# bound in the tuple is the bucket number it maps to.
_PRODUCTION_YEAR_BUCKET_BOUNDS = (
    1929, 1963, 1974, 1982, 1991, 1996, 1999, 2002,
    2004, 2006, 2007, 2008, 2009, 2010, 2011,
)

# SQL CASE expression that maps title.production_year to a bucket number:
# NULL goes to bucket 0, each year goes to the first bucket whose upper bound
# it does not exceed, and anything past the last bound goes to the final
# bucket (one past the last index).
title_sql = ''.join(
    ['case when title.production_year is null then 0\n']
    + [
        f'    when title.production_year <= {upper} then {bucket}\n'
        for bucket, upper in enumerate(_PRODUCTION_YEAR_BUCKET_BOUNDS)
    ]
    + [f'    else {len(_PRODUCTION_YEAR_BUCKET_BOUNDS)} end\n']
)

import shutil

import utils


def _timed_execute(con, sql):
    """Run ``sql`` on ``con`` and return the elapsed time in nanoseconds.

    Only the ``con.execute`` call is inside the timed region.
    """
    # perf_counter_ns is monotonic, so the measured interval cannot be
    # distorted by wall-clock adjustments the way time.time_ns() can.
    # NOTE(review): the result is never fetched. For the SELECT variants,
    # confirm that DuckDB evaluates the whole query inside execute() rather
    # than lazily on fetch, otherwise the join timings are underestimates.
    start_time = time.perf_counter_ns()
    con.execute(sql)
    return time.perf_counter_ns() - start_time


def _measure_variant(variant, percentage):
    """Time one benchmark variant against the scratch database at DB_PATH.

    A fresh ``cast_info_sample`` table holding the first ``percentage`` percent
    of ``cast_info`` rows (ordered by id) is created, then a single statement
    selected by ``variant`` is timed:

    *  0: insert the sample back into ``cast_info`` with shifted ids
    * -1: join the sample with ``title`` and compute the production-year bucket
    *  1: UPDATE ``parachute_title_production_year`` on the sample from ``title``
    * -2: join the sample with ``title`` and project ``helper_title_title``
    *  2: UPDATE ``parachute_title_title`` on the sample from ``title``
    * -3: join computing both derived columns
    *  3: UPDATE setting both columns

    Args:
        variant: One of the integer codes listed above.
        percentage: Sample size as a percentage of the current ``cast_info``
            row count.

    Returns:
        Elapsed time of the timed statement, in nanoseconds.

    Raises:
        ValueError: If ``variant`` is not one of the known codes.
    """
    con = utils.open_duckdb(DB_PATH, read_only=False, threads=NUM_THREADS)
    try:
        # NOTE(review): variant 0 appends rows to cast_info and the database
        # is re-copied only once per percentage, so variants that run after it
        # see a larger table_size and hence a slightly larger sample. Confirm
        # this is intended.
        table_size = con.sql('select count(*) as table_size from cast_info;').df()['table_size'].values[0]
        last_id = con.execute('SELECT MAX(id) FROM cast_info').fetchone()[0] or 0

        print(f'\nvariant={variant}')

        con.execute('DROP TABLE IF EXISTS cast_info_sample')

        sample_size = int(percentage * table_size / 100)
        print(f'table_size={table_size}, sample_size={sample_size}')

        con.execute(f'''
            CREATE TABLE cast_info_sample AS 
            SELECT * FROM cast_info 
            ORDER BY id
            LIMIT {sample_size}
        ''')

        print('Created sample!')

        if variant == 0:
            # Baseline: plain insert of the sample, with ids shifted by
            # last_id + 1 so they land above the current maximum id.
            label = 'insert_time'
            sql_query = f'''
                INSERT INTO cast_info
                SELECT id + {last_id + 1}, * EXCLUDE(id) FROM cast_info_sample
            '''
        elif variant == -1:
            # Join only: compute the production-year bucket without writing it.
            label = 'join_parachute_year_time'
            sql_query = f'''
                select cis.*, ({title_sql.replace("title.", "t.")}) as tmp
                from cast_info_sample cis, title t
                WHERE t.id = cis.movie_id;
            '''
            print(sql_query)
        elif variant == 1:
            # UPDATE writing the production-year bucket into the sample.
            label = 'update_parachute_year_time'
            sql_query = f'''
                UPDATE cast_info_sample
                SET parachute_title_production_year = ({title_sql})
                FROM title
                WHERE title.id = cast_info_sample.movie_id;
            '''
        elif variant == -2:
            # Join only: fetch helper_title_title without writing it.
            label = 'join_parachute_title_time'
            sql_query = '''
                select cis.*, t.helper_title_title as tmp
                from cast_info_sample cis, title t
                WHERE t.id = cis.movie_id;
            '''
        elif variant == 2:
            # UPDATE writing helper_title_title into the sample.
            label = 'update_parachute_title_time'
            sql_query = '''
                UPDATE cast_info_sample
                SET parachute_title_title = title.helper_title_title
                FROM title
                WHERE title.id = cast_info_sample.movie_id;
            '''
        elif variant == -3:
            # Join only: compute both derived columns without writing them.
            label = 'join_both_time'
            sql_query = f'''
                select cis.*, ({title_sql.replace("title.", "t.")}) as tmp1, t.helper_title_title as tmp2
                from cast_info_sample cis, title t
                WHERE t.id = cis.movie_id;
            '''
        elif variant == 3:
            # UPDATE writing both derived columns into the sample.
            label = 'update_both_time'
            sql_query = f'''
                UPDATE cast_info_sample
                SET
                    parachute_title_title = title.helper_title_title,
                    parachute_title_production_year = ({title_sql})
                FROM title
                WHERE title.id = cast_info_sample.movie_id;
            '''
        else:
            # Fail loudly instead of silently recording None in the results.
            raise ValueError(f'unknown variant: {variant}')

        elapsed = _timed_execute(con, sql_query)
        print(f'{label}={elapsed}')
        return elapsed
    finally:
        # Release the database file even if a statement fails.
        con.close()


# Results: ret[percentage][variant] -> elapsed nanoseconds.
ret = {}
for percentage in [0.1, 0.5, 1.0, 5.0]:
    print(f'percentage={percentage}')

    # Start each percentage from a fresh copy of the source database.
    # shutil.copyfile raises if the source is missing or the copy fails, so a
    # stale DB_PATH left by an earlier iteration can never be used by mistake.
    shutil.copyfile(DEFAULT_DB_PATH, DB_PATH)

    print('Copied!')

    ret[percentage] = dict()
    for variant in [0, -1, 1, 2, -2, 3, -3]:
        ret[percentage][variant] = _measure_variant(variant, percentage)

        print(ret)

# Persist all timings; the file name records the thread count used.
utils.write_json(f'insert-{NUM_THREADS}.json', ret)