In [2]:
#Imports
import sys
import json
import csv
import yaml
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

import time
from datetime import datetime
import pprint
import psycopg2
from sqlalchemy import create_engine, text as sql_text
#Import Util
sys.path.append('benchmarking/')
import util

#LOAD ENVIRONMENT VARIABLES
dotenv_path = 'variables.env'
load_dotenv(dotenv_path=dotenv_path)

schema = os.getenv('DISC_6_SCHEMA')
port = os.getenv('DISC_6_PORT')
host = os.getenv('DISC_6_HOST')
database = os.getenv('DISC_6_DB')
password = os.getenv('DISC_6_PASSWORD')
connection = os.getenv('DISC_6_CONNECTION')

#CREATE ENGINE
db_eng = create_engine(f"postgresql+psycopg2://{connection}:{password}@{host}:{port}/{database}",
                       connect_args={'options': '-csearch_path={}'.format(schema)},
                       isolation_level = 'SERIALIZABLE')

print("Successfully created db engine.")

Successfully created db engine.


In [4]:
#1A SETUP FOR TEST 1

qupdate = """
 alter table reviews
     add column datetime timestamp;
"""

qimport = """update reviews
     set datetime = TO_TIMESTAMP((TO_CHAR(date, 'YYYY-MM-DD') || ' 12:00:00'),
                         'YYYY-MM-DD hh24:mi:ss')::timestamp without time zone;"""

with db_eng.connect() as conn:
    result = conn.execute(sql_text(qupdate))
    result = conn.execute(sql_text(qimport))


ProgrammingError: (psycopg2.errors.DuplicateColumn) column "datetime" of relation "reviews" already exists

[SQL: 
 alter table reviews
     add column datetime timestamp;
     
]
(Background on this error at: https://sqlalche.me/e/14/f405)

In [None]:
#1B SETUP for TEST 1, find number of 
q_yearinfo = """
 select COUNT(*)
 FROM reviews
 
 SORT BY date
     
"""

In [4]:
#1 RUN TEST 
all_indexes = [['datetime','reviews'], ['id','listings']]
perf_summary = util.fetch_perf_data('perf_summary.json')

count = 5
# store all of the index combinations to test
specs = [[],[['datetime','reviews']],[['id','listings']],[['datetime','reviews'],['id','listings']]] 
for spec in specs:
    
    # set up the correct indices
    print('Processing spec: ', str(spec), '\n')
    for index in all_indexes:
        if index not in spec:
            mod_index = util.add_drop_index(db_eng, 'drop', index[0], index[1])
            print('\nAfter doing the drop for', str(index), 'the indexes on table "' + index[1] + '" are: ')
            print(mod_index)

    for index in spec:
        mod_index = util.add_drop_index(db_eng, 'add', index[0], index[1])
        print('\nAfter doing the add for', str(index), 'the indexes on table "' + index[1] + '" are: ')
        print(mod_index)
    

    for i in range(0, count):
        print(i)

Processing spec:  [] 


Index name:datetime_in_reviews
QUERY TO EXECUTE:
BEGIN TRANSACTION;
DROP INDEX IF EXISTS datetime_in_reviews;
 END TRANSACTION;


After doing the drop for ['datetime', 'reviews'] the indexes on table "reviews" are: 
[('new_york_city', 'reviews', 'date_in_reviews', None, 'CREATE INDEX date_in_reviews ON new_york_city.reviews USING btree (date)')]

Index name:id_in_listings
QUERY TO EXECUTE:
BEGIN TRANSACTION;
DROP INDEX IF EXISTS id_in_listings;
 END TRANSACTION;


After doing the drop for ['id', 'listings'] the indexes on table "listings" are: 
[]
0
1
2
3
4
Processing spec:  [['datetime', 'reviews']] 


Index name:id_in_listings
QUERY TO EXECUTE:
BEGIN TRANSACTION;
DROP INDEX IF EXISTS id_in_listings;
 END TRANSACTION;


After doing the drop for ['id', 'listings'] the indexes on table "listings" are: 
[]

Index name:datetime_in_reviews
QUERY TO EXECUTE:
BEGIN TRANSACTION;
CREATE INDEX IF NOT EXISTS datetime_in_reviews
ON reviews(datetime);
 END TRANSACTION;


Af