In [1]:
import os
import io
import datetime
import logging
import sys

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 

import civis
import civis.io
from civis.futures import CivisFuture

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, RandomForestRegressor
from civis.ml import ModelPipeline

import uuid
import json
from pprint import pprint
import tempfile
import concurrent.futures
from concurrent.futures import wait
from collections import namedtuple

In [6]:
# Default feature lists for Rainbow Modeling Frame (each number corresponds to number of features)
feature_table = civis.io.read_civis_sql(sql='''select * from bernie_cherdeman.feature_list order by sort_order asc''', use_pandas = True, database='Bernie DS')
feature_list_large = list(feature_table[(feature_table['frame_large'] == 1)]['feature_name']) + ['state_code']
feature_list_medium = list(feature_table[(feature_table['frame_medium'] == 1)]['feature_name']) + ['state_code']
feature_list_small = list(feature_table[(feature_table['frame_small'] == 1)]['feature_name']) + ['state_code']

additional_features = ['attempt_count', 'num_phones_1', 'num_phones_2', 'num_phones_3', 'num_phones_other']

feature_list_large.extend(additional_features)
feature_list_medium.extend(additional_features)
feature_list_small.extend(additional_features)

table_columns = civis.io.read_civis_sql(
    sql=f'''select ordinal_position as position, column_name, data_type 
    from information_schema.columns 
    where table_name = 'rainbow_modeling_frame' and table_schema = 'bernie_data_commons' and column_name != 'person_id'
    order by ordinal_position;''', use_pandas = True, database='Bernie DS')

#exclusion_list_466 = [e for e in list(table_columns['column_name']) if e not in feature_list_466] 

In [7]:
len(feature_list_small)

204

In [8]:
# USER INPUT CELL

# DV table parameters
DATABASE = 'Bernie DS'
# Primary key in both the DV table and the Modeling Frame
PRIMARY_KEY = 'person_id' 
# Table containing recoded Dependent Variables keyed to the PRIMARY_KEY
DV_TABLE = 'bernie_cherdeman.contactibility_outcomes_cell'
# List of binarized dependent variables (accepts 1, 0, and null values) in DV_TABLE
DV_LIST = [ 'pickup_last', 'id_last'] # 'pickup_first', 'id_first',


# Modeling frame table parameters
# Table containing covariates and keyed to PRIMARY_KEY
MODELING_FRAME = 'bernie_cherdeman.contactibility_modeling_frame_cell'
# Columns in the Modeling Frame to exclude from feature list (i.e., strings or incomplete coverage)
EXCLUSION_COLUMNS = ['jsonid','state_code','census_block_group_2010', 'person_id', 'us_region', 
                     'pickup_first', 'pickup_last', 'pickup_ever', 'id_first', 'id_last', 'id_ever', 'is_cell_phone']

# Output table parameters
# Schema to contain prediction tables
SCHEMA = 'bernie_cherdeman'
# String that will be concatenated in front of the output table's name
PREFIX = 'getthru'

# Sampling parameters
# Non-response training data
    # True: automatically select people not in DV_TABLE at random from Phoenix (assumes person_id is PRIMARY_KEY)
    # False: automatically select people where the DV equals 0 from the DV_TABLE
SAMPLE_FROM_PHOENIX = False
# Number of non-response classes per target class (default is 2) 
CLASS_BALANCE = 2
# Maximum number of targets to randomly sample from DV_TABLE
MAX_TARGET_COUNT = 40000

In [9]:
datestamp = '{:%Y%m%d}'.format(datetime.date.today())

In [10]:
# Counts of positive classes
dv_sql_targets = "\n,".join(["sum({dv}) as {dv}".format(dv=i) for i in DV_LIST])
sql_collapse_targets = f"""select {dv_sql_targets} from {DV_TABLE};"""
sql_count_targets = civis.io.read_civis_sql(sql_collapse_targets, DATABASE)


In [11]:
sql_count_targets

[['pickup_last', 'id_last'], ['169865', '176807']]

In [None]:
# Determing training table proportion of positives to negatives (to avoid class imbalance problems)
# sample_share = []
# for i in range(len(DV_LIST)):
#     if int(sql_count_targets[1][i]) > MAX_TARGET_COUNT:
#         sql_count_targets[1][i] = MAX_TARGET_COUNT
#     u = round(int(sql_count_targets[1][i])*CLASS_BALANCE)
#     sample_share.append(u)      

In [None]:
# for i in range(len(DV_LIST)):
#     dv_item = DV_LIST[i]
#     random_sample = sample_share[i]
#     if SAMPLE_FROM_PHOENIX is True:
#         zero_sample = f'''(select p.person_id, 0 as {dv_item} from phoenix_analytics.person p left join (select person_id from {DV_TABLE}) d on p.person_id = d.person_id where d.person_id is null and is_deceased = false and reg_record_merged = false and reg_on_current_file = true and reg_voter_flag = true order by random() limit {random_sample})'''   
#     if SAMPLE_FROM_PHOENIX is False:
#         zero_sample = f'''(select {PRIMARY_KEY}, {dv_item} from {DV_TABLE} where {dv_item} = 0 order by random() limit {random_sample})'''


In [17]:
# Create training tables
#for i, dv_item in enumerate(DV_LIST):
#     if (int(sql_count_targets[1][i])*3) <= 1000:
    #feature_select = "\n,".join(["{feature}".format(feature=f) for f in feature_list_small])
#     if (int(sql_count_targets[1][i])*3) > 1000 & (int(sql_count_targets[1][i])*3) <= 2000:
         
    #feature_select = "\n,".join(["{feature}".format(feature=f) for f in feature_list_medium])
#     if (int(sql_count_targets[1][i])*3) > 2000:
    #feature_select = "\n,".join(["{feature}".format(feature=f) for f in feature_list_large])
    #dv_item = DV_LIST[i]
#print(dv_item)
#     random_sample = sample_share[i]
#     if SAMPLE_FROM_PHOENIX is True:
#         zero_sample = f'''(select p.person_id, 0 as {dv_item} from phoenix_analytics.person p left join (select person_id from {DV_TABLE}) d on p.person_id = d.person_id where d.person_id is null and is_deceased = false and reg_record_merged = false and reg_on_current_file = true and reg_voter_flag = true order by random() limit {random_sample})'''   
#     if SAMPLE_FROM_PHOENIX is False:
#         zero_sample = f'''(select {PRIMARY_KEY}, {dv_item} from {DV_TABLE} where {dv_item} = 0 order by random() limit {random_sample})'''
feature_select = "\n,".join(["{feature}".format(feature=f) for f in feature_list_small])
dvs = "\n,".join(["{dv}".format(dv=d) for d in DV_LIST])

regions = ['South Atlantic',
            'New England',
            'Pacific',
            'West South Central',
            'East Midwest',
            'West Midwest',
            'Other',
            'East South Central',
            'Mountain']

for i, region in enumerate(regions):

    training_sql = f"""DROP TABLE IF EXISTS {SCHEMA}.{PREFIX}_training_{i} CASCADE;
                           CREATE TABLE {SCHEMA}.{PREFIX}_training_{i} DISTKEY({PRIMARY_KEY}) SORTKEY({PRIMARY_KEY}) AS (
                               select * 
                               from (select {PRIMARY_KEY}, {dvs} 
                                   from {DV_TABLE}
                                   where stratification < 5000
                                   and us_region = '{region}')
                               join (select {PRIMARY_KEY}, {feature_select}
                                     from {MODELING_FRAME}
                                     where us_region = '{region}')
                               using({PRIMARY_KEY})
                               );"""
    print(training_sql)
    create_training_sql = civis.io.query_civis(training_sql, database=DATABASE)
    create_training_sql.result().state
    print(f"{region} done")

# training_sql2 = f"""DROP TABLE IF EXISTS {SCHEMA}.{PREFIX}_training_1 CASCADE;
#                        CREATE TABLE {SCHEMA}.{PREFIX}_training_1 DISTKEY({PRIMARY_KEY}) SORTKEY({PRIMARY_KEY}) AS (
#                            select * 
#                            from (select {PRIMARY_KEY}, {dvs} 
#                                from {DV_TABLE}
#                                where stratification between 2500 and 5000)
#                            join (select {PRIMARY_KEY}, {feature_select}
#                                  from {MODELING_FRAME})
#                            using({PRIMARY_KEY})
#                            );"""
# training = [training_sql1, training_sql2]

# for t in training:
#     create_training_sql = civis.io.query_civis(t, database=DATABASE)
#     create_training_sql.result().state
#     print("one down")
    
    

DROP TABLE IF EXISTS bernie_cherdeman.getthru_training_0 CASCADE;
                           CREATE TABLE bernie_cherdeman.getthru_training_0 DISTKEY(person_id) SORTKEY(person_id) AS (
                               select * 
                               from (select person_id, pickup_last
,id_last 
                                   from bernie_cherdeman.contactibility_outcomes_cell
                                   where stratification < 5000
                                   and us_region = 'South Atlantic')
                               join (select person_id, civis_2020_marriage
,civis_2020_children_present
,civis_2020_partisanship
,civis_2020_ideology_liberal
,civis_2020_spanish_language_preference
,civis_2018_turnout
,civis_2018_partisanship
,civis_2018_gotv
,civis_2018_ballot_dropoff
,civis_2018_congressional_gotv_raw
,civis_2018_congressional_support
,civis_2018_avev
,dnc_2018_college_graduate
,dnc_2018_income_dollars
,dnc_2018_high_school_only
,dnc_2018_income_rank
,civi

New England done
DROP TABLE IF EXISTS bernie_cherdeman.getthru_training_2 CASCADE;
                           CREATE TABLE bernie_cherdeman.getthru_training_2 DISTKEY(person_id) SORTKEY(person_id) AS (
                               select * 
                               from (select person_id, pickup_last
,id_last 
                                   from bernie_cherdeman.contactibility_outcomes_cell
                                   where stratification < 5000
                                   and us_region = 'Pacific')
                               join (select person_id, civis_2020_marriage
,civis_2020_children_present
,civis_2020_partisanship
,civis_2020_ideology_liberal
,civis_2020_spanish_language_preference
,civis_2018_turnout
,civis_2018_partisanship
,civis_2018_gotv
,civis_2018_ballot_dropoff
,civis_2018_congressional_gotv_raw
,civis_2018_congressional_support
,civis_2018_avev
,dnc_2018_college_graduate
,dnc_2018_income_dollars
,dnc_2018_high_school_only
,dnc_2018_income_

West South Central done
DROP TABLE IF EXISTS bernie_cherdeman.getthru_training_4 CASCADE;
                           CREATE TABLE bernie_cherdeman.getthru_training_4 DISTKEY(person_id) SORTKEY(person_id) AS (
                               select * 
                               from (select person_id, pickup_last
,id_last 
                                   from bernie_cherdeman.contactibility_outcomes_cell
                                   where stratification < 5000
                                   and us_region = 'East Midwest')
                               join (select person_id, civis_2020_marriage
,civis_2020_children_present
,civis_2020_partisanship
,civis_2020_ideology_liberal
,civis_2020_spanish_language_preference
,civis_2018_turnout
,civis_2018_partisanship
,civis_2018_gotv
,civis_2018_ballot_dropoff
,civis_2018_congressional_gotv_raw
,civis_2018_congressional_support
,civis_2018_avev
,dnc_2018_college_graduate
,dnc_2018_income_dollars
,dnc_2018_high_school_only
,dnc_

West Midwest done
DROP TABLE IF EXISTS bernie_cherdeman.getthru_training_6 CASCADE;
                           CREATE TABLE bernie_cherdeman.getthru_training_6 DISTKEY(person_id) SORTKEY(person_id) AS (
                               select * 
                               from (select person_id, pickup_last
,id_last 
                                   from bernie_cherdeman.contactibility_outcomes_cell
                                   where stratification < 5000
                                   and us_region = 'OtherEast South Central')
                               join (select person_id, civis_2020_marriage
,civis_2020_children_present
,civis_2020_partisanship
,civis_2020_ideology_liberal
,civis_2020_spanish_language_preference
,civis_2018_turnout
,civis_2018_partisanship
,civis_2018_gotv
,civis_2018_ballot_dropoff
,civis_2018_congressional_gotv_raw
,civis_2018_congressional_support
,civis_2018_avev
,dnc_2018_college_graduate
,dnc_2018_income_dollars
,dnc_2018_high_school_only


Mountain done


In [18]:
training_sql6 = f"""DROP TABLE IF EXISTS {SCHEMA}.{PREFIX}_training_6 CASCADE;
                       CREATE TABLE {SCHEMA}.{PREFIX}_training_6 DISTKEY({PRIMARY_KEY}) SORTKEY({PRIMARY_KEY}) AS (
                           select * 
                           from (select {PRIMARY_KEY}, {dvs} 
                               from {DV_TABLE}
                               where stratification < 5000
                               and us_region = 'Other')
                           join (select {PRIMARY_KEY}, {feature_select}
                                 from {MODELING_FRAME}
                                 where us_region = 'Other')
                           using({PRIMARY_KEY})
                           );"""
training_sql7 = f"""DROP TABLE IF EXISTS {SCHEMA}.{PREFIX}_training_7 CASCADE;
                       CREATE TABLE {SCHEMA}.{PREFIX}_training_7 DISTKEY({PRIMARY_KEY}) SORTKEY({PRIMARY_KEY}) AS (
                           select * 
                           from (select {PRIMARY_KEY}, {dvs} 
                               from {DV_TABLE}
                               where stratification < 5000
                               and us_region = 'East South Central')
                           join (select {PRIMARY_KEY}, {feature_select}
                                 from {MODELING_FRAME}
                                 where us_region = 'East South Central')
                           using({PRIMARY_KEY})
                           );"""
sql = [training_sql6, training_sql7]
for training_sql in sql:
    print(training_sql)
    create_training_sql = civis.io.query_civis(training_sql, database=DATABASE)
    create_training_sql.result().state
    print(f"{region} done")

DROP TABLE IF EXISTS bernie_cherdeman.getthru_training_6 CASCADE;
                       CREATE TABLE bernie_cherdeman.getthru_training_6 DISTKEY(person_id) SORTKEY(person_id) AS (
                           select * 
                           from (select person_id, pickup_last
,id_last 
                               from bernie_cherdeman.contactibility_outcomes_cell
                               where stratification < 5000
                               and us_region = 'Other')
                           join (select person_id, civis_2020_marriage
,civis_2020_children_present
,civis_2020_partisanship
,civis_2020_ideology_liberal
,civis_2020_spanish_language_preference
,civis_2018_turnout
,civis_2018_partisanship
,civis_2018_gotv
,civis_2018_ballot_dropoff
,civis_2018_congressional_gotv_raw
,civis_2018_congressional_support
,civis_2018_avev
,dnc_2018_college_graduate
,dnc_2018_income_dollars
,dnc_2018_high_school_only
,dnc_2018_income_rank
,civis_2020_race_native
,civis_2020_race_b

Mountain done


In [19]:
#combine training sets into view
training_sql = f"""drop view if exists {SCHEMA}.{PREFIX}_training_cell;
                   create view {SCHEMA}.{PREFIX}_training as (
                        (select * from {SCHEMA}.{PREFIX}_training_0)
                        union
                        (select * from {SCHEMA}.{PREFIX}_training_1)
                        union
                        (select * from {SCHEMA}.{PREFIX}_training_2)
                        union
                        (select * from {SCHEMA}.{PREFIX}_training_3)
                        union
                        (select * from {SCHEMA}.{PREFIX}_training_4)
                        union
                        (select * from {SCHEMA}.{PREFIX}_training_5)
                        union
                        (select * from {SCHEMA}.{PREFIX}_training_6)
                        union
                        (select * from {SCHEMA}.{PREFIX}_training_7)
                        union
                        (select * from {SCHEMA}.{PREFIX}_training_8)
                    );
                   """
combined_training_sql = civis.io.query_civis(training_sql, database=DATABASE)
combined_training_sql.result().state

'succeeded'

In [24]:
# Train models
train_list = []
model_list = []

for i, dv in enumerate(DV_LIST):
    print('TRAINING >>> {}'.format(dv))
    
    exc_list = DV_LIST.copy()
    exc_list.remove(dv)
    
    assert dv not in exc_list 
    
    for m in ['random_forest_classifier', 'sparse_logistic']:
    
        name = f"""{dv}_{m}_{datestamp}"""
        model = ModelPipeline(model=m,
                              dependent_variable=dv,
                              primary_key=PRIMARY_KEY,
                              excluded_columns=EXCLUSION_COLUMNS,
                              #calibration='sigmoid',
                              model_name=name,
                              memory_requested=15000#,
                              #disk_requested=5
                             )
    
        where_string = '{} is not null'.format(dv)
        # Use 
        train = model.train(table_name=f"""{SCHEMA}.{PREFIX}_training""", 
                            database_name=DATABASE,
                            sql_where=where_string#,
                            #fit_params={'sample_weight': WEIGHT_VAR}
                           )

        model_list.append(model)
        train_list.append(train) 



TRAINING >>> pickup_last
TRAINING >>> id_last


In [33]:
# name = f"""multioutcome_rf_{datestamp}_v4"""
# model = ModelPipeline(model='random_forest_classifier',
#                       dependent_variable=DV_LIST,
#                       primary_key=PRIMARY_KEY,
#                       excluded_columns=EXCLUSION_COLUMNS,
#                       #calibration='sigmoid',
#                       model_name=name,
#                       memory_requested=15000#,
#                       #disk_requested=5
#                      )

# train = model.train(table_name=f"""{SCHEMA}.{PREFIX}_training""", 
#                     database_name=DATABASE,
#                     sql_where='''pickup_first is not null 
#                                  and pickup_last is not null
#                                  and pickup_ever is not null
#                                  and id_first is not null
#                                  and id_last is not null
#                                  and id_ever is not null'''#,
#                     #fit_params={'sample_weight': WEIGHT_VAR}
#                    )

# model_list.append(model)
# train_list.append(train)

In [25]:
train_list[0].metadata

{'data': {'class_names': [0, 1],
  'col_types_read': ['object',
   'int32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',
   'float32',


In [26]:
# Extract successful models
model_output = model_list
train_output = train_list

jobs_list = []
for t in train_output: 
    try:
        if len(t.metadata['output']) > 0:  
            jobs_list.append(t)
            print('Job success')
    except:
        print('Job failure')
        pass

    
model_output, train_output = zip(*((m, t) for m, t in zip(model_output, train_output) if t in jobs_list))
model_output = list(model_output)
train_output = list(train_output)

Job success
Job success
Job success
Job success


In [27]:
print(len(jobs_list))
print(len(model_output))

4
4


In [28]:
# Generate validation metrics
metrics_list = []

for b in train_output:
    if b.job_id != 59761951:
        metric = {'job_id':b.job_id,
                  'run_id':b.run_id,
                  'dv': ''.join(b.metadata['run']['configuration']['data']['y']),
                  'model': b.metadata['model']['model'],
                  'time_of_train_run': b.metadata['run']['time_of_run'],
                  'n_rows': b.metadata['data']['n_rows'],
                  'n_features': b.metadata['data']['n_cols'],
                  'auc': b.metadata['metrics']['roc_auc'],
                  'deciles': b.metadata['metrics']['deciles'],
                  'confusion_matrix': b.metadata['metrics']['confusion_matrix'],
                  'accuracy': b.metadata['metrics']['accuracy'],
                  'p_correct': b.metadata['metrics']['p_correct'],
                  'pop_incidence_true': b.metadata['metrics']['pop_incidence_true'],
                  'feature_list':b.metadata['model']['parameters']['relvars']
                 }
        metrics_list.append(metric)
    
metric_order = (['job_id', 'run_id', 'dv', 'model', 'time_of_train_run', 'n_rows', 'n_features',
                 'auc', 'deciles', 'confusion_matrix', 'accuracy', 'p_correct','pop_incidence_true','feature_list'])

validation_df = pd.DataFrame.from_records(metrics_list, columns=metric_order, index='run_id')
validation_df

Unnamed: 0_level_0,job_id,dv,model,time_of_train_run,n_rows,n_features,auc,deciles,confusion_matrix,accuracy,p_correct,pop_incidence_true,feature_list
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
206871255,60207348,pickup_last,random_forest_classifier,2020-02-01T21:21:17Z,67156,203,0.651211,"[0.19446098868374032, 0.3139240506329114, 0.37...","[[29198, 6974], [19945, 11039]]",0.599157,"[0.8071989384053965, 0.3562806609863155]","[0.5386264816248735, 0.4613735183751266]","[civis_2020_marriage, civis_2020_children_pres..."
206871259,60207353,pickup_last,sparse_logistic,2020-02-01T21:20:17Z,67156,203,0.633856,"[0.24865991661703396, 0.3241995532390171, 0.37...","[[25169, 11003], [15904, 15080]]",0.599336,"[0.6958144421099193, 0.48670281435579654]","[0.5386264816248735, 0.4613735183751266]","[civis_2020_marriage, civis_2020_children_pres..."
206871263,60207357,id_last,random_forest_classifier,2020-02-01T21:22:08Z,67156,203,0.668154,"[0.10363311494937463, 0.18942665673864484, 0.2...","[[44426, 343], [21782, 605]]",0.670543,"[0.992338448479975, 0.02702461249832492]","[0.6666418488295908, 0.3333581511704092]","[civis_2020_marriage, civis_2020_children_pres..."
206871271,60207367,id_last,sparse_logistic,2020-02-01T21:21:23Z,67156,203,0.638514,"[0.15872543180464563, 0.2099776619508563, 0.25...","[[42659, 2110], [19992, 2395]]",0.670886,"[0.9528691728651523, 0.10698173046857551]","[0.6666418488295908, 0.3333581511704092]","[civis_2020_marriage, civis_2020_children_pres..."


In [29]:
# Write validation metrics to Redshift
create_validation_table = civis.io.dataframe_to_civis(df=validation_df,
                                                 database=DATABASE, 
                                                 table= f'{SCHEMA}.{PREFIX}_validation_{datestamp}', 
                                                 existing_table_rows='drop')


In [30]:
# Score the voterfile
scores_list = []
for m,t in zip(model_output, train_output):
    DV_NAME = ''.join(t.metadata['run']['configuration']['data']['y'])
    MODEL = t.metadata['model']['model']
    print(DV_NAME, MODEL)
    SCORES_TABLE = f'{SCHEMA}.{PREFIX}_{MODEL}_{DV_NAME}_{datestamp}'
    scores_list.append(SCORES_TABLE)
    scores = m.predict(primary_key=PRIMARY_KEY,
                       database_name=DATABASE, 
                       table_name=MODELING_FRAME,
                       if_exists='drop',
                       output_table=SCORES_TABLE,
                       disk_space=10)
scores.result()


pickup_last random_forest_classifier
pickup_last sparse_logistic
id_last random_forest_classifier
id_last sparse_logistic




{'container_id': 60359049,
 'error': None,
 'finished_at': '2020-02-03T05:07:41.000Z',
 'id': 207170878,
 'is_cancel_requested': False,
 'started_at': '2020-02-02T17:49:07.000Z',
 'state': 'succeeded'}

In [None]:
# Generate SQL for final output table and drop intermediary tables
input_train_list = []
output_score_list = []
for i in range(len(DV_LIST)):
    input_train = f"{SCHEMA}.{PREFIX}_training_{i}"
    input_train_list.append(input_train)
    output_score = f"{SCHEMA}.{PREFIX}_{DV_LIST[i]}_{datestamp}"
    output_score_list.append(output_score)

drop_input_train_sql = "\n".join(["drop table if exists {tbl};".format(tbl=v) for v in input_train_list])
drop_output_score_sql = "\n".join(["drop table if exists {tbl};".format(tbl=t) for t in output_score_list])  
dv_strings = "\n,".join(["{dv_score}_1 as {dv_score}".format(dv_score=dv) for dv in DV_LIST])
dv_tiles = "\n,".join(["NTILE(100) OVER (ORDER BY {dv_tile}_1) AS {dv_tile}_100".format(dv_tile=dv) for dv in DV_LIST])
join_table = []
if len(output_score_list) > 1:
    for i in output_score_list[1:]:
        j = str(' left join '+f'{i}'+f' using({PRIMARY_KEY}) ')
        join_table.append(j)
        #dv_strings = "\nleft join ".join(["{dv_score}".format(table=tbl) for tbl in table_list[i])


In [None]:
output_table_sql = f"""
set query_group to 'importers';
set wlm_query_slot_count to 3;
DROP TABLE IF EXISTS {SCHEMA}.{PREFIX}_output_{datestamp};
CREATE TABLE {SCHEMA}.{PREFIX}_output_{datestamp}
  DISTSTYLE KEY
  DISTKEY ({PRIMARY_KEY})
  SORTKEY ({PRIMARY_KEY})
  AS ("""+'select '+ f"{PRIMARY_KEY} \n," + dv_strings + "\n," + dv_tiles + ' from '+ ''.join(output_score_list[0]) + ''.join(join_table) +');'  


In [None]:
print(output_table_sql)

In [None]:
# Create final output table
create_output_table = civis.io.query_civis(sql=output_table_sql, database=DATABASE)
create_output_table.result().state


In [None]:
# Drop intermediary tables
drop_input_train_query = civis.io.query_civis(sql=drop_input_train_sql, database=DATABASE)
drop_input_train_query.result().state

drop_output_score_query = civis.io.query_civis(sql=drop_output_score_sql, database=DATABASE)
drop_output_score_query.result().state

In [None]:
print(drop_input_train_sql)
print(drop_output_score_sql)

In [None]:
# Grant team on tables
grant_statement = f"""
GRANT ALL ON SCHEMA {SCHEMA} TO GROUP bernie_data;
GRANT SELECT ON {SCHEMA}.{PREFIX}_output_{datestamp} TO GROUP bernie_data;
GRANT SELECT ON {SCHEMA}.{PREFIX}_validation_{datestamp} TO GROUP bernie_data;
"""
grant_team = civis.io.query_civis(sql=grant_statement, database=DATABASE)
grant_team.result().state

In [None]:
print(grant_statement)