In [1]:
import os
import io
import datetime
import logging
import sys

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 

import civis
import civis.io
from civis.futures import CivisFuture

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputRegressor

from civis.ml import ModelPipeline
from muffnn import MLPClassifier, MLPRegressor

import uuid
import json
import math
from pprint import pprint
import tempfile
import concurrent.futures
from concurrent.futures import wait
from collections import namedtuple

  from numpy.core.umath_tests import inner1d


In [2]:
# Set Parameter
DATABASE = 'Bernie 2020'
PRIMARY_KEY = 'person_id'
DV_TABLE = 'modeling.spoke_dvs'

DV_LIST = ['spoke_support_1box','spoke_persuasion_1plus','spoke_persuasion_2plus']

MODELING_FRAME = 'bernie_data_commons.phoenix_modeling_frame'
EXCLUSION_COLUMNS = ['jsonid']

SCHEMA = 'bernie_nmarchio2'
PREFIX = 'scored'

datestamp = '{:%Y%m%d}'.format(datetime.date.today())
score_table = f'{SCHEMA}.{PREFIX}_output_{datestamp}'

In [3]:
# Count classes and undersample to avoid class imbalance problems
dv_sql_sum = "\n".join([",sum({dv}) as {dv}".format(dv=i) for i in DV_LIST])
sql_collapse = f"""select count(*) {dv_sql_sum} from {DV_TABLE};"""
sql_count = civis.io.read_civis_sql(sql_collapse, DATABASE)

undersample_list = []
for i in (sql_count[1][1:]):
    u = (int(i)*2)
    undersample_list.append(u)


In [111]:
# Create training views
for i in range(len(sql_count[1])-1):
    dv_item = DV_LIST[i]
    random_sample = round(int(undersample_list[i])/int(sql_count[1][0]),5)
    training_sql = f"""DROP VIEW IF EXISTS {SCHEMA}.{PREFIX}_training_{i} CASCADE;
    CREATE VIEW {SCHEMA}.{PREFIX}_training_{i} AS 
    (select * from (
    (select {PRIMARY_KEY}, {dv_item} from {DV_TABLE} where {dv_item} = 1) 
    union all 
    (select {PRIMARY_KEY}, {dv_item} from {DV_TABLE} where {dv_item} = 0 and random() < {random_sample}))
    left join {MODELING_FRAME} using({PRIMARY_KEY}));"""
    print(q)
    create_training_sql = civis.io.query_civis(training_sql, database=DATABASE)
    create_training_sql.result().state
 

DROP VIEW IF EXISTS bernie_nmarchio2.spoke_training_2 CASCADE;
    CREATE VIEW bernie_nmarchio2.spoke_training_2 AS 
    (select * from (
    (select person_id, spoke_persuasion_2plus from modeling.spoke_dvs where spoke_persuasion_2plus = 1) 
    union all 
    (select person_id, spoke_persuasion_2plus from modeling.spoke_dvs where spoke_persuasion_2plus = 0 and random() < 0.01084))
    left join bernie_data_commons.phoenix_modeling_frame using(person_id));
DROP VIEW IF EXISTS bernie_nmarchio2.spoke_training_2 CASCADE;
    CREATE VIEW bernie_nmarchio2.spoke_training_2 AS 
    (select * from (
    (select person_id, spoke_persuasion_2plus from modeling.spoke_dvs where spoke_persuasion_2plus = 1) 
    union all 
    (select person_id, spoke_persuasion_2plus from modeling.spoke_dvs where spoke_persuasion_2plus = 0 and random() < 0.01084))
    left join bernie_data_commons.phoenix_modeling_frame using(person_id));
DROP VIEW IF EXISTS bernie_nmarchio2.spoke_training_2 CASCADE;
    CREATE VI

In [142]:
# Train models
train_list = []
model_list = []

for index, dv in enumerate(DV_LIST):
    print('TRAINING >>> {}'.format(dv))
    
    exc_list = DV_LIST.copy()
    exc_list.remove(dv)
    
    assert dv not in exc_list 
    
    name = f"""{dv}_{datestamp}"""
    model = ModelPipeline(model='sparse_logistic',
                          dependent_variable=dv,
                          primary_key=PRIMARY_KEY,
                          excluded_columns=EXCLUSION_COLUMNS,
                          calibration='sigmoid',
                          model_name=name,
                          memory_requested=12000)
    
    where_sql = '{} is not null'.format(dv)

    train = model.train(table_name=f"""{SCHEMA}.{PREFIX}_training_{index}""", 
                        database_name=DATABASE,
                        sql_where=where_sql#,
                        #fit_params={'sample_weight': WEIGHT_COL}
                       )
    
    model_list.append(model)
    train_list.append(train)    


TRAINING >>> spoke_support_1box
TRAINING >>> spoke_persuasion_1plus
TRAINING >>> spoke_persuasion_2plus


In [143]:
# Get output of successful models
model_output = model_list
train_output = train_list

jobs_list = []
for t in train_output: 
    try:
        if len(t.metadata['output']) > 0:  
            jobs_list.append(t)
            print('Job success')
    except:
        print('Job failure)
        pass

model_output, train_ouput = zip(*((model, train) for model, train in zip(model_output, train_output) if train in jobs_list))
model_output = list(model_output)
train_output = list(train_output)

job success
job success
job success


In [146]:
# Generate validation metrics
metrics_list = []

for a, b in enumerate(train_output):
    metric = {'job_id':b.job_id,
              'run_id':b.run_id,
              'dv': ''.join(b.metadata['run']['configuration']['data']['y']),
              'model': b.metadata['model']['model'],
              'time_of_train_run': b.metadata['run']['time_of_run'],
              'n_rows': b.metadata['data']['n_rows'],
              'n_features': b.metadata['data']['n_cols'],
              'auc': b.metadata['metrics']['roc_auc'],
              'deciles': b.metadata['metrics']['deciles'],
              'confusion_matrix': b.metadata['metrics']['confusion_matrix'],
              'accuracy': b.metadata['metrics']['accuracy'],
              'p_correct': b.metadata['metrics']['p_correct'],
              'pop_incidence_true': b.metadata['metrics']['pop_incidence_true']
             }
    metrics_list.append(metric)
    
metric_order = (['job_id', 'run_id', 'dv', 'model', 'time_of_train_run', 'n_rows', 'n_features',
                 'auc', 'deciles', 'confusion_matrix', 'accuracy', 'p_correct','pop_incidence_true'])

validation_df = pd.DataFrame.from_records(metrics_list, columns=metric_order, index='run_id')
validation_df

Unnamed: 0_level_0,job_id,dv,model,time_of_train_run,n_rows,n_features,auc,deciles,confusion_matrix,accuracy,p_correct,pop_incidence_true
run_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
189178440,51800037,spoke_support_1box,sparse_logistic,2019-12-09T04:28:52Z,123095,44,0.723725,"[0.12412672623883023, 0.1966041108132261, 0.24...","[[56442, 15610], [24461, 26582]]",0.674471,"[0.7833509132293344, 0.5207766001214662]","[0.5853365286973475, 0.4146634713026524]"
189178442,51800039,spoke_persuasion_1plus,sparse_logistic,2019-12-09T04:29:48Z,8293,44,0.562693,"[0.24125452352231605, 0.26867469879518074, 0.3...","[[5486, 15], [2779, 13]]",0.663089,"[0.9972732230503545, 0.004656160458452722]","[0.6633305197154227, 0.3366694802845773]"
189178445,51800042,spoke_persuasion_2plus,sparse_logistic,2019-12-09T04:29:14Z,2895,44,0.560332,"[0.30689655172413793, 0.27335640138408307, 0.2...","[[1938, 9], [946, 2]]",0.670121,"[0.9953775038520801, 0.002109704641350211]","[0.672538860103627, 0.32746113989637304]"


In [148]:
# Write validation metrics to Redshift
create_validation_table = civis.io.dataframe_to_civis(df=validation_df,
                                                 database=DATABASE, 
                                                 table= f'{SCHEMA}.{PREFIX}_validation_{datestamp}', 
                                                 existing_table_rows='drop')

In [149]:
# Score the voterfile
scores_list = []
for m,t in zip(model_output, train_output):
    DV_LABEL = ''.join(t.metadata['run']['configuration']['data']['y'])
    print(DV_LABEL)
    SCORES_TABLE = f'{SCHEMA}.{PREFIX}_{DV_LABEL}_{datestamp}'
    scores_list.append(SCORES_TABLE)
    scores = m.predict(primary_key=PRIMARY_KEY,
                       database_name=DATABASE, 
                       table_name=MODELING_FRAME,
                       if_exists='drop',
                       output_table=SCORES_TABLE,
                       disk_space=20)
scores.result()

spoke_support_1box
spoke_persuasion_1plus
spoke_persuasion_2plus


True

In [169]:
# Grant team on tables
grant_sql = "".join(["GRANT SELECT ON {tbl} TO GROUP bernie_data;".format(tbl=i) for i in scores_list])
grant_statement = f"""
GRANT ALL ON SCHEMA {SCHEMA} TO GROUP bernie_data;
{grant_sql}
"""
grant_team = civis.io.query_civis(sql=grant_statement, database=DATABASE)
grant_team.result().state

In [None]:
# Drop training views

drop_statement = []
for i in range(len(sql_count[1])-1):
    drop_sql = f"{SCHEMA}.{PREFIX}_training_{i} CASCADE;"
    drop_statement.append(drop_sql)
    

In [167]:
drop_training_views = f'''
DROP VIEW IF EXISTS {} CASCADE;'''


