# Query Page API

In [1]:
import pandas as pd
import numpy as np
import mysql.connector

## Connect to Database

In [2]:
endpoint = "seratestdatabase.c4cjk1vto1om.us-east-2.rds.amazonaws.com"
port = "3306"
usr = "admin"
pswd = "your_password_here"
region = "us-east-2b"
dbname = "teachsim"

In [3]:
cnx = mysql.connector.connect(user=usr, password=pswd, host=endpoint, database=dbname)

In [4]:
cursor = cnx.cursor(buffered=True)

In [5]:
def showTable(tableName, cursor):
    query = "SELECT * FROM " + tableName + ";"
    cursor.execute(query)
    result = cursor.fetchall()
    colnames = [x[0] for x in cursor.description]
    df = pd.DataFrame(result, columns=colnames)
    return df

In [6]:
def execute(query, cursor):
    cursor.execute(query)
    result = cursor.fetchall()
    colnames = [x[0] for x in cursor.description]
    df = pd.DataFrame(result, columns=colnames)
    return df

In [7]:
showTable("Identifiers", cursor)

Unnamed: 0,br_treat_cond,br_treat_cond_mr,br_treat_cond_iat,fb_treat_cond,f17_treat_cond,s18_treat_cond,fb_treat_cond_iat,id_participant,id_section,id_site,id_study,id_year
0,1.0,,,0.0,1.0,2.0,,1_1718_1_108,1.0,,,1718.0
1,1.0,,,0.0,1.0,2.0,,1_1718_1_11,1.0,,,1718.0
2,0.0,,,1.0,3.0,1.0,,1_1718_1_117,1.0,,,1718.0
3,1.0,,,1.0,2.0,3.0,,1_1718_1_12,1.0,,,1718.0
4,1.0,,,1.0,2.0,3.0,,1_1718_1_122,1.0,,,1718.0
...,...,...,...,...,...,...,...,...,...,...,...,...
189,,,,0.0,,,,1_1819_4_113,4.0,1.0,,1819.0
190,,,,0.0,,,,1_1819_4_114,4.0,1.0,,1819.0
191,,,,1.0,,,,1_1819_4_115,4.0,1.0,,1819.0
192,,,,1.0,,,,1_1819_4_118,4.0,1.0,,1819.0


In [8]:
['Identifiers', 'Participant_Measures', 'Survey_Measures', 'Performance_Measures', 'Participant_Tracker']

['Identifiers',
 'Participant_Measures',
 'Survey_Measures',
 'Performance_Measures',
 'Participant_Tracker']

## Functions

### Background Functions

In [9]:
def all_variables(cursor):
    query = """
SELECT column_name, data_type FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'Identifiers' 
OR TABLE_NAME = 'Participant_Measures'
OR TABLE_NAME = 'Survey_Measures'
OR TABLE_NAME = 'Performance_Measures'
OR TABLE_NAME = 'Participant_Tracker'
"""
    cursor.execute(query)
    result = cursor.fetchall()
    colnames = [x[0] for x in cursor.description]
    df = pd.DataFrame(result, columns=colnames)
    names = list(df['COLUMN_NAME'])
    return names
    

In [10]:
def numeric(cursor):
    query = """
SELECT column_name, data_type FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'Identifiers' 
OR TABLE_NAME = 'Participant_Measures'
OR TABLE_NAME = 'Survey_Measures'
OR TABLE_NAME = 'Performance_Measures'
OR TABLE_NAME = 'Participant_Tracker'
"""
    cursor.execute(query)
    result = cursor.fetchall()
    colnames = [x[0] for x in cursor.description]
    df = pd.DataFrame(result, columns=colnames)
    df2 = df[df['DATA_TYPE'] != b'varchar']
    names = list(df2['COLUMN_NAME'])
    return names

In [11]:
def text(cursor):
    query = """
SELECT column_name, data_type FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'Identifiers' 
OR TABLE_NAME = 'Participant_Measures'
OR TABLE_NAME = 'Survey_Measures'
OR TABLE_NAME = 'Performance_Measures'
OR TABLE_NAME = 'Participant_Tracker'
"""
    cursor.execute(query)
    result = cursor.fetchall()
    colnames = [x[0] for x in cursor.description]
    df = pd.DataFrame(result, columns=colnames)
    df2 = df[df['DATA_TYPE'] == b'varchar']
    names = list(df2['COLUMN_NAME'])
    return names

In [12]:
def table_variables(tableNames, cursor):
    if len(tableNames) == 0:
        return None
    query = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = '"
    query = query + tableNames[0] + "' "
    if len(tableNames) > 1:
        for table in tableNames[1:]:
            query = query + "OR TABLE_NAME = '" + table + "' "
    cursor.execute(query)
    result = cursor.fetchall()
    colnames = [x[0] for x in cursor.description]
    df = pd.DataFrame(result, columns=colnames)
    names = list(df['COLUMN_NAME'])
    
    return names

In [13]:
def convert_semester(time_list):
    converted_list = []
    if "Fall 2017" in time_list or "Spring 2018" in time_list:
        converted_list.append(1718)
    if "Fall 2018" in time_list or "Spring 2019" in time_list:
        converted_list.append(1819)
    if "Fall 2019" in time_list or "Spring 2020" in time_list:
        converted_list.append(1920)
    if "Fall 2020" in time_list or "Spring 2021" in time_list:
        converted_list.append(2021)
    return converted_list
        

In [14]:
def generate_mapping(cursor):
    survey = table_variables(['Survey_Measures'], cursor)
    performance = table_variables(['Performance_Measures'], cursor)
    identifiers = table_variables(['Identifiers'], cursor)
    mapping = {'Survey_Measures': {'baseline_post-survey': [x for x in survey if 'base_' in x], 
                                'big5': [x for x in survey if 'big5_' in x], 
                                'behavioral_redirections_post_Treatment': [x for x in survey if 'br_post_' in x], 
                                'behavioral_redirections_pre_treatment': [x for x in survey if 'br_pre_' in x], 
                                'exit_post_survey': [x for x in survey if 'exit_' in x], 
                                'feedback_post_treatment': [x for x in survey if 'fb_post_' in x], 
                                'feedback_pre_treatment': [x for x in survey if 'fb_pre_' in x], 
                                'haberman': [x for x in survey if 'haber_' in x]},
            'Performance_Measures': {'behavioral_redirections_baseline': [x for x in performance if 'br_base_' in x], 
                                    'behavioral_redirections_pre_treatment': [x for x in performance if 'br_pre_' in x], 
                                    'behavioral_redirections_post_treatment': [x for x in performance if 'br_post_' in x], 
                                    'behavioral_redirections_exit': [x for x in performance if 'br_exit_' in x], 
                                    'feedback_baseline': [x for x in performance if 'fb_base' in x], 
                                    'feedback_pre_treatment': [x for x in performance if 'fb_pre_' in x], 
                                    'feedback_post_treatment': [x for x in performance if 'fb_post' in x], 
                                    'feedback_exit': [x for x in performance if 'fb_exit_' in x]},
            'Identifiers': {'primary_treatment_condition': [x for x in identifiers if x == 'br_treat_cond'],
                           'mental_rehearsement_treatment_condition': [x for x in identifiers if x == 'br_treat_cond_mr'],
                           'iat_treatment_condition': [x for x in identifiers if x == 'br_treat_cond_iat'],
                           'university_email_address': [x for x in identifiers if x == 'email'],
                           'primary_treatment_condition for Feedback Simulation': [x for x in identifiers if x == 'fb_treat_cond'],
                           'original_treatment_condition_fall_2017': [x for x in identifiers if x == 'f17_treat_cond'],
                           'original_treatment_condition_spring_2018': [x for x in identifiers if x == 's18_treat_cond'],
                           'iat_treatment_condition_for_feedback_simulation': [x for x in identifiers if x == 'fb_treat_cond_iat'],
                           'participant_id': [x for x in identifiers if x == 'id_participant'],
                           'section_identifier': [x for x in identifiers if x == 'id_section'],
                           'site_identifier': [x for x in identifiers if x == 'id_site'],
                           'study_identifier': [x for x in identifiers if x == 'id_study'],
                           'year_identifier': [x for x in identifiers if x == 'id_year']}}
    return mapping

### Second Level Functions

In [38]:
# this function takes the overall measures provided and the specific measures and returns 
# the relevant variables in a list
def get_variables(cursor, measure_list, specific_measure_list=[], field_type=[]):
    
    # the first half returns the relevant variables mapped from measure_list and specific_measure_list
    cols = []
    mapping = generate_mapping(cursor)
    
    # adds the table to measure_list if it was not checked but one of its subsets in specific_measure_list was included
    for key in mapping.keys():
        specifics = [x for x in specific_measure_list if x in mapping[key].keys()]
        if key not in measure_list and len(specifics) > 0:
            measure_list = measure_list + [key]
    
    for measure in measure_list:
        if measure in mapping.keys():
            specific_checked = False
            for specific in specific_measure_list:
                if specific in mapping[measure].keys():
                    specific_checked = True
                    cols = cols + mapping[measure][specific]
            if specific_checked == False:
                table_cols = table_variables([measure], cursor)
                cols = cols + table_cols
        else:
            table_cols = table_variables([measure], cursor)
            cols = cols + table_cols
    

    # the second half filters based on the field type
    numbs = numeric(cursor)
    texts = text(cursor)
    
    if field_type == None or field_type == ['Numeric', 'Text'] or field_type == ['Text', 'Numeric']:
        cols = cols
    elif field_type == ['Numeric']:
        cols = [x for x in cols if x in numbs]

    elif field_type == ['Text']:
        cols = [x for x in cols if x in texts]
    
    cols = list(set(cols))
    cols = [x for x in cols if x != "id_participant"]
    cols = ['id_participant'] + cols
    
    return cols
    
    

In [16]:
def sub_query(time_list):
    # Build main portion
    # note that the id_participant column being renamed in Participant_Tracker require
    # that it be treated differently for joins and this will include that extra column
    query = "SELECT * FROM Identifiers LEFT JOIN Participant_Measures USING (id_participant) "
    query = query + "LEFT JOIN Survey_Measures USING (id_participant) "
    query = query + "LEFT JOIN Performance_Measures USING (id_participant) "
    query = query + "LEFT JOIN Participant_Tracker c ON id_participant = c.Participant_ID "
    
    # Add Where clause to filter based on time
    time = ""
    if time_list == None or 'All' in time_list or time_list == []:
        time = ""
    else:  
        time_list = convert_semester(time_list)
        time = time + "WHERE id_year in ("
        time = time + ", ".join(map(str,time_list))
        time = time + ")"
    
    query = query + time
    return query
    

### Overall Function

In [17]:
def request(cursor, measure_list, time_list, specific_measure_list, field_type):
    query = "SELECT "
    sub = sub_query(time_list)
    measures = ['Identifiers', 'Participant_Measures', 'Survey_Measures', 'Performance_Measures', 'Participant_Tracker']
    cols = get_variables(cursor=cursor, measure_list=measures, specific_measure_list=specific_measure_list, field_type=field_type)
    for col in cols:
        query = query + "a." + col + ", "
    query = query[:-2] + " FROM ( " + sub + " ) a"
    
    return query
    

## Test

In [158]:
measures = ['Identifiers', 'Participant_Measures', 'Survey_Measures', 'Performance_Measures', 'Participant_Tracker']
query = request(cursor, measure_list=measures, time_list=['All'], specific_measure_list=[], field_type=['Numeric', 'Text'])
query

'SELECT a.br_base_b1ti_agree, a.br_exit_tot_se_agree, a.br_post_sim_sec_cmsk, a.tse_17, a.br_pre_b5cu_agree, a.br_pre_descript, a.br_pre_b1re_agree, a.fb_exit_f_desc, a.fb_post_sim_teach_change_rate, a.fb_pre_f_ntxt, a.br_base_b3su_agree, a.br_post_b3sp_agree, a.br_post_b3ti, a.br_pre_b5su, a.haber_explain_teach_suc, a.br_post_manage_app_eth_negative, a.fb_post_srcoach_beneficial, a.br_pre_rubric_suc, a.br_pre_tse_21, a.br_post_tse_12, a.big5_con_3, a.br_pre_b5ac, a.br_pre_b2re, a.br_post_app_dev_parconf, a.br_exit_b5su_agree, a.base_app_eth_referral, a.base_beh_dev_rating_impulsive, a.fb_pre_sim_txt_concerns, a.base_beh_dev_behavior, a.fb_post_tse_16, a.br_pre_tot_su, a.grit_01, a.br_post_beh_dev_smart, a.tse_24, a.base_app_dev_space, a.neo_37, a.br_base_tot_ti_agree, a.br_exit_tot_cu, a.tmas_17, a.neo_n, a.br_post_b6cu_agree, a.r_grit_08, a.br_post_tse_23, a.br_post_b2su, a.br_post_quality, a.crtse_05, a.br_post_enddate, a.br_post_rubric_dial, a.fb_pre_sim_relevant_prof, a.crtse_07, 

In [149]:
sub = sub_query(['All'])
sub

'SELECT * FROM Identifiers LEFT JOIN Participant_Measures USING (id_participant) LEFT JOIN Survey_Measures USING (id_participant) LEFT JOIN Performance_Measures USING (id_participant) LEFT JOIN Participant_Tracker c ON id_participant = c.Participant_ID '

In [159]:
execute(query, cursor)

Unnamed: 0,br_base_b1ti_agree,br_exit_tot_se_agree,br_post_sim_sec_cmsk,tse_17,br_pre_b5cu_agree,br_pre_descript,br_pre_b1re_agree,fb_exit_f_desc,fb_post_sim_teach_change_rate,fb_pre_f_ntxt,...,exit_beh_eth_excitable,r_rsq_24,br_pre_b1ti_agree,neo_32,br_exit_b5re,br_exit_b4sp_agree,fb_post_sim_prepare_differently,br_post_b6oc,br_post_tse_24,br_post_b2re
0,,,,6.0,,,,,,,...,,,,4.0,,,,,,
1,,,,3.0,,,,,,,...,,,,3.0,,,,,,
2,,,,5.0,,,,,,,...,,,,2.0,,,,,,
3,,,,4.0,,,,,,,...,,,,3.0,,,,,,
4,,,,7.0,,,,,,,...,,,,4.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,,,,3.0,,,,,,,...,,,,2.0,,,,,,
91,,,,9.0,,,,,,,...,,,,2.0,,,,,,
92,,,,7.0,,,,,,,...,,,,3.0,,,,,,
93,,,,5.0,,,,,,,...,,,,2.0,,,,,,


## Close Database Connection

In [21]:
cnx.commit()
cnx.close()