# Write Data to Database

This notebook was used to generate the code needed in the lambda function to write data from a csv file deposited in an s3 bucket to the database.

In [1]:
import pandas as pd
import numpy as np

## Connect to Database

In [2]:
import mysql.connector

In [3]:
endpoint = "seratestdatabase.c4cjk1vto1om.us-east-2.rds.amazonaws.com"
port = "3306"
usr = "admin"
pswd = "your_password_here"
region = "us-east-2b"
dbname = "teachsim"

In [4]:
cnx = mysql.connector.connect(user=usr, password=pswd, host=endpoint, database=dbname)

In [5]:
cursor = cnx.cursor(buffered=True)

## Test Connection

In [6]:
cursor.execute("SELECT id_participant, id_section, id_site, id_year, fb_treat_cond, s18_treat_cond FROM Identifiers WHERE id_participant LIKE 'testID%'")
#cursor.execute("SELECT * FROM Identifiers;")
result = cursor.fetchall()
colnames = [x[0] for x in cursor.description]
pd.DataFrame(result, columns=colnames)

Unnamed: 0,id_participant,id_section,id_site,id_year,fb_treat_cond,s18_treat_cond


In [7]:
cursor.execute("REPLACE INTO Identifiers (id_participant) VALUES ('testID3');")

In [9]:
cursor.execute("DELETE FROM Identifiers;")

## Set Working Directory

In [7]:
import os
os.getcwd()

'/Users/jmachita03/Documents/GitHub/MSDS_SERA_capstone/RDS'

In [36]:
os.chdir("/Users/jmachita03/Desktop/Capstone/RDS Set Up/Cleaned Data")

In [8]:
os.chdir("/Users/jmachita03/Desktop/Capstone/RDS Set Up/Updated Raw Data")

In [44]:
os.chdir('/Users/jmachita03/Documents/GitHub/MSDS_SERA_capstone/RDS')

## Functions

In [9]:
def getColumnNames(tableName, cursor):
    query = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = N'" + tableName + "'"
    cursor.execute(query)
    result = cursor.fetchall()
    result2 = [x[0] for x in result]
    
    return result2

In [10]:
def overlapColumns(cols1, cols2):
    overlap = []
    for col in cols1:
        if col in cols2:
            overlap = overlap + [col]
    return overlap

In [11]:
def getExistingIds(tableName, cursor):
    query = "SELECT id_participant FROM " + str(tableName)
    cursor.execute(query)
    result = cursor.fetchall()
    existing_ids = [x[0] for x in result]
    return existing_ids

In [12]:
def fixStrings(df):
    types = dict(df.dtypes)
    cols = [x for x in df.columns if types[x] == np.object]
    for index, row in df.iterrows():
        for col in cols:
            value = row[col]
            if type(value) == str:
                new = value.replace("'", "")
                new2 = new.replace('"', '')
                df.loc[index, col] = new2
    return df

In [13]:
# encompasses a combo of the replaceQueries and updateQueries functions
def list_queries(df, columns, tableName, existing_ids):
    queries = []
    for index, row in df.iterrows():
        pid = row['id_participant']
        valid_pid = False
        if type(pid) == str:
            if len(pid) >= 10 and len(pid) <= 12:
                valid_pid = True
                
        # if the row for that participant already exists, just update it
        if pid in existing_ids:
            req_update = False
            query = "UPDATE " + tableName + " SET "
            for col in columns:
                newval = row[col]
                if type(newval) == str:
                    query = query + col + " = '" + newval + "' , "
                    if col != "id_participant":
                        req_update = True
                elif np.isnan(float(newval)) == False: 
                    query = query + col + " = " + str(newval) + " , "
                    if col != "id_participant":
                        req_update = True
            if req_update == True:
                query = query[:-2]
                query = query + "WHERE id_participant = '" + row["id_participant"] + "';"
                queries = queries + [query]
        
        # if that participant is not in the database, create/replace the row
        elif valid_pid == True:
            req_update = False
            query = "REPLACE INTO " +  tableName + " "
            cols = "("
            vals = "("
            for col in columns:
                val = row[col] 
                if type(val) == str:
                    cols = cols + str(col) + ", "
                    vals = vals + "'" + val + "', "
                    if col != "id_participant":
                        req_update = True
                elif np.isnan(float(val)) == False: 
                    cols = cols + str(col) + ", "
                    vals = vals + str(val) + ", "
                    if col != "id_participant":
                        req_update = True
            query = query + cols[:-2] + ") " + 'VALUES ' + vals[:-2] + ");"
            if req_update == True:
                queries = queries + [query]
                existing_ids = existing_ids + [row['id_participant']]
            
    return queries

In [14]:
def generateQueries(df, tableName, cursor):
    df_cols = df.columns
    tab_cols = getColumnNames(tableName, cursor)
    overlap = overlapColumns(df_cols, tab_cols)
    existing_ids = getExistingIds(tableName, cursor)
    cleaned = fixStrings(df)
    query_list = list_queries(cleaned, overlap, tableName, existing_ids)
    return query_list

In [19]:
pd.__file__

'/Users/jmachita03/opt/anaconda3/lib/python3.7/site-packages/pandas/__init__.py'

## Get Id_Participants Already in Table

In [12]:
cursor.execute("SELECT id_participant FROM Identifiers")
result = cursor.fetchall()
existing_ids = [x[0] for x in result]
existing_ids

['nameTest', 'test_data.csv', 'testID4', 'testID5', 'testID6', 'testID7']

## Import Cleaned Dataset

In [15]:
uploaded_data = pd.read_csv("outcome_merged.csv")

In [16]:
uploaded_data

Unnamed: 0,startdate,enddate,status,ipaddress,progress,duration (in seconds),finished,recordeddate,responseid,recipientlastname,...,tse_is,tse_se,tse_total,ytrt_01,ytrt_02,ytrt_03,ytrt_04,ytrt_05,ytrt_total,ccs_gpa_miss
0,,,,,,,,,,,...,7.375,6.750,6.916666,5.0,4.0,6.0,6.0,6.0,5.4,0.0
1,,,,,,,,2019-07-14 10:16:48,,,...,7.375,6.750,6.916666,5.0,4.0,6.0,6.0,6.0,5.4,0.0
2,,,,,,,,2019-04-09 13:49:57,,,...,,,,,,,,,,1.0
3,,,,,,,,2019-04-09 14:21:04,,,...,,,,,,,,,,1.0
4,,,,,,,,2019-04-07 21:41:11,,,...,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,,,,,,,,2019-04-07 15:44:29,,,...,,,,,,,,,,1.0
370,,,,,,,,,,,...,4.500,4.625,4.416666,5.0,5.0,7.0,7.0,5.0,5.8,0.0
371,,,,,,,,2019-04-05 13:13:43,,,...,4.500,4.625,4.416666,5.0,5.0,7.0,7.0,5.0,5.8,0.0
372,,,,,,,,2019-04-11 14:09:53,,,...,,,,,,,,,,1.0


## Test generateQueries function

In [17]:
quer = generateQueries(uploaded_data, "Identifiers", cursor)

In [18]:
quer

["REPLACE INTO Identifiers (id_participant, fb_treat_cond, id_section, id_site, id_year) VALUES ('1_1819_2_67', 0.0, 2.0, 1.0, 1819.0);",
 "REPLACE INTO Identifiers (id_participant, fb_treat_cond, id_section, id_site, id_year) VALUES ('1_1819_3_86', 0.0, 3.0, 1.0, 1819.0);",
 "UPDATE Identifiers SET id_participant = '1_1819_3_86' , fb_treat_cond = 0.0 , id_section = 3.0 , id_site = 1.0 , id_year = 1819.0 WHERE id_participant = '1_1819_3_86';",
 "REPLACE INTO Identifiers (id_participant, fb_treat_cond, id_section, id_site, id_year) VALUES ('1_1819_4_108', 0.0, 4.0, 1.0, 1819.0);",
 "UPDATE Identifiers SET id_participant = '1_1819_4_108' , fb_treat_cond = 0.0 , id_section = 4.0 , id_site = 1.0 , id_year = 1819.0 WHERE id_participant = '1_1819_4_108';",
 "REPLACE INTO Identifiers (id_participant, fb_treat_cond, id_section, id_site, id_year) VALUES ('1_1819_2_76', 1.0, 2.0, 1.0, 1819.0);",
 "UPDATE Identifiers SET id_participant = '1_1819_2_76' , fb_treat_cond = 1.0 , id_section = 2.0 , id

In [19]:
for query in quer:
    cursor.execute(query)

## Test Loop of Queries

In [20]:
tables = ['Identifiers', 'Survey_Measures', 'Participant_Measures', 'Performance_Measures']
for table in tables:
    queries = generateQueries(uploaded_data, table, cursor)
    for query in queries:
        cursor.execute(query)

## Playing Around with Schema

In [33]:
cursor.execute("SELECT column_name, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = N'Identifiers'")

In [34]:
cursor.fetchall()

[('br_treat_cond', b'float'),
 ('br_treat_cond_mr', b'float'),
 ('br_treat_cond_iat', b'float'),
 ('fb_treat_cond', b'float'),
 ('f17_treat_cond', b'float'),
 ('s18_treat_cond', b'float'),
 ('fb_treat_cond_iat', b'float'),
 ('id_participant', b'varchar'),
 ('id_section', b'float'),
 ('id_site', b'varchar'),
 ('id_study', b'float'),
 ('id_year', b'float')]

In [30]:
cursor.execute("SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = N'Survey_Measures'")
cols = cursor.fetchall()
cols = [x[0] for x in cols]

In [34]:
cols

['id_participant',
 'base_app_dev_adjust',
 'base_app_dev_challenge',
 'base_app_dev_coach',
 'base_app_dev_counselor',
 'base_app_dev_discuss',
 'base_app_dev_parconf',
 'base_app_dev_plan',
 'base_app_dev_referral',
 'base_app_dev_request',
 'base_app_dev_space',
 'base_app_dev_sped',
 'base_app_dev_spend_time',
 'base_app_dev_studconf',
 'base_app_eth_adjust',
 'base_app_eth_challenge',
 'base_app_eth_coach',
 'base_app_eth_counselor',
 'base_app_eth_discuss',
 'base_app_eth_parconf',
 'base_app_eth_plan',
 'base_app_eth_referral',
 'base_app_eth_request',
 'base_app_eth_space',
 'base_app_eth_sped',
 'base_app_eth_spend_time',
 'base_app_eth_studconf',
 'base_beh_dev_attention',
 'base_beh_dev_behavior',
 'base_beh_dev_contribute',
 'base_beh_dev_defiant',
 'base_beh_dev_demand',
 'base_beh_dev_distracted',
 'base_beh_dev_disturb',
 'base_beh_dev_excitable',
 'base_beh_dev_fidget',
 'base_beh_dev_hum',
 'base_beh_dev_mood',
 'base_beh_dev_quarrel',
 'base_beh_dev_rating',
 'base_be

In [35]:
cols2 = list(uploaded_data.columns)

In [36]:
cols2

['startdate',
 'enddate',
 'status',
 'ipaddress',
 'progress',
 'duration (in seconds)',
 'finished',
 'recordeddate',
 'responseid',
 'recipientlastname',
 'recipientfirstname',
 'recipientemail',
 'externalreference',
 'locationlatitude',
 'locationlongitude',
 'distributionchannel',
 'userlanguage',
 'cid',
 'id',
 'b1oc',
 'b2oc',
 'b3oc',
 'b4oc',
 'b5oc',
 'b6oc',
 'b1ac',
 'b2ac',
 'b3ac',
 'b4ac',
 'b5ac',
 'b6ac',
 'b1ti',
 'b2ti',
 'b3ti',
 'b4ti',
 'b5ti',
 'b6ti',
 'b1sp',
 'b2sp',
 'b3sp',
 'b4sp',
 'b5sp',
 'b6sp',
 'b1cu',
 'b2cu',
 'b3cu',
 'b4cu',
 'b5cu',
 'b6cu',
 'b1re',
 'b2re',
 'b3re',
 'b4re',
 'b5re',
 'b6re',
 'b1su',
 'b2su',
 'b3su',
 'b4su',
 'b5su',
 'b6su',
 'affect',
 'descript',
 'score',
 'rationale',
 'sid',
 'time',
 'vid',
 'double_code',
 'tot_oc',
 'tot_ac',
 'tot_sp',
 'tot_cu',
 'tot_nb',
 'tot_se',
 'tot_ti',
 'tot_su',
 'score_dc_avg',
 'prop_beh_ack',
 'ti_dc_avg',
 'prop_redirect',
 'su_dc_avg',
 'first_beh',
 'q12',
 'q13',
 'q141_1',
 'q1

In [38]:
intersect = [col for col in cols if col in cols2]
intersect

['id_participant']

## Test for Other Cleaned Data

In [22]:
os.chdir("/Users/jmachita03/Desktop/Capstone/RDS Set Up/Cleaned Data")

In [23]:
data = pd.read_csv("2017_2018_all_merged.csv")

In [31]:
tables = ['Identifiers', 'Survey_Measures', 'Participant_Measures', 'Performance_Measures']
for table in tables:
    queries = generateQueries(data, table, cursor)
    for query in queries:
        cursor.execute(query)

In [25]:
quer = generateQueries(data, "Performance_Measures", cursor)

In [26]:
quer

["REPLACE INTO Performance_Measures (id_participant, fb_pre_vid, fb_pre_tot_hit, fb_pre_f_perf, fb_pre_f_txt, fb_pre_f_ntxt, fb_pre_f_desc, fb_pre_f_rest, fb_pre_quality, fb_pre_dc, fb_pre_cid, fb_post_vid, fb_post_tot_hit, fb_post_f_perf, fb_post_f_txt, fb_post_f_ntxt, fb_post_f_desc, fb_post_f_rest, fb_post_quality, fb_post_tot_hit_agree, fb_post_f_perf_agree, fb_post_f_txt_agree, fb_post_f_ntxt_agree, fb_post_f_desc_agree, fb_post_f_rest_agree, fb_post_quality_agree, fb_post_dc, fb_post_cid, fb_post_cid2, br_pre_b1oc, br_pre_b2oc, br_pre_b3oc, br_pre_b4oc, br_pre_b5oc, br_pre_b6oc, br_pre_b1ac, br_pre_b2ac, br_pre_b3ac, br_pre_b4ac, br_pre_b5ac, br_pre_b6ac, br_pre_b1ti, br_pre_b2ti, br_pre_b3ti, br_pre_b4ti, br_pre_b5ti, br_pre_b6ti, br_pre_b1sp, br_pre_b2sp, br_pre_b3sp, br_pre_b4sp, br_pre_b5sp, br_pre_b6sp, br_pre_b1cu, br_pre_b2cu, br_pre_b3cu, br_pre_b4cu, br_pre_b5cu, br_pre_b6cu, br_pre_b1re, br_pre_b2re, br_pre_b3re, br_pre_b4re, br_pre_b5re, br_pre_b6re, br_pre_b1su, br_pr

In [27]:
for query in quer:
        cursor.execute(query)

In [28]:
quer2 = generateQueries(data, "Survey_Measures", cursor)

In [29]:
for query in quer2:
        cursor.execute(query)

In [30]:
cursor.execute("SELECT * FROM Survey_Measures;")
result = cursor.fetchall()
colnames = [x[0] for x in cursor.description]
pd.DataFrame(result, columns=colnames)

Unnamed: 0,id_participant,base_app_dev_adjust,base_app_dev_challenge,base_app_dev_coach,base_app_dev_counselor,base_app_dev_discuss,base_app_dev_parconf,base_app_dev_plan,base_app_dev_referral,base_app_dev_request,...,haber_explain_stud_suc,haber_explain_teach_suc,haber_fallibility,haber_org_and_plan,haber_persistence,haber_survive_in_bureaucracy,haber_testdate,haber_theory_to_practice,haber_total,haber_values_students_learning
0,1_1718_1_108,7.0,5.0,7.0,4.0,6.0,1.0,2.0,,3.0,...,2.0,2.0,1.0,1.0,3.0,1.0,10/22/2017,3.0,36.0,3.0
1,1_1718_1_11,2.0,9.0,2.0,2.0,1.0,4.0,1.0,1.0,1.0,...,,,,,,,,,,
2,1_1718_1_117,1.0,5.0,1.0,1.0,10.0,1.0,7.0,1.0,6.0,...,2.0,1.0,3.0,1.0,3.0,1.0,11/10/2017,3.0,44.0,3.0
3,1_1718_1_12,1.0,8.0,3.0,3.0,5.0,4.0,8.0,2.0,4.0,...,,,,,,,,,,
4,1_1718_1_122,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,3.0,1.0,2.0,1.0,2.0,1.0,10/29/2017,3.0,36.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,1_1718_4_84,5.0,5.0,1.0,1.0,5.0,7.0,5.0,1.0,1.0,...,2.0,1.0,1.0,1.0,3.0,1.0,10/24/2017,2.0,38.0,3.0
95,1_1718_4_86,8.0,7.0,7.0,2.0,5.0,4.0,10.0,,3.0,...,1.0,3.0,2.0,1.0,3.0,1.0,11/9/2017,2.0,40.0,3.0
96,1_1718_4_89,3.0,7.0,5.0,2.0,4.0,1.0,2.0,1.0,3.0,...,2.0,3.0,1.0,1.0,3.0,1.0,10/30/2017,3.0,38.0,3.0
97,1_1718_4_94,2.0,3.0,2.0,1.0,4.0,3.0,1.0,1.0,2.0,...,2.0,1.0,1.0,1.0,3.0,3.0,10/22/2017,3.0,35.0,1.0


In [32]:
cursor.execute("SELECT id_participant, br_post_beh_dev_contribute FROM Survey_Measures WHERE id_participant = '1_1718_1_36';")
result = cursor.fetchall()
colnames = [x[0] for x in cursor.description]
pd.DataFrame(result, columns=colnames)

Unnamed: 0,id_participant,br_post_beh_dev_contribute
0,1_1718_1_36,Whats hes interested in to incorporate that in...


## NLP Data

In [26]:
os.chdir("/Users/jmachita03/Desktop/Capstone/NLP")

In [27]:
data = pd.read_csv("normal_comparison_output_status.csv")

In [29]:
data.head()

Unnamed: 0,doctype,index,filename,study,model,skill,coach,text,cleaned_vectorized_document,similarity_score
0,transcript,21,2019_5_5C_Transcript,Behavior Study 2,behavior,1,person1,Yeah that can be tough. I totally agree. Usual...,"[0.6198429304314097, -0.16436602338468467, -0....",0.270622
1,transcript,18,2019_34_5C_Transcript,Behavior Study 2,behavior,1,person7,"So, how are you feeling about that first ? Tha...","[0.6361390756591526, -0.04385806229881961, -0....",0.300864
2,transcript,1,103-2C,Behavior Study 1,behavior,2,,alright. Why don't you have a sit and we'll-- ...,"[0.4031709161284383, -0.08087668041599247, 0.2...",0.145846
3,transcript,2,6-2C,Behavior Study 1,behavior,2,,So how do you feel about that? Really hard. Wh...,"[0.5472509215170646, -0.07767101051949, 0.2492...",0.244462
4,transcript,9,2019_58_5C_Transcript,Behavior Study 2,behavior,2,person3,alright. Nice job. Like so many questions for ...,"[0.5707874299747961, -0.07655687445423678, -0....",0.277304


## Close Database Connection

In [33]:
cnx.commit()
cnx.close()