In [3]:
'''
Workflow to obtain the df_weight used for Machine Learning and Catboost models as a .csv file
'''

import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
import requests

def snowflake_connection():
    """
    Opens the Snowflake session that the rest of the workflow reads from and writes to

    Parameters:
    -----------
    none

    Returns:
    --------
    conn: The connection to Snowflake
    """
    # Make the credentials available to os.getenv below
    # (load_dotenv presumably reads them from a local .env file - confirm for this deployment)
    load_dotenv()

    # Credentials come from the environment; everything else is fixed for this project
    connection_settings = {
        'user': os.getenv("SNOWFLAKE_USER"),
        'password': os.getenv("SNOWFLAKE_PWD"),
        'account': 'jl41563.us-east-2.aws',
        'role': 'GROUSE_ROLE_C_ANALYTICS',
        'warehouse': 'GROUSE_WH',
        'database': 'GROUSE_DEID_ANALYTICS_DB',
        'schema': 'GROUSE_SEMA',
    }

    return snowflake.connector.connect(**connection_settings)

# Open one Snowflake connection; it is passed to every read further down.
conn = snowflake_connection()

def write_to_snowflake(conn, df, name):
    '''
    Uploads a DataFrame to a Snowflake table through write_pandas

    Parameters:
    -----------
    conn: The snowflake connection
    df: DataFrame
    name: The name of the table to write to in Snowflake

    Returns:
    --------
    success, nchunks, nrows, output: The four values reported by write_pandas,
                                     returned unchanged and in the same order

    '''
    upload_ok, chunk_count, row_count, copy_output = write_pandas(conn, df, name)
    return upload_ok, chunk_count, row_count, copy_output




'''
    Write the Final Cleaned dataset from the snowflake temp table
    -------------------------------------------------------------
    
    Input:
    ------
    PRESCRIBING_INFO: table from snowflake that is all prescriptions for patients that we have cuis and labels for
    PATID_LABELS: table of all the valid patids and their subsequent labels
    
    Output:
    -------
    
    df_weight: DataFrame N x 18,855 containing all patients as rows and each column is a drug
    '''

def read_prescriptions(conn, sql = '''SELECT * FROM all_pres_final_clean;'''):
    '''
    Runs a query against Snowflake and returns the result as a DataFrame
    (by default the whole all_pres_final_clean table)
    
    Parameters:
    -----------
    conn: Snowflake connection
    sql: SQL query string
    
    
    Returns:
    -------
    df_prescriptions: DataFrame with whatever columns the query selects.
                      The original notes list the expected columns as
                        Column 1: PATID
                        Column 2: DATE
                        Column 3: RXNORM_CUI
                        Column 4: INGREDIENT
                      (not verified here; depends on the table definition)
    '''
    
    cur = conn.cursor()
    try:
        df_prescriptions = cur.execute(sql).fetch_pandas_all()
    finally:
        # Always release the cursor, even when the query fails
        cur.close()
    return df_prescriptions

# Default query used by read_labels
sql_str = '''SELECT * 
             FROM labels_final_clean'''

def read_labels(conn, sql = sql_str):
    '''
    Runs a query against Snowflake (by default the whole labels_final_clean table)
    and returns the result with the LABEL column cast to float
    
    Parameters:
    -----------
    conn: snowflake connection
    sql: SQL query string; the result must contain a LABEL column
    
    Returns:
    -------
    df_labels: DataFrame, expected layout per the original notes
                        Column 1: PATID
                        Column 2: LABEL (float after this call)
    '''
    
    cur = conn.cursor()
    try:
        df_labels = cur.execute(sql).fetch_pandas_all()
    finally:
        # Always release the cursor, even when the query fails
        cur.close()
    df_labels['LABEL'] = df_labels['LABEL'].astype(float)
    return df_labels

# Pull the prescriptions and the labels from Snowflake using each reader's default query.
df_prescriptions = read_prescriptions(conn)
df_labels = read_labels(conn)



# Ingredient/CUI map, read as CSV from the local file 'df_cui_map' (the file name has no extension).
df_cui_map = pd.read_csv('df_cui_map')

def make_dict_ingredients_map(df_cui_map):
    '''
    Assigns a column index to every distinct ingredient CUI, in order of first appearance
    
    Parameters:
    -----------
    df_cui_map: Nx3 DataFrame 
                        Column 1: INGREDIENT_NAME
                        Column 2: RXCUI_IN
                        Column 3: RXCUI_LOWER 
    
    Returns:
    -------
    dict_ingredients_map: dictionary
                        key - ingredient CUI (a value of RXCUI_IN)
                        value - index
    '''
    
    distinct_cuis = df_cui_map['RXCUI_IN'].unique().tolist()
    return {cui: position for position, cui in enumerate(distinct_cuis)}


def make_dict_indices_map(dict_ingredients_map):
    '''
    Inverts the ingredient -> index dictionary so a column index can be traced back
    to its ingredient
    
    Parameters:
    -----------
    dict_ingredients_map: dictionary
                        key - ingredient
                        value - index
    
    Returns:
    -------
    dict_indices_map: dictionary
                        key - index
                        value - ingredient
    '''
    dict_indices_map = {}
    for ingredient, index in dict_ingredients_map.items():
        dict_indices_map[index] = ingredient
    return dict_indices_map

def make_dict_index_to_name(df_cui_map, dict_ingredients_map, dict_indices_map):
    '''
    Makes a dictionary mapping each column index to a display name for its ingredient
    
    Parameters:
    -----------
    df_cui_map: DataFrame with an RXCUI_IN column; the name is taken from the
                FIRST column of the first row whose RXCUI_IN matches
    dict_ingredients_map: not used by this function (kept so existing calls still work)
    dict_indices_map: dictionary
                        key - index
                        value - ingredient CUI
    
    Returns:
    -------
    dict_index_to_name: dictionary
                        key - index
                        value - first-column value for that CUI
    
    Raises:
    -------
    IndexError: if a CUI in dict_indices_map has no row in df_cui_map
    '''
    # Build the CUI -> name lookup once instead of filtering the frame per index.
    # keep='first' reproduces "first matching row".
    first_rows = df_cui_map.drop_duplicates(subset='RXCUI_IN', keep='first')
    name_by_cui = dict(zip(first_rows['RXCUI_IN'], first_rows.iloc[:, 0]))

    dict_index_to_name = {}
    for index, ingredient_cui in dict_indices_map.items():
        if ingredient_cui not in name_by_cui:
            # Same exception type the row-by-row lookup produced for a missing CUI
            raise IndexError(
                'No row in df_cui_map with RXCUI_IN == {!r}'.format(ingredient_cui))
        dict_index_to_name[index] = name_by_cui[ingredient_cui]
    
    return dict_index_to_name
    
    

# Build the lookups used below: CUI -> column index, column index -> CUI,
# and column index -> ingredient name (used later to label the columns).
dict_ingredients_map = make_dict_ingredients_map(df_cui_map)
dict_indices_map = make_dict_indices_map(dict_ingredients_map)
dict_index_to_name = make_dict_index_to_name(df_cui_map, dict_ingredients_map, dict_indices_map)

def prescriptions_to_dict(df_prescriptions):
    '''
    Collects, for every patient, the list of ingredients found in their prescription rows
    
    Parameters:
    -----------
    df_prescriptions: DataFrame with (at least) the columns
                        PATID
                        INGREDIENT
    
    Returns:
    -------
    dict_prescriptions: dictionary
                         key - patid
                         value - list of that patient's INGREDIENT values
                                 (repeats are kept)
    '''
    ingredients_by_patient = df_prescriptions.groupby(['PATID'])['INGREDIENT']
    ingredient_lists = ingredients_by_patient.apply(list)
    return ingredient_lists.to_dict()
        


def clean_dict(dict_prescriptions, dict_ingredients_map):
    '''
    Turns each patient's drug list into a 0/1 vector with one slot per known ingredient
    
    Parameters:
    -----------
    dict_prescriptions: dictionary
                         key - patid
                         value - list of drug_ids
    dict_ingredients_map: dictionary
                         key - ingredient (looked up as int(drug))
                         value - index of that ingredient's slot
    
    Returns:
    -------
    clean_dict: dictionary
                         key - patid
                         value - encoded vector, 1s are indices of drugs
    '''
    vector_length = len(dict_ingredients_map)
    encoded_by_patid = {}
    for patid, patid_drugs in dict_prescriptions.items():
        encoded = [0] * vector_length
        for drug in patid_drugs:
            # Drug ids are cast to int before the lookup; an unknown id raises KeyError
            encoded[dict_ingredients_map[int(drug)]] = 1
        encoded_by_patid[patid] = encoded
    return encoded_by_patid



# Annotate below here
#Create a dictionary mapping the patids to their labels
#Check this function
def labels_to_dict(df_labels):
    '''
    Transforms the labels data frame into a dictionary
    
    Parameters:
    -----------
    df_labels: DataFrame with a PATID column; the label is read from the
               SECOND column (position 1), whatever its name
    
    Returns:
    -------
    result: dictionary
                         key - patid
                         value - label taken from the first row for that patid
    
    Rows whose PATID is missing are skipped.
    '''
    # One pass instead of filtering the whole frame once per patient:
    # keep the first row of every patid, then pair patid with the second column.
    first_rows = df_labels.dropna(subset=['PATID']).drop_duplicates(subset='PATID', keep='first')
    return dict(zip(first_rows['PATID'], first_rows.iloc[:, 1]))


# Group prescriptions per patient, encode each patient as a 0/1 ingredient vector,
# and build the patid -> label lookup.
dict_prescriptions = prescriptions_to_dict(df_prescriptions)
dict_pres_clean = clean_dict(dict_prescriptions, dict_ingredients_map)
dict_labels = labels_to_dict(df_labels)

def write_df_weight(dict_pres_clean, dict_labels):   
    '''
    Builds the modelling table: one row per patient that has both a label and an
    encoded prescription vector, with the label stored in the last column
    
    Parameters:
    -----------
    dict_pres_clean: dictionary mapping patids to vector indicating drugs taken (list)
    dict_labels: dictionary mapping patids to their label (converted with float())
    
    Returns:
    -------
    df_weight: DataFrame with predictor drug columns followed by the label column
    dict_patids_index: dictionary mapping each included patid to its row position
    
    '''
    rows = []
    dict_patids_index = {}
    for patid, raw_label in dict_labels.items():
        # The label is converted for every patid, including ones skipped below
        label = float(raw_label)
        if patid not in dict_pres_clean:
            continue
        # The next row position is simply the number of rows collected so far
        dict_patids_index[patid] = len(rows)
        rows.append([*dict_pres_clean[patid], label])
    return pd.DataFrame(rows), dict_patids_index

# Assemble the patient x drug table (label in the last column) and record each patient's row position.
df_weight, dict_patids_index = write_df_weight(dict_pres_clean, dict_labels)


def add_demo_information(df_demo, df_weight, dict_patids_index,
                         dict_ingredients_map, dict_indices_map, dict_index_to_name):
    '''
    Function to add the columns containing the demographic information and the PHECODES
    
    Parameters:
    -----------
    df_demo: DataFrame whose first column is the patid and whose remaining
             columns are the values to add (each cell is converted with int())
    df_weight: DataFrame whose last column is the label; it is not modified
    dict_patids_index: dictionary mapping patid to its row position in df_weight
    dict_ingredients_map: dictionary, column key -> index   (updated in place)
    dict_indices_map: dictionary, index -> column key       (updated in place)
    dict_index_to_name: dictionary, index -> column name    (updated in place)
    
    Returns:
    -------
    df_test: copy of df_weight with the df_demo columns inserted before the
             label column, which is renamed 'Label'
    dict_ingredients_map, dict_indices_map, dict_index_to_name: the same three
             dictionaries, extended with one entry per added column
    '''
    # New columns are numbered right after the highest existing drug index
    n = max(dict_indices_map.keys())
    demo_columns = list(df_demo.columns)[1:]

    # Rename the last column on a copy. set_axis(..., inplace=False) is avoided
    # because the inplace keyword was removed in pandas 2.0.
    df_test = df_weight.copy()
    df_test.columns = [*df_weight.columns[:-1], 'Label']
    label = df_test.pop('Label')

    # Staging frame for the demo values, addressed by df_weight row position;
    # rows never written keep the initial 0.
    df_append = pd.DataFrame(np.zeros((df_demo.shape[0], len(demo_columns))))
    df_append.columns = demo_columns
    for i in range(df_demo.shape[0]):
        patid = df_demo.iloc[i, 0]
        if patid in dict_patids_index:
            index = dict_patids_index[patid]
            for j, col in enumerate(demo_columns, start=1):
                df_append.loc[index, col] = int(df_demo.iloc[i, j])
    for j, col in enumerate(demo_columns, start=1):
        # Assignment aligns on the row index; rows absent from df_append become NaN
        df_test[col] = df_append[col]
        #Fix the Dictionaries
        dict_index_to_name[n+j] = col
        dict_indices_map[n+j] = col
        dict_ingredients_map[col] = n+j
    # Put the label back as the last column
    df_test['Label'] = label
    return df_test, dict_ingredients_map, dict_indices_map, dict_index_to_name



def name_columns(df_weight, dict_index_to_name):
    '''
    Replaces positional column labels with readable names
    
    Parameters:
    -----------
    df_weight: DataFrame whose columns are renamed (modified in place)
    dict_index_to_name: dictionary
                        key - column position
                        value - new name for the column at that position
    
    Returns:
    -------
    df_weight: the same DataFrame object, with the new column labels
    '''
    # Build a fresh label list and assign it in one step. Writing into
    # df_weight.columns.values mutates the Index's backing array behind its back
    # and fails outright when the labels are stored as integers.
    new_columns = list(df_weight.columns)
    for index, name in dict_index_to_name.items():
        new_columns[index] = name
    df_weight.columns = new_columns
    return df_weight
            
    
# Load the demographic / PHECODE category table. add_demo_information reads
# its first column as the patid and every other column as a value to add.
sql = '''SELECT * from sema_demo_phecd_cat_clean'''
cur = conn.cursor()
try:
    df_demo = cur.execute(sql).fetch_pandas_all()
finally:
    # Release the cursor even if the query fails
    cur.close()

# Insert the demo columns before the label and extend the three lookup dictionaries to cover them.
df_weight, dict_ingredients_map, dict_indices_map, dict_index_to_name = add_demo_information(
    df_demo,
    df_weight,
    dict_patids_index,
    dict_ingredients_map,
    dict_indices_map,
    dict_index_to_name,
)


# Drop rows with any missing value, then replace the positional column labels with names.
# The shape is printed before and after renaming as a sanity check.
df_weight = df_weight.dropna()
print(df_weight.shape)
df_weight = name_columns(df_weight, dict_index_to_name)
print(df_weight.shape)


# Moving the label column to the end
df_weight = df_weight[[col for col in df_weight.columns if col != 'Label'] + ['Label']]
df_weight.head()

# Dropping the patients that have had a pregnancy, then dropping that column
print(df_weight.shape)
df_weight = df_weight[df_weight['PREG']<1].drop(columns='PREG')
# Reverse lookup: row position -> patid
dict_index_to_patid = {v: k for k, v in dict_patids_index.items()}

df_weight.head()

(3597, 3129)
(3597, 3129)
(3597, 3129)


Unnamed: 0,4-aminobenzoic acid,aclarubicin,adenosine triphosphate,alfentanil,aluminum hydroxide,mitomycin,ammonium chloride,belladonna alkaloids,boric acid,calcium phosphate,...,EYE,CONG,METAB,BLOOD,STAT,DEV,MENTAL,HEARING,ID,Label
0,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.02963
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.045685
2,0,0,0,0,0,0,0,0,0,0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015936
3,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,-0.102639
4,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.117073


In [4]:
#Excess data cleaning
# Keep only the rows whose label lies strictly between -0.8 and 0.8, then save the table.
print(df_weight.shape)
df_weight = df_weight[df_weight['Label'].abs() < 0.8]
print(df_weight.shape)
df_weight.to_csv('df_weight', index=False)

(3556, 3128)
(3555, 3128)


In [5]:
# NOTE(review): the previous cell already ends with this exact write to 'df_weight'; this cell repeats it.
df_weight.to_csv('df_weight', index=False)

In [1]:
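# Install needed packages (this cell ran first, as In [1]; run it before the workflow cell, which imports snowflake.connector)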
!pip install snowflake-connector-python==2.7.7

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/mxnet_latest_p37/bin/python -m pip install --upgrade pip' command.[0m
