# Getting Data from Parenthood Server
Amy Jin

July 6th, 2018

This part is to get machine learning data from database. 

### Connect to Parenthood server:

In [3]:
# global import
import os
import sys
import numpy as np
import pandas as pd
from collections import Counter
import operator
import time

# Connect to Parenthood server
import mysql.connector
import sshtunnel
import pureyaml

# handle path
project_dir = !pwd  # dir of current script/notebook file
config_file = open(project_dir[0] + "/db.yaml");
config = pureyaml.load(config_file.read());

# argument dictionary for sshtunnel
ssh_config = {
    'ssh_address_or_host': ('parenthood.set.care', 22),
    'ssh_username':        config['ssh_username'],
    'ssh_password':        config['ssh_password'],
    'remote_bind_address': ('127.0.0.1', 3306),
    'local_bind_address':  ('0.0.0.0', 3333),
}

# argument dictionary for mysql.connector
mysql_config = {
    'user':     config['mysql_user'],
    'password': config['mysql_passwd'],
    'host':     config['mysql_host'],
    'database': 'patch',
    'port':     3333,
}

with sshtunnel.SSHTunnelForwarder(**ssh_config) as tunnel:
    print('SSH tunneling successful on port: {}'.format(tunnel.local_bind_port))
    connection = mysql.connector.connect(**mysql_config)
    cur = connection.cursor()
    print('MySQL server connected successfully!')

SSH tunneling successful on port: 3333
MySQL server connected successfully!


### Get Data From Parenthood Server

In [4]:
def GetDataFromParenthood(db_name, table_name):
    #table1 = str(db_name) + '.' + str(table_name)
    with sshtunnel.SSHTunnelForwarder(**ssh_config) as tunnel:
        connection = mysql.connector.connect(**mysql_config)
        cur = connection.cursor()
        
        # Use `DISTINCT` to reduce the calculation load
        query = ('''
            SELECT *
            FROM {db}.{t1} AS A;
        '''.format(db = db_name, t1 = table_name))

        cur.execute(query)
        rows = list(sum(cur.fetchall(), ()))
        #print rows, len(rows)
        #df = pd.DataFrame(rows)
        #print df
        #print df.shape
        
        # Get columans 
        colnames = ['npi', 'Y_is_oncologist', 'Certification', 'patient_count']
        for i in range(1, 11):  # do the following 10 times:
            colnames.append('X' + str(i))

        # Transform a long list to a (n, 14) dataframe
        # -1 simply means that it is an unknown dimension and we want numpy to figure it out.
        df_reformat =  pd.DataFrame(np.array(rows).reshape(-1, 14), columns=colnames)
        return df_reformat
            
        cur.close()
        connection.close()


In [5]:
tic = time.clock()
data = GetDataFromParenthood('ml_provider_type', 'final_table')
toc = time.clock()
toc - tic

3123.3086489999996

It took 3123.3086489999996 seconds to download data from parenthood server.

In [6]:
data.head(3)

Unnamed: 0,npi,Y_is_oncologist,Certification,patient_count,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10
0,1003000134,0,Dermatopathology,4686,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1003000407,0,Family Medicine/OMT,493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1003000480,0,Surgery,147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Split Data into Train and Test

In [7]:
# Load libraries
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [8]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)

In [9]:
print train.shape
print test.shape

(148794, 14)
(37199, 14)


Reference: https://stackoverflow.com/questions/42719765/take-long-list-of-items-and-reshape-into-dataframe-rows-pandas-python-3
https://stackoverflow.com/questions/729197/sql-exclude-a-column-using-select-except-columna-from-tablea