In [1]:
import requests
import json

## Getting the results through a query

In [442]:
def send_query(executer_url, user, query, password='*****', timeout=None):
    """POST ``query`` to the executer service and return the decoded JSON reply.

    Parameters
    ----------
    executer_url : str
        Base URL of the executer. The request goes to ``<executer_url>/query``;
        a trailing slash on the base URL is tolerated.
    user : str
        User name for HTTP basic authentication.
    query : str
        Query text, sent as the ``query`` field of the JSON request body.
    password : str, optional
        Password for HTTP basic authentication. The default is the placeholder
        that was previously hard-coded in this function; callers should pass
        the real credential explicitly (for example read from an environment
        variable or ``getpass``) rather than editing it into the notebook.
    timeout : float or tuple, optional
        Forwarded to ``requests.post``. ``None`` (the default) keeps the
        previous behaviour of waiting indefinitely for the server.

    Returns
    -------
    The response body parsed with ``json.loads``.

    Raises
    ------
    Exception
        With a "Query error" message when the server answers 400, and with an
        "Unknown Error" message for any other non-200 status. The response
        text is included in both cases.
    """
    # rstrip avoids producing ".../ /query"-style double slashes.
    url = '%s/query' % executer_url.rstrip('/')
    response = requests.post(
        url,
        json=dict(query=query),
        auth=(user, password),
        timeout=timeout,
    )
    if response.status_code == 200:
        return json.loads(response.text)
    if response.status_code == 400:
        raise Exception("Query error:\n%s" % response.text)
    raise Exception("Unknown Error:\n%s" % response.text)

In [454]:
# Endpoint of the query executer and the account used for this query.
executer = 'http://diascld10.epfl.ch:54321'
user = 'federation'

# One row per patient; `vars` collects that patient's (name, value) pairs
# for the variables selected in the filter below.
# "APOE4_bl" is deliberately left out of the filter because it has empty values.
query = """
select distinct patient_id,( select variable_name as name, value from partition) as vars
    from exam_value
    where 
         variable_name = "DX_bl" or
         variable_name = "CDRSB_bl" or
         variable_name = "MMSE_bl" or
         variable_name = "AGE" or
         variable_name = "PTGENDER" or
         variable_name = "PTEDUCAT" 
    group by patient_id
"""
result = send_query(executer_url=executer, user=user, query=query)

## Getting the list of different types of diagnosis (DX_bl)

In [455]:
# Account used for this query (redacted in the saved notebook).
user = '********'

# Every distinct value recorded for the DX_bl (diagnosis) variable.
query = """
select distinct value
    from exam_value
    where  variable_name = "DX_bl"
"""

result2 = send_query(executer_url=executer, user=user, query=query)
# Bare expression so the notebook displays the full reply.
result2

{u'compilationTime': 274,
 u'executionTime': 186,
 u'output': [u'CN', u'LMCI', u'AD', u'EMCI', u'SMC']}

In [456]:
# Helper lookup: diagnosis label -> float class code.
# The code is the label's position in the list returned by the DX_bl query,
# so the numbering follows whatever order the server returned the labels in.
dx = {}
for code, label in enumerate(result2['output']):
    dx[label] = float(code)
    print("%s -> %s" % (label, dx[label]))

CN -> 0.0
LMCI -> 1.0
AD -> 2.0
EMCI -> 3.0
SMC -> 4.0


## We will try to get the feature significance
This follows the approach shown in the scikit-learn example:
http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html

In [457]:
import numpy as np
import sklearn as sk
import sklearn.datasets as skd
import sklearn.ensemble as ske
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

### We need to transform all values to float for the algorithm

In [458]:
import random

# Fixed seed so the 'Random' baseline column is identical on every run.
RANDOM_SEED = 42


def get_dx_value(value):
    """Return the float class code stored in ``dx`` for diagnosis label ``value``."""
    return dx[value]


def _patient_features(patient):
    """Convert one patient record into ``(features, label)``.

    ``features`` maps each variable name to a float. ``label`` is the class
    code of the DX_bl entry, or ``None`` when the record has no DX_bl entry.
    Re-raises the conversion error (after printing the offending pair) when a
    value cannot be turned into a float.
    """
    features = dict()
    label = None
    for var in patient['vars']:
        name = var['name']
        value = var['value']
        if name == 'DX_bl':
            # y value for training
            label = get_dx_value(value)
        elif name == 'PTGENDER':
            # binary encoding: 'Male' -> 1.0, any other value -> 0.0
            features[name] = 1.0 if value == 'Male' else 0.0
        else:
            try:
                features[name] = float(value)
            except (TypeError, ValueError):
                print("could not transform %s->'%s'" % (name, value))
                raise
    return features, label


random.seed(RANDOM_SEED)
data = result['output']
trainX = []
trainY = []
# Feature names in column order; fixed by the first patient and then enforced
# for every later one so that all rows of trainX line up column by column.
keys = None
for p in data:
    values, y = _patient_features(p)
    if y is None:
        # A row without a target would otherwise put None into trainY.
        raise ValueError("patient %r has no DX_bl value" % (p.get('patient_id'),))

    # Pure-noise column: a baseline to compare the real features against.
    values['Random'] = random.random()

    # Sorted names give a deterministic column order.
    patient_keys = sorted(values)
    if keys is None:
        keys = patient_keys
    elif patient_keys != keys:
        # Differing variable sets would silently shift columns between rows.
        raise ValueError(
            "patient %r has features %r, expected %r"
            % (p.get('patient_id'), patient_keys, keys)
        )

    trainX.append([values[k] for k in keys])
    trainY.append(y)

In [459]:
# Feature names, in the column order used to build each row of trainX.
keys

[u'AGE', u'CDRSB_bl', u'MMSE_bl', u'PTEDUCAT', u'PTGENDER', 'Random']

### This is using an extra-trees classifier

In [460]:
# Fit an extra-trees classifier on the feature matrix.
# random_state pins the randomisation inside the ensemble so that re-running
# the cell reproduces the same importances (0 is an arbitrary fixed seed).
forest = ske.ExtraTreesClassifier(random_state=0)
forest.fit(trainX, trainY)

# Column indices ordered from most to least important, and the matching scores.
fet_ind = np.argsort(forest.feature_importances_)[::-1]
fet_imp = forest.feature_importances_[fet_ind]

# One "rank,feature name,importance" line per feature.
for n, idx in enumerate(fet_ind):
    print("%d,%s,%f " % (n, keys[idx], fet_imp[n]))

0,CDRSB_bl,0.375090 
1,MMSE_bl,0.184442 
2,AGE,0.163156 
3,Random,0.148804 
4,PTEDUCAT,0.108878 
5,PTGENDER,0.019629 


In [415]:
# Unsorted importances of the fitted forest, displayed as the cell output.
# NOTE(review): this cell's execution count (415) is lower than that of the
# cell that fits `forest` (460), so the array shown below presumably comes
# from an earlier fit - re-run in order to confirm.
forest.feature_importances_

array([ 0.16668616,  0.36634401,  0.19001983,  0.10859176,  0.02568605,
        0.14267219])

### This is using a random forest classifier

In [416]:
# Fit a random forest classifier on the same feature matrix.
# random_state pins the randomisation inside the ensemble so that re-running
# the cell reproduces the same importances (0 is an arbitrary fixed seed).
reg = ske.RandomForestClassifier(random_state=0)
reg.fit(trainX, trainY)

# Column indices ordered from most to least important, and the matching scores.
fet_ind = np.argsort(reg.feature_importances_)[::-1]
fet_imp = reg.feature_importances_[fet_ind]

# One "rank,feature name,importance" line per feature.
for n, idx in enumerate(fet_ind):
    print("%d,%s,%f " % (n, keys[idx], fet_imp[n]))

0,CDRSB_bl,0.358041 
1,MMSE_bl,0.195871 
2,AGE,0.166864 
3,Random,0.142457 
4,PTEDUCAT,0.107641 
5,PTGENDER,0.029126 


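### This is using a random forest regressor
This does not seem to work as well as the others, I think - likely because a regressor treats the diagnosis codes as ordered numbers, while the codes here are just arbitrary labels for the classes.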

In [417]:
# Fit a random forest regressor, using the float diagnosis codes in trainY
# as the regression target.
# random_state pins the randomisation inside the ensemble so that re-running
# the cell reproduces the same importances (0 is an arbitrary fixed seed).
reg = ske.RandomForestRegressor(random_state=0)
reg.fit(trainX, trainY)

# Column indices ordered from most to least important, and the matching scores.
fet_ind = np.argsort(reg.feature_importances_)[::-1]
fet_imp = reg.feature_importances_[fet_ind]

# One "rank,feature name,importance" line per feature.
for n, idx in enumerate(fet_ind):
    print("%d,%s,%f " % (n, keys[idx], fet_imp[n]))

0,AGE,0.334101 
1,Random,0.216839 
2,CDRSB_bl,0.216176 
3,PTEDUCAT,0.117453 
4,MMSE_bl,0.087617 
5,PTGENDER,0.027813 


In [419]:
# Mean of the PTGENDER column, i.e. the share of rows encoded as 1.0 ('Male').
# The column is looked up by name rather than by a hard-coded position, so the
# result stays correct if the set of features (and hence the order) changes.
gender_col = keys.index('PTGENDER')
l = [a[gender_col] for a in trainX]

print(sum(l) / len(l))

0.568221070812


In [429]:
# Housekeeping, not part of the analysis: list the files in the working directory.
ls 

feature_significance.csv           separate_data.ipynb    use_case_query.ipynb
feature_significance_simple.ipynb  use_case_query2.ipynb  variables.csv


In [428]:
# Housekeeping, not part of the analysis: shell escape that copies
# use_case_query.ipynb to use_case_query2.ipynb.
# NOTE(review): execution count 428 means this ran before the `ls` cell (429).
!cp use_case_query.ipynb use_case_query2.ipynb