# Feature significance example

In [94]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.gridspec as gridspec
import numpy as np
import matplotlib.cm as cm
import pandas as pd

%matplotlib inline

## Lets open the input file
Here we have one row per patient  and each column corresponds to a variable

From column 6 and on, '3rd Ventricle'  the variables are brain features

In [95]:
f = pd.read_csv('variables.csv')
f.head()

Unnamed: 0,DX_bl,AGE,PTEDUCAT,CDRSB,MMSE,FAQ,3rd Ventricle,4th Ventricle,Right Accumbens Area,Left Accumbens Area,...,Right SPL superior parietal lobule,Left SPL superior parietal lobule,Right STG superior temporal gyrus,Left STG superior temporal gyrus,Right TMP temporal pole,Left TMP temporal pole,Right TrIFG triangular part of the inferior frontal gyrus,Left TrIFG triangular part of the inferior frontal gyrus,Right TTG transverse temporal gyrus,Left TTG transverse temporal gyrus
0,AD,81.3,18,4.5,20,10,2.698475,2.907981,0.464328,0.531223,...,9.256819,9.524609,7.733975,6.767715,7.742289,7.17597,3.924761,4.199281,1.665726,1.69567
1,CN,73.7,16,0.0,29,0,1.533795,1.811766,0.419238,0.437089,...,11.197308,11.037356,7.317633,7.339268,8.354081,7.362619,3.840922,3.667695,1.822566,1.817234
2,AD,73.9,12,5.0,24,11,1.088313,1.364788,0.352754,0.418043,...,10.559799,10.672373,6.593533,6.888605,7.594158,7.097453,3.490602,3.769902,1.462676,1.540255
3,CN,65.4,9,0.0,28,0,1.392329,1.968157,0.372398,0.384105,...,8.074973,8.589588,6.606351,6.715842,7.644831,7.624443,3.304219,3.824991,1.438875,1.556557
4,CN,73.1,18,0.0,29,0,1.463126,1.627738,0.36117,0.400582,...,10.335401,10.100816,6.740743,5.922343,7.601225,7.448935,3.550582,4.310352,1.415202,1.458031


## We transform AD (alzheimer) to 1 and CN (control) to 0

In [96]:
f.DX_bl[f.DX_bl=='AD'] = 1.0
f.DX_bl[f.DX_bl=='CN'] = 0.0
f.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,DX_bl,AGE,PTEDUCAT,CDRSB,MMSE,FAQ,3rd Ventricle,4th Ventricle,Right Accumbens Area,Left Accumbens Area,...,Right SPL superior parietal lobule,Left SPL superior parietal lobule,Right STG superior temporal gyrus,Left STG superior temporal gyrus,Right TMP temporal pole,Left TMP temporal pole,Right TrIFG triangular part of the inferior frontal gyrus,Left TrIFG triangular part of the inferior frontal gyrus,Right TTG transverse temporal gyrus,Left TTG transverse temporal gyrus
0,1,81.3,18,4.5,20,10,2.698475,2.907981,0.464328,0.531223,...,9.256819,9.524609,7.733975,6.767715,7.742289,7.17597,3.924761,4.199281,1.665726,1.69567
1,0,73.7,16,0.0,29,0,1.533795,1.811766,0.419238,0.437089,...,11.197308,11.037356,7.317633,7.339268,8.354081,7.362619,3.840922,3.667695,1.822566,1.817234
2,1,73.9,12,5.0,24,11,1.088313,1.364788,0.352754,0.418043,...,10.559799,10.672373,6.593533,6.888605,7.594158,7.097453,3.490602,3.769902,1.462676,1.540255
3,0,65.4,9,0.0,28,0,1.392329,1.968157,0.372398,0.384105,...,8.074973,8.589588,6.606351,6.715842,7.644831,7.624443,3.304219,3.824991,1.438875,1.556557
4,0,73.1,18,0.0,29,0,1.463126,1.627738,0.36117,0.400582,...,10.335401,10.100816,6.740743,5.922343,7.601225,7.448935,3.550582,4.310352,1.415202,1.458031


## Lets create our training arrays and test arrays

In [97]:
import random

train = np.array(f)
print 'total length ', len(train)

n_samples= 100
train = list(train)
test = []

# we are selecting 100 random subjects for testing
for i in range(n_samples):
  n = random.randrange(0,len(train))
  test.append (train.pop(n) )
    
print 'train array ', len(train)
print 'test array ', len(test)

total length  487
train array  387
test array  100


In [98]:
#first value is 1 for alzheimer and 0 control
train=np.array(train).astype(float)
trainX=train[:,1:]
trainY=train[:,0]

trainX[:, 0]

array([ 81.3,  73.7,  73.9,  73.1,  72.6,  71.7,  64.1,  77.7,  73.2,
        76.2,  80.1,  75.5,  74.5,  74.5,  72.9,  78.2,  78.3,  76. ,
        70.6,  73.2,  75.3,  65.1,  77.2,  71.1,  79.6,  72.8,  82.6,
        80.4,  72. ,  73.3,  70.6,  73.1,  59.7,  87.7,  74.1,  76.4,
        78.3,  70.6,  74.8,  78.3,  80.4,  80.5,  78. ,  62.9,  84.3,
        67.4,  78. ,  74.4,  80. ,  78. ,  73.8,  78.5,  85.8,  85.5,
        70.4,  65.9,  56.4,  84.2,  82.7,  78.1,  71.5,  80.9,  70.2,
        70. ,  76.6,  83.4,  75.8,  74. ,  71.8,  77. ,  80.2,  81.3,
        71.6,  56.5,  76. ,  79. ,  75.9,  72.3,  84.6,  69.2,  76.3,
        87.8,  76.1,  76.3,  72.6,  85.8,  85.5,  81.9,  72.7,  81.8,
        82.7,  72.9,  81.1,  86.6,  72. ,  72.6,  77.3,  73.5,  70.7,
        76.4,  70.9,  87.3,  77.5,  75.7,  77.4,  74.9,  87.6,  81.8,
        74.1,  77.7,  69.9,  70. ,  70.8,  81. ,  62.8,  76.8,  82.7,
        71.8,  75.6,  73.6,  84.6,  71.6,  79.8,  79.3,  74.6,  86.2,
        77.5,  72.4,

In [99]:
#lets correct that
trainY= trainY.reshape(len(trainY))
trainY

array([ 1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,
        1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  1.,  0.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,
        0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,
        0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,
        1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,
        0.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,  1.,  1.,  1.,  0

In [100]:
#last value is 1 for alzheimer and 0 control
test=np.array(test).astype(float)
testX=test[:,1:]
testY=test[:,0]

testY=testY.reshape(len(testY))


## Lets run a machine learning algorithm on the data

In [101]:
# import the necessary packages
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn import svm

clf = svm.SVC()
clf.fit(trainX, trainY)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [102]:
prev= clf.predict(testX)
print classification_report(prev, testY)

             precision    recall  f1-score   support

        0.0       0.98      0.95      0.97        63
        1.0       0.92      0.97      0.95        37

avg / total       0.96      0.96      0.96       100



## Lets get run a random forest regressor on the data

In [103]:
import numpy as np
import sklearn as sk
import sklearn.datasets as skd
import sklearn.ensemble as ske
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

reg = ske.RandomForestRegressor()

# it will use only features (ignores the first 5)
reg.fit(trainX[:,5:], trainY);

In [104]:
reg.feature_importances_

array([ 0.01205241,  0.00382566,  0.00229758,  0.        ,  0.04931832,
        0.2659821 ,  0.00806726,  0.00658771,  0.        ,  0.        ,
        0.00104708,  0.00092245,  0.00196883,  0.0016498 ,  0.        ,
        0.00191568,  0.00999203,  0.04340159,  0.03234874,  0.11575901,
        0.        ,  0.00104156,  0.0023519 ,  0.        ,  0.00105692,
        0.00624943,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.0053437 ,  0.        ,  0.01457484,
        0.00267409,  0.00676385,  0.00072939,  0.00665168,  0.00184872,
        0.00362347,  0.00092977,  0.00107406,  0.00183298,  0.        ,
        0.        ,  0.        ,  0.01073779,  0.01520129,  0.00097334,
        0.        ,  0.        ,  0.07440229,  0.00191645,  0.00081071,
        0.00380694,  0.        ,  0.00664167,  0.00279486,  0.00504746,
        0.00465535,  0.00696413,  0.00250531,  0.00262581,  0.01995526,
        0.01316042,  0.00033503,  0.00213649,  0.        ,  0.00

In [105]:
fet_ind = np.argsort(reg.feature_importances_)[::-1]
fet_imp = reg.feature_importances_[fet_ind]

In [106]:
names = f.columns[6:]
print "names", len(names)
fi = open('feature_significance.csv', 'w')
for n, idx in enumerate(fet_ind):
    print "%d,%s,%f " % (n,names[idx], fet_imp[n]) 
    fi.write("%d,%s,%f \n" % (n,names[idx], fet_imp[n]) )
fi.close()

names 136
0,Left Amygdala,0.265982 
1,Left Inf Lat Vent,0.115759 
2,Right Ent entorhinal area,0.074402 
3,Right Amygdala,0.049318 
4,Left Hippocampus,0.043402 
5,Right Inf Lat Vent,0.032349 
6,Right ITG inferior temporal gyrus,0.019955 
7,Left MFC medial frontal cortex,0.018485 
8,Left PO parietal operculum,0.015820 
9,Right CO central operculum,0.015201 
10,Cerebellar Vermal Lobules VI-VII,0.014575 
11,Right MOG middle occipital gyrus,0.014565 
12,Right MTG middle temporal gyrus,0.014240 
13,Left MTG middle temporal gyrus,0.013770 
14,Left ITG inferior temporal gyrus,0.013160 
15,3rd Ventricle,0.012052 
16,Right MFC medial frontal cortex,0.010750 
17,Left Calc calcarine cortex,0.010738 
18,Right Hippocampus,0.009992 
19,Left PP planum polare,0.009427 
20,Left TMP temporal pole,0.008897 
21,Left PrG precentral gyrus,0.008778 
22,Brain Stem,0.008067 
23,Right OFuG occipital fusiform gyrus,0.007495 
24,Right MSFG superior frontal gyrus medial segment,0.007493 
25,Left GRe gyrus rectus,0.

In [111]:
len(reg.feature_importances_)

136

In [117]:
forest = ske.ExtraTreesClassifier()
forest.fit(trainX[:,5:], trainY);

fet_ind = np.argsort(forest.feature_importances_)[::-1]
fet_imp = forest.feature_importances_[fet_ind]

for n, idx in enumerate(fet_ind):
    print "%d,%s,%f " % (n,names[idx], fet_imp[n])

0,Right Inf Lat Vent,0.070306 
1,Left Amygdala,0.044082 
2,Left PHG parahippocampal gyrus,0.037485 
3,Left Ent entorhinal area,0.030246 
4,Left Hippocampus,0.028205 
5,Right Amygdala,0.027289 
6,Left FuG fusiform gyrus,0.026194 
7,Left Inf Lat Vent,0.025366 
8,Right Ent entorhinal area,0.023776 
9,Right TMP temporal pole,0.020047 
10,Right ITG inferior temporal gyrus,0.019582 
11,Right PHG parahippocampal gyrus,0.018970 
12,Left ITG inferior temporal gyrus,0.014881 
13,Left Lateral Ventricle,0.013874 
14,Left TMP temporal pole,0.012661 
15,Right MTG middle temporal gyrus,0.011918 
16,Right IOG inferior occipital gyrus,0.011865 
17,Left MTG middle temporal gyrus,0.010922 
18,Left PO parietal operculum,0.010259 
19,Right SMC supplementary motor cortex,0.009932 
20,Right AIns anterior insula,0.009617 
21,Left IOG inferior occipital gyrus,0.009409 
22,Right Lateral Ventricle,0.009378 
23,Left FRP frontal pole,0.009182 
24,Left OCP occipital pole,0.008989 
25,Right Pallidum,0.008978 
26,Lef

In [122]:
forest = ske.RandomForestClassifier()
forest.fit(trainX[:,5:], trainY);

fet_ind = np.argsort(forest.feature_importances_)[::-1]
fet_imp = forest.feature_importances_[fet_ind]

for n, idx in enumerate(fet_ind):
    print "%d,%s,%f " % (n,names[idx], fet_imp[n])

0,Left Ent entorhinal area,0.077960 
1,Right Amygdala,0.069881 
2,Right Inf Lat Vent,0.052138 
3,Left Amygdala,0.050660 
4,Left PHG parahippocampal gyrus,0.041317 
5,Left MTG middle temporal gyrus,0.038854 
6,Left Inf Lat Vent,0.033726 
7,Left SCA subcallosal area,0.029014 
8,Right Lateral Ventricle,0.027367 
9,Right Hippocampus,0.026869 
10,Left Hippocampus,0.024643 
11,Left Basal Forebrain,0.023913 
12,Right Ent entorhinal area,0.019752 
13,Left TMP temporal pole,0.012128 
14,Right ITG inferior temporal gyrus,0.011732 
15,Right CO central operculum,0.011630 
16,Right TTG transverse temporal gyrus,0.010942 
17,Right PHG parahippocampal gyrus,0.010867 
18,Right ACgG anterior cingulate gyrus,0.010753 
19,Left Thalamus Proper,0.010713 
20,Right PO parietal operculum,0.010661 
21,Left ACgG anterior cingulate gyrus,0.010309 
22,Right MPrG precentral gyrus medial segment,0.010300 
23,Left FO frontal operculum,0.009942 
24,Left MCgG middle cingulate gyrus,0.008873 
25,Right Basal Forebrain,0

In [119]:
len(names)

136

In [120]:
len(names)

136

In [121]:
len(trainX[0])

141