# Feature significance example

In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.gridspec as gridspec
import numpy as np
import matplotlib.cm as cm
import pandas as pd
from nifti import NiftiImage

%matplotlib inline

## Lets open the input file
Here we have one row per patient  and each column corresponds to a variable

From column 6 and on, '3rd Ventricle'  the variables are brain features

In [7]:
f = pd.read_csv('variables.csv')
f.head()

Unnamed: 0,DX_bl,AGE,PTEDUCAT,CDRSB,MMSE,FAQ,3rd Ventricle,4th Ventricle,Right Accumbens Area,Left Accumbens Area,...,Right SPL superior parietal lobule,Left SPL superior parietal lobule,Right STG superior temporal gyrus,Left STG superior temporal gyrus,Right TMP temporal pole,Left TMP temporal pole,Right TrIFG triangular part of the inferior frontal gyrus,Left TrIFG triangular part of the inferior frontal gyrus,Right TTG transverse temporal gyrus,Left TTG transverse temporal gyrus
0,AD,81.3,18,4.5,20,10,2.698475,2.907981,0.464328,0.531223,...,9.256819,9.524609,7.733975,6.767715,7.742289,7.17597,3.924761,4.199281,1.665726,1.69567
1,CN,73.7,16,0.0,29,0,1.533795,1.811766,0.419238,0.437089,...,11.197308,11.037356,7.317633,7.339268,8.354081,7.362619,3.840922,3.667695,1.822566,1.817234
2,AD,73.9,12,5.0,24,11,1.088313,1.364788,0.352754,0.418043,...,10.559799,10.672373,6.593533,6.888605,7.594158,7.097453,3.490602,3.769902,1.462676,1.540255
3,CN,65.4,9,0.0,28,0,1.392329,1.968157,0.372398,0.384105,...,8.074973,8.589588,6.606351,6.715842,7.644831,7.624443,3.304219,3.824991,1.438875,1.556557
4,CN,73.1,18,0.0,29,0,1.463126,1.627738,0.36117,0.400582,...,10.335401,10.100816,6.740743,5.922343,7.601225,7.448935,3.550582,4.310352,1.415202,1.458031


## We transform AD (alzheimer) to 1 and CN (control) to 0

In [8]:
f.DX_bl[f.DX_bl=='AD'] = 1.0
f.DX_bl[f.DX_bl=='CN'] = 0.0
f.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,DX_bl,AGE,PTEDUCAT,CDRSB,MMSE,FAQ,3rd Ventricle,4th Ventricle,Right Accumbens Area,Left Accumbens Area,...,Right SPL superior parietal lobule,Left SPL superior parietal lobule,Right STG superior temporal gyrus,Left STG superior temporal gyrus,Right TMP temporal pole,Left TMP temporal pole,Right TrIFG triangular part of the inferior frontal gyrus,Left TrIFG triangular part of the inferior frontal gyrus,Right TTG transverse temporal gyrus,Left TTG transverse temporal gyrus
0,1,81.3,18,4.5,20,10,2.698475,2.907981,0.464328,0.531223,...,9.256819,9.524609,7.733975,6.767715,7.742289,7.17597,3.924761,4.199281,1.665726,1.69567
1,0,73.7,16,0.0,29,0,1.533795,1.811766,0.419238,0.437089,...,11.197308,11.037356,7.317633,7.339268,8.354081,7.362619,3.840922,3.667695,1.822566,1.817234
2,1,73.9,12,5.0,24,11,1.088313,1.364788,0.352754,0.418043,...,10.559799,10.672373,6.593533,6.888605,7.594158,7.097453,3.490602,3.769902,1.462676,1.540255
3,0,65.4,9,0.0,28,0,1.392329,1.968157,0.372398,0.384105,...,8.074973,8.589588,6.606351,6.715842,7.644831,7.624443,3.304219,3.824991,1.438875,1.556557
4,0,73.1,18,0.0,29,0,1.463126,1.627738,0.36117,0.400582,...,10.335401,10.100816,6.740743,5.922343,7.601225,7.448935,3.550582,4.310352,1.415202,1.458031


## Lets create our training arrays and test arrays

In [20]:
import random

train = np.array(f)
print 'total length ', len(train)

n_samples= 100
train = list(train)
test = []

# we are selecting 100 random subjects for testing
for i in range(n_samples):
  n = random.randrange(0,len(train))
  test.append (train.pop(n) )
    
print 'train array ', len(train)
print 'test array ', len(test)

total length  487
train array  387
test array  100


In [14]:
#first value is 1 for alzheimer and 0 control
train=np.array(train).astype(float)
trainX=train[:,1:]
trainY=train[:,0]

trainX[:, 0]

array([ 73.7,  73.9,  65.4,  73.1,  72.6,  71.7,  64.1,  77.7,  73.2,
        76.2,  80.1,  70.1,  74.5,  74.5,  74.4,  72.9,  74. ,  78.2,
        78.3,  76. ,  70.6,  73.2,  75.3,  80.3,  65.1,  77.2,  79.6,
        70.1,  82.6,  80.4,  72. ,  73.3,  80.2,  65.9,  59.7,  87.7,
        74.1,  77.9,  72.5,  78.3,  70.6,  73.2,  74.8,  72.4,  78.3,
        80.4,  80.5,  78. ,  72.4,  62.9,  74.4,  67.4,  78. ,  80.2,
        80. ,  78. ,  73.8,  85.8,  85.5,  70.4,  65.9,  84.8,  89.1,
        56.4,  84.2,  70.8,  78.1,  82.8,  71.5,  80.9,  70.2,  68.3,
        70. ,  76.6,  83.4,  75.8,  74. ,  71.8,  77. ,  81.3,  73.1,
        71.6,  79. ,  75.9,  75.5,  72.3,  84.6,  69.2,  76.3,  87.8,
        76.1,  76.3,  72.6,  70.2,  73.4,  79.6,  85.5,  81.9,  72.7,
        81.8,  82.7,  72.9,  86.6,  72. ,  72.6,  73.5,  70.7,  76.4,
        70.9,  87.3,  77.5,  75.7,  70.3,  77.4,  87.6,  81.8,  74.1,
        77.7,  69.9,  81. ,  76.8,  82.7,  71.8,  75.6,  73.6,  84.6,
        77. ,  79.8,

In [15]:
#lets correct that
trainY= trainY.reshape(len(trainY))
trainY

array([ 0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,  1.,
        0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  0.,
        0.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  0

In [22]:
#last value is 1 for alzheimer and 0 control
test=np.array(test).astype(float)
testX=test[:,1:]
testY=test[:,0]

testY=testY.reshape(len(testY))


In [23]:
prev= clf.predict(testX)
print classification_report(prev, testY)

             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00        61
        1.0       1.00      1.00      1.00        39

avg / total       1.00      1.00      1.00       100



## Lets run a machine learning algorithm on the data

In [19]:
# import the necessary packages
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn import svm

clf = svm.SVC()
clf.fit(trainX, trainY)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## Lets get run a random forest regressor on the data

In [24]:
import numpy as np
import sklearn as sk
import sklearn.datasets as skd
import sklearn.ensemble as ske
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

reg = ske.RandomForestRegressor()

# it will use only features (ignores the first 5)
reg.fit(trainX[:,5:], trainY);

In [26]:
reg.feature_importances_

array([ 0.0054522 ,  0.01886212,  0.00200681,  0.00562831,  0.04304608,
        0.14105851,  0.00469215,  0.00295104,  0.00282501,  0.00262475,
        0.        ,  0.        ,  0.00089683,  0.        ,  0.00054847,
        0.00234882,  0.01019614,  0.06331761,  0.06512554,  0.05158676,
        0.00341994,  0.        ,  0.00068463,  0.00420329,  0.00103347,
        0.00818031,  0.        ,  0.        ,  0.00070124,  0.00099259,
        0.        ,  0.        ,  0.00207983,  0.00201517,  0.00302138,
        0.00164594,  0.00140248,  0.        ,  0.01272276,  0.01905437,
        0.00392957,  0.        ,  0.00604386,  0.00072228,  0.00098651,
        0.        ,  0.00300925,  0.00512626,  0.00754386,  0.00298859,
        0.00186556,  0.00330962,  0.13946161,  0.03809687,  0.00124525,
        0.00915354,  0.        ,  0.00377613,  0.0039731 ,  0.        ,
        0.        ,  0.        ,  0.00860694,  0.00090179,  0.00336743,
        0.00345559,  0.00101479,  0.0009642 ,  0.        ,  0.00

In [29]:
fet_ind = np.argsort(reg.feature_importances_)[::-1]
fet_imp = reg.feature_importances_[fet_ind]

In [30]:
names = f.columns[6:]
fi = open('feature_significance.csv', 'w')
for n, idx in enumerate(fet_ind):
    print "%d,%s,%f " % (n,names[idx], fet_imp[n]) 
    fi.write("%d,%s,%f \n" % (n,names[idx], fet_imp[n]) )
f.close()

0,Left Amygdala,0.141059 
1,Right Ent entorhinal area,0.139462 
2,Right Inf Lat Vent,0.065126 
3,Left Hippocampus,0.063318 
4,Left Inf Lat Vent,0.051587 
5,Right Amygdala,0.043046 
6,Left Ent entorhinal area,0.038097 
7,Right MFC medial frontal cortex,0.024796 
8,Right PIns posterior insula,0.019356 
9,Left ACgG anterior cingulate gyrus,0.019054 
10,4th Ventricle,0.018862 
11,Right ACgG anterior cingulate gyrus,0.012723 
12,Left MTG middle temporal gyrus,0.011442 
13,Left PO parietal operculum,0.011268 
14,Right SCA subcallosal area,0.011109 
15,Right TTG transverse temporal gyrus,0.010918 
16,Right MCgG middle cingulate gyrus,0.010698 
17,Right MOG middle occipital gyrus,0.010234 
18,Right Hippocampus,0.010196 
19,Right MSFG superior frontal gyrus medial segment,0.010090 
20,Left MFC medial frontal cortex,0.009541 
21,Right MTG middle temporal gyrus,0.009522 
22,Right OFuG occipital fusiform gyrus,0.009163 
23,Left FO frontal operculum,0.009154 
24,Left PoG postcentral gyrus,0.009068 

AttributeError: 'DataFrame' object has no attribute 'close'