# Feature significance example

In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.gridspec as gridspec
import numpy as np
import matplotlib.cm as cm
import pandas as pd
from nifti import NiftiImage

%matplotlib inline

## Lets open the input file
Here we have one row per patient  and each column corresponds to a variable

From column 6 and on, '3rd Ventricle'  the variables are brain features

In [2]:
f = pd.read_csv('variables.csv')
f.head()

Unnamed: 0,DX_bl,AGE,PTEDUCAT,CDRSB,MMSE,FAQ,3rd Ventricle,4th Ventricle,Right Accumbens Area,Left Accumbens Area,...,Right SPL superior parietal lobule,Left SPL superior parietal lobule,Right STG superior temporal gyrus,Left STG superior temporal gyrus,Right TMP temporal pole,Left TMP temporal pole,Right TrIFG triangular part of the inferior frontal gyrus,Left TrIFG triangular part of the inferior frontal gyrus,Right TTG transverse temporal gyrus,Left TTG transverse temporal gyrus
0,AD,81.3,18,4.5,20,10,2.698475,2.907981,0.464328,0.531223,...,9.256819,9.524609,7.733975,6.767715,7.742289,7.17597,3.924761,4.199281,1.665726,1.69567
1,CN,73.7,16,0.0,29,0,1.533795,1.811766,0.419238,0.437089,...,11.197308,11.037356,7.317633,7.339268,8.354081,7.362619,3.840922,3.667695,1.822566,1.817234
2,AD,73.9,12,5.0,24,11,1.088313,1.364788,0.352754,0.418043,...,10.559799,10.672373,6.593533,6.888605,7.594158,7.097453,3.490602,3.769902,1.462676,1.540255
3,CN,65.4,9,0.0,28,0,1.392329,1.968157,0.372398,0.384105,...,8.074973,8.589588,6.606351,6.715842,7.644831,7.624443,3.304219,3.824991,1.438875,1.556557
4,CN,73.1,18,0.0,29,0,1.463126,1.627738,0.36117,0.400582,...,10.335401,10.100816,6.740743,5.922343,7.601225,7.448935,3.550582,4.310352,1.415202,1.458031


## We transform AD (alzheimer) to 1 and CN (control) to 0

In [3]:
f.DX_bl[f.DX_bl=='AD'] = 1.0
f.DX_bl[f.DX_bl=='CN'] = 0.0
f.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app


Unnamed: 0,DX_bl,AGE,PTEDUCAT,CDRSB,MMSE,FAQ,3rd Ventricle,4th Ventricle,Right Accumbens Area,Left Accumbens Area,...,Right SPL superior parietal lobule,Left SPL superior parietal lobule,Right STG superior temporal gyrus,Left STG superior temporal gyrus,Right TMP temporal pole,Left TMP temporal pole,Right TrIFG triangular part of the inferior frontal gyrus,Left TrIFG triangular part of the inferior frontal gyrus,Right TTG transverse temporal gyrus,Left TTG transverse temporal gyrus
0,1,81.3,18,4.5,20,10,2.698475,2.907981,0.464328,0.531223,...,9.256819,9.524609,7.733975,6.767715,7.742289,7.17597,3.924761,4.199281,1.665726,1.69567
1,0,73.7,16,0.0,29,0,1.533795,1.811766,0.419238,0.437089,...,11.197308,11.037356,7.317633,7.339268,8.354081,7.362619,3.840922,3.667695,1.822566,1.817234
2,1,73.9,12,5.0,24,11,1.088313,1.364788,0.352754,0.418043,...,10.559799,10.672373,6.593533,6.888605,7.594158,7.097453,3.490602,3.769902,1.462676,1.540255
3,0,65.4,9,0.0,28,0,1.392329,1.968157,0.372398,0.384105,...,8.074973,8.589588,6.606351,6.715842,7.644831,7.624443,3.304219,3.824991,1.438875,1.556557
4,0,73.1,18,0.0,29,0,1.463126,1.627738,0.36117,0.400582,...,10.335401,10.100816,6.740743,5.922343,7.601225,7.448935,3.550582,4.310352,1.415202,1.458031


## Lets create our training arrays and test arrays

In [4]:
import random

train = np.array(f)
print 'total length ', len(train)

n_samples= 100
train = list(train)
test = []

# we are selecting 100 random subjects for testing
for i in range(n_samples):
  n = random.randrange(0,len(train))
  test.append (train.pop(n) )
    
print 'train array ', len(train)
print 'test array ', len(test)

total length  487
train array  387
test array  100


In [5]:
#first value is 1 for alzheimer and 0 control
train=np.array(train).astype(float)
trainX=train[:,1:]
trainY=train[:,0]

trainX[:, 0]

array([ 81.3,  73.7,  73.9,  73.1,  72.6,  71.7,  77.7,  73.2,  76.2,
        70.1,  74.5,  74.4,  74. ,  78.2,  78.3,  70.6,  73.2,  75.3,
        65.1,  77.2,  82.6,  80.4,  72. ,  73.3,  73.7,  70.6,  80.2,
        73.1,  59.7,  87.7,  74.1,  72.5,  76.4,  78.3,  70.6,  74.8,
        78.3,  80.4,  80.5,  78. ,  72.4,  62.9,  84.3,  74.4,  80.2,
        73.8,  78.5,  85.8,  70.4,  78.4,  84.8,  89.1,  56.4,  84.2,
        70.8,  82.7,  82.8,  71.5,  80.9,  70.2,  68.3,  70. ,  76.6,
        75.8,  74. ,  77. ,  80.2,  81.3,  73.1,  71.6,  56.5,  76. ,
        79. ,  75.5,  72.3,  84.6,  69.2,  76.3,  87.8,  70.2,  73.4,
        85.8,  85.5,  85. ,  81.9,  72.7,  81.8,  82.7,  81.1,  86.6,
        72. ,  72.6,  77.3,  73.5,  70.7,  70.9,  87.3,  77.5,  75.7,
        70.3,  74.9,  87.6,  81.8,  74.1,  77.7,  69.9,  70. ,  70.8,
        62.8,  76.8,  72.2,  71.8,  75.6,  73.6,  71.6,  77. ,  79.8,
        79.3,  74.6,  86.2,  77.5,  72.4,  77. ,  77.9,  76.6,  75.6,
        68.5,  79. ,

In [6]:
#lets correct that
trainY= trainY.reshape(len(trainY))
trainY

array([ 1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,
        0.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  1.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,
        0.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1

In [9]:
#last value is 1 for alzheimer and 0 control
test=np.array(test).astype(float)
testX=test[:,1:]
testY=test[:,0]

testY=testY.reshape(len(testY))


## Lets run a machine learning algorithm on the data

In [10]:
# import the necessary packages
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn import svm

clf = svm.SVC()
clf.fit(trainX, trainY)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [11]:
prev= clf.predict(testX)
print classification_report(prev, testY)

             precision    recall  f1-score   support

        0.0       0.94      0.97      0.96        69
        1.0       0.93      0.87      0.90        31

avg / total       0.94      0.94      0.94       100



## Lets get run a random forest regressor on the data

In [12]:
import numpy as np
import sklearn as sk
import sklearn.datasets as skd
import sklearn.ensemble as ske
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

reg = ske.RandomForestRegressor()

# it will use only features (ignores the first 5)
reg.fit(trainX[:,5:], trainY);

In [13]:
reg.feature_importances_

array([ 0.00195836,  0.01008958,  0.        ,  0.00045924,  0.10127054,
        0.18752797,  0.00631767,  0.00167794,  0.        ,  0.00198444,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.02255744,  0.02336296,  0.08113542,  0.05819937,
        0.        ,  0.00109409,  0.00289051,  0.0032566 ,  0.00145878,
        0.00753477,  0.        ,  0.00231723,  0.        ,  0.00738418,
        0.        ,  0.        ,  0.00838006,  0.        ,  0.00108751,
        0.00379865,  0.        ,  0.        ,  0.00648649,  0.01207665,
        0.01088286,  0.00261827,  0.        ,  0.        ,  0.00488345,
        0.00103549,  0.        ,  0.02209438,  0.02550546,  0.0038522 ,
        0.00054709,  0.00549215,  0.07718841,  0.00479609,  0.        ,
        0.        ,  0.0037477 ,  0.00190827,  0.00131635,  0.        ,
        0.        ,  0.        ,  0.00483105,  0.00251631,  0.02019486,
        0.01225598,  0.        ,  0.        ,  0.        ,  0.00

In [14]:
fet_ind = np.argsort(reg.feature_importances_)[::-1]
fet_imp = reg.feature_importances_[fet_ind]

In [15]:
names = f.columns[6:]
fi = open('feature_significance.csv', 'w')
for n, idx in enumerate(fet_ind):
    print "%d,%s,%f " % (n,names[idx], fet_imp[n]) 
    fi.write("%d,%s,%f \n" % (n,names[idx], fet_imp[n]) )
f.close()

0,Left Amygdala,0.187528 
1,Right Amygdala,0.101271 
2,Right Inf Lat Vent,0.081135 
3,Right Ent entorhinal area,0.077188 
4,Left Inf Lat Vent,0.058199 
5,Right CO central operculum,0.025505 
6,Left Hippocampus,0.023363 
7,Right Hippocampus,0.022557 
8,Left Calc calcarine cortex,0.022094 
9,Right ITG inferior temporal gyrus,0.020195 
10,Right MSFG superior frontal gyrus medial segment,0.018543 
11,Right PIns posterior insula,0.013864 
12,Left ITG inferior temporal gyrus,0.012256 
13,Left ACgG anterior cingulate gyrus,0.012077 
14,Left PP planum polare,0.010896 
15,Right AIns anterior insula,0.010883 
16,Right SCA subcallosal area,0.010385 
17,4th Ventricle,0.010090 
18,Right PO parietal operculum,0.008748 
19,Left MFC medial frontal cortex,0.008387 
20,Optic Chiasm,0.008380 
21,Left Putamen,0.007535 
22,Left Ventral DC,0.007384 
23,Right PoG postcentral gyrus,0.007151 
24,Right MCgG middle cingulate gyrus,0.006689 
25,Left MSFG superior frontal gyrus medial segment,0.006671 
26,Left STG

AttributeError: 'DataFrame' object has no attribute 'close'