# Feature selection by Gini

In [2]:
import joblib as jl
# Feature matrix used for every selection step below (rows = samples, columns = features).
# NOTE(review): joblib.load unpickles its input, so only load artifacts from a trusted source.
x_1gram_data = jl.load('x_data_1gram_count.jl')
# presumably the names of the feature columns (hex byte values), judging by the file name — TODO confirm
fea_1gram_list = jl.load('hexnum_list.jl')

In [15]:
# Sanity check of the loaded matrix: (number of samples, number of features).
x_1gram_data.shape # 256 features

(53080, 256)

In [22]:
from sklearn import tree

# Class labels, one per row of x_1gram_data.
y_data = jl.load('sample_y.jl')
# criterion defaults to 'gini', so feature_importances_ reflects each feature's
# total reduction in Gini impurity (pass criterion='entropy' for information gain).
# random_state is fixed because the tree permutes candidate features at every
# split; without a seed, equally good splits are tie-broken differently on each
# run and the importance ranking is not reproducible.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(x_1gram_data, y_data)

In [25]:
# Column indices of the 100 most important features, most important first.
# argsort is ascending, so reverse the ranking and keep its leading 100 entries.
index_gini = clf.feature_importances_.argsort()[::-1][:100]

In [26]:
# Selected feature column indices, ordered from most to least important.
index_gini

array([ 82, 178,  20,   4, 117,  84,  73, 127,  97,  86,  68, 193, 200,
        92, 226,   5,  85,  79, 141,   7, 167,  41,  98, 224,  46, 165,
       148,  83, 132, 113,  69,  75,  87, 130,  38, 106,  52, 131, 111,
       176, 149, 216,  72, 139,  89,  95, 194,  33,  96,  32,  50, 211,
       142, 220, 202, 150, 201, 210,  70,   2,  19, 179,  26, 156, 112,
        29,  44, 196, 138, 109,   0, 143,  14,  30,   8, 222, 116, 225,
        43, 213, 135,  74, 159, 219,  28,  11, 118,  39,  94,  15, 103,
        17,  13, 208, 100, 212, 190,  18,   1, 221])

In [27]:
# Importance scores of the selected features, in descending order
# (position i here is the score of feature index_gini[i]).
clf.feature_importances_[index_gini]

array([ 0.65020845,  0.03055145,  0.02663763,  0.0180863 ,  0.01764678,
        0.01579626,  0.00770341,  0.00625089,  0.00579705,  0.00551854,
        0.00542049,  0.0043786 ,  0.00420017,  0.00418254,  0.00400344,
        0.0039669 ,  0.00382641,  0.00375717,  0.00361299,  0.00359737,
        0.00348964,  0.00304117,  0.0030107 ,  0.00290184,  0.0028556 ,
        0.00283379,  0.00282371,  0.00276759,  0.00273228,  0.00257945,
        0.00255858,  0.00247988,  0.00245132,  0.00237445,  0.00230947,
        0.00228631,  0.00228349,  0.00227456,  0.00221821,  0.0021832 ,
        0.00216814,  0.00211726,  0.00208029,  0.00194125,  0.00189284,
        0.00184418,  0.00180992,  0.00178687,  0.00176755,  0.00176179,
        0.00172712,  0.00172283,  0.00170906,  0.0016851 ,  0.00164527,
        0.00156932,  0.00153189,  0.00151799,  0.00151754,  0.00150734,
        0.00149456,  0.00141896,  0.00140464,  0.00138463,  0.00136834,
        0.00136261,  0.00131474,  0.00130094,  0.00127897,  0.00

In [13]:
# Keep only the columns ranked in the top 100 by Gini importance.
# Uses index_gini explicitly: the bare name `index` is not defined anywhere in
# this notebook, so the cell failed on a fresh kernel.
x_selected = x_1gram_data[:, index_gini]

In [14]:
x_selected.shape # 100 of the 256 features selected

(53080, 100)

# Feature selection by Information Gain

In [28]:
from sklearn import tree

# Class labels, one per row of x_1gram_data.
y_data = jl.load('sample_y.jl')
# criterion='entropy' makes the tree split on information gain instead of the
# default Gini impurity, so feature_importances_ now ranks features by
# information gain. random_state is fixed so that tie-breaking between equally
# good splits, and therefore the ranking, is reproducible across runs.
# Note: this rebinds clf; the Gini-based tree fitted earlier is no longer
# reachable under this name.
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
clf = clf.fit(x_1gram_data, y_data)

In [29]:
# Column indices of the 100 most important features, most important first.
# argsort is ascending, so reverse the ranking and keep its leading 100 entries.
index_infogain = clf.feature_importances_.argsort()[::-1][:100]

In [30]:
# Selected feature column indices, ordered from most to least important.
index_infogain

array([ 82, 111, 130,  20, 178, 117,  73, 201,  79,  89, 131, 226,  83,
        30, 181,   7,  70,  94,  84, 123,  16,  13, 104,  68, 165,  99,
       211,  47, 106, 141,   0, 219, 113,  85, 144,  51, 152,  60, 213,
        96,  44, 176, 183,  88,  19,  34,  52, 103, 142,  41,  98, 220,
        33, 192,  92, 157, 114,  87,  28, 187, 200, 194,  39, 137, 182,
        26, 179,  48, 148, 210, 163,   8,  56, 132,  32, 116,  53,   1,
        55,  38, 149,  54,  23, 164,  75, 154, 129, 214,   5,  57,  43,
       120,  35, 119,  78,  17,  24,   4, 110, 125])

In [31]:
# Importance scores of the selected features, in descending order
# (position i here is the score of feature index_infogain[i]).
clf.feature_importances_[index_infogain]

array([ 0.55758536,  0.03647461,  0.03279714,  0.03107535,  0.02361915,
        0.01598621,  0.01455498,  0.01035915,  0.007868  ,  0.00755263,
        0.00728152,  0.00643142,  0.00638054,  0.00634877,  0.0060165 ,
        0.00599335,  0.0048804 ,  0.00485054,  0.00436165,  0.00430109,
        0.00414421,  0.00410998,  0.00408367,  0.00401372,  0.00384321,
        0.0038064 ,  0.00375437,  0.0036102 ,  0.00348271,  0.00343168,
        0.00340571,  0.00313639,  0.00309622,  0.00297357,  0.00294184,
        0.0027363 ,  0.00263638,  0.00258408,  0.00253766,  0.00250399,
        0.00249101,  0.00244437,  0.00242677,  0.00238536,  0.00235906,
        0.00232526,  0.00226297,  0.0022328 ,  0.0021853 ,  0.00216697,
        0.00211706,  0.00202823,  0.0020123 ,  0.00200536,  0.00193664,
        0.00188293,  0.00185048,  0.00183179,  0.00178988,  0.00175411,
        0.00168146,  0.0016705 ,  0.0016524 ,  0.00163426,  0.00163263,
        0.00162179,  0.00160587,  0.00158853,  0.00155302,  0.00

In [32]:
# Keep only the columns ranked in the top 100 by information gain.
# Note: this rebinds x_selected, replacing the selection made in the Gini section.
x_selected = x_1gram_data[:, index_infogain]

In [33]:
x_selected.shape # 100 of the 256 features selected

(53080, 100)

# Features selected by information gain & Gini are different

In [34]:
# Feature indices chosen by the information-gain ranking but absent from the Gini ranking.
set(index_infogain).difference(index_gini)

{16,
 23,
 24,
 34,
 35,
 47,
 48,
 51,
 53,
 54,
 55,
 56,
 57,
 60,
 78,
 88,
 99,
 104,
 110,
 114,
 119,
 120,
 123,
 125,
 129,
 137,
 144,
 152,
 154,
 157,
 163,
 164,
 181,
 182,
 183,
 187,
 192,
 214}