In [1]:
from tqdm import tqdm
import numpy as np


raw_train_sentences = []
for i in tqdm(range(10000)):
    five_sentences = ""
    for line in open("./all/data/descriptions_train/" + str(i) + ".txt"):
        five_sentences += line.strip() + " "
    raw_train_sentences.append(five_sentences.strip())
    
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

embeddings_index = {}
f = open('glove.42B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

xtrain_glove = [sent2vec(x) for x in tqdm(raw_train_sentences)]


raw_test_sentences = []
for i in tqdm(range(2000)):
    five_sentences = ""
    for line in open("./all/data/descriptions_test/" + str(i) + ".txt"):
        five_sentences += line.strip() + " "
    raw_test_sentences.append(five_sentences.strip())
    
xtest_glove = [sent2vec(x) for x in tqdm(raw_test_sentences)]

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:14<00:00, 686.14it/s]
1917495it [09:11, 3479.23it/s]
100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:15<00:00, 660.08it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:02<00:00, 863.57it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:02<00:00, 817.95it/s]


In [3]:
import csv

# Reading 2048-d train features 
features_train_2048 = np.zeros((10000,2048))
with open('./all/data/features_train/features_resnet1000intermediate_train.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[13:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_train_2048[int(image_name)] = row
        
# Reading 1000-d train features 
features_train_1000 = np.zeros((10000,1000))
with open('./all/data/features_train/features_resnet1000_train.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[13:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_train_1000[int(image_name)] = row
        
# Find set of supercategories, categories 
supercategory_set = set()
category_set = set()
for i in range (10000):
    file = open("./all/data/tags_train/" + str(i) + ".txt", "r")
    lines = file.readlines() 
    for  line in lines:
        words = line.strip().split(':')
        supercategory_set.add(words[0])
        category_set.add(words[1])
    file.close()

# mapping from (super) category to index
supercategory_dict = {item:val for val, item in enumerate(supercategory_set)}
category_dict = {item:val+1 for val, item in enumerate(category_set)}

# Vectorize train tags
train_tags = []
for i in range (10000):
    file = open("./all/data/tags_train/" + str(i) + ".txt", "r")
    lines = file.readlines() 
    row = np.zeros(len(supercategory_set))
    for line in lines:
        words = line.strip().split(':')		
        supercategory_column = supercategory_dict.get(words[0])
        category_index = category_dict.get(words[1])
        row[supercategory_column] = category_index
    train_tags.append(row)
    file.close()

In [5]:
features_train_concat = np.concatenate((np.array(features_train_2048),np.array(features_train_1000)), axis=1)
features_train_concat = np.concatenate((features_train_concat, np.array(train_tags)), axis=1)

X_train_1 = np.concatenate((xtrain_glove,features_train_concat),axis=1)
Y_train_1 = np.ones(10000)

train_query_reordered = np.concatenate((xtrain_glove[5000:10000],xtrain_glove[0:5000]),axis=0)
X_train_0 = np.concatenate((train_query_reordered,features_train_concat),axis=1)
Y_train_0 = np.zeros(10000)

X_train = np.concatenate((X_train_1,X_train_0),axis=0)
Y_train = np.concatenate((Y_train_1,Y_train_0),axis=0)

In [6]:
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier(max_depth=15, random_state=0,n_estimators=100,verbose=3)
clf_RF.fit(X_train,Y_train)

building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


building tree 2 of 100


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.9s remaining:    0.0s


building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.8min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=3, warm_start=False)

In [8]:
print (clf_RF.predict(X_train[:1]))
print (clf_RF.predict_proba(X_train[:1]))

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[1.]


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


[[0.32932239 0.67067761]]


In [13]:
importance = clf_RF.feature_importances_
importance_i = []
for i in range(len(importance)):
    importance_i.append((importance[i],i))
importance_i.sort()
print(importance_i)

[(2.2840727679568994e-05, 3357), (4.4653728484262924e-05, 814), (4.7120758815693016e-05, 2249), (4.8928217965582544e-05, 2304), (5.4581825680436056e-05, 1114), (5.877551855926196e-05, 559), (5.9151052477624524e-05, 2269), (6.088717699426954e-05, 817), (6.259121018795748e-05, 1176), (6.304285877212332e-05, 1497), (6.349153155637008e-05, 3348), (6.356215975004297e-05, 2030), (6.494246264105714e-05, 3350), (6.505395363551953e-05, 3032), (6.703312236646799e-05, 977), (7.023582644713689e-05, 1509), (7.101418256444516e-05, 2266), (7.232323054879409e-05, 1385), (7.253788941835071e-05, 697), (7.373247410041483e-05, 613), (7.41776373166937e-05, 1711), (7.43238417842321e-05, 465), (7.433297300120435e-05, 430), (7.463383310517408e-05, 841), (7.539154053564674e-05, 558), (7.576976193175678e-05, 1354), (7.589590849820924e-05, 1847), (7.599602610655582e-05, 1069), (7.602076755668594e-05, 1310), (7.607963499207596e-05, 2991), (7.617929960379191e-05, 2455), (7.703213556016607e-05, 1177), (7.7400885306




In [14]:
clf_RF = RandomForestClassifier(n_jobs=-1,random_state=0,verbose=3)
clf_RF.fit(X_train,Y_train)
print (clf_RF.predict(X_train[:1]))
print (clf_RF.predict_proba(X_train[:1]))

building tree 1 of 10building tree 2 of 10building tree 3 of 10building tree 4 of 10building tree 5 of 10building tree 6 of 10building tree 7 of 10


building tree 8 of 10




building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    3.6s remaining:    8.6s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    4.3s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.0s finished
[Parallel(n_jobs=8)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished


[1.]


[Parallel(n_jobs=8)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    0.0s finished


[[0.4 0.6]]


In [15]:
clf_RF = RandomForestClassifier(n_jobs=-1,max_depth=100, random_state=0,n_estimators=500,verbose=3)
clf_RF.fit(X_train,Y_train)
print (clf_RF.predict(X_train[:1]))
print (clf_RF.predict_proba(X_train[:1]))

building tree 1 of 500building tree 2 of 500building tree 3 of 500building tree 4 of 500building tree 5 of 500building tree 6 of 500building tree 7 of 500building tree 8 of 500







building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.1s


building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
building tree 43 of 500
building tree 44 of 500
building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 48 of 500
building tree 49 of 500
building tree 50 of 500
building tree 51 of 500
building tree 52 of 500
building tree 53 of 500
building tree 54 of 500
building tree 55 of 500
building tree 56 of 500
building tree 57 of 500
building tree 58 of 500
building tree 59 of 500
building tree 60 of 500
building tree 61 of 500
building tree 62 of 500
building tree 63 of 500
building tree 64 of 500
building tree 65

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.0min


building tree 120 of 500
building tree 121 of 500
building tree 122 of 500
building tree 123 of 500
building tree 124 of 500
building tree 125 of 500
building tree 126 of 500
building tree 127 of 500
building tree 128 of 500
building tree 129 of 500
building tree 130 of 500
building tree 131 of 500
building tree 132 of 500
building tree 133 of 500
building tree 134 of 500
building tree 135 of 500
building tree 136 of 500
building tree 137 of 500
building tree 138 of 500
building tree 139 of 500
building tree 140 of 500
building tree 141 of 500
building tree 142 of 500
building tree 143 of 500
building tree 144 of 500
building tree 145 of 500
building tree 146 of 500
building tree 147 of 500
building tree 148 of 500
building tree 149 of 500
building tree 150 of 500
building tree 151 of 500
building tree 152 of 500
building tree 153 of 500
building tree 154 of 500
building tree 155 of 500
building tree 156 of 500
building tree 157 of 500
building tree 158 of 500
building tree 159 of 500


[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  2.4min



building tree 281 of 500
building tree 282 of 500
building tree 283 of 500
building tree 284 of 500
building tree 285 of 500
building tree 286 of 500
building tree 287 of 500
building tree 288 of 500
building tree 289 of 500
building tree 290 of 500
building tree 291 of 500
building tree 292 of 500
building tree 293 of 500
building tree 294 of 500
building tree 295 of 500
building tree 296 of 500
building tree 297 of 500
building tree 298 of 500
building tree 299 of 500
building tree 300 of 500
building tree 301 of 500
building tree 302 of 500
building tree 303 of 500
building tree 304 of 500
building tree 305 of 500
building tree 306 of 500
building tree 307 of 500
building tree 308 of 500
building tree 309 of 500
building tree 310 of 500
building tree 311 of 500
building tree 312 of 500
building tree 313 of 500
building tree 314 of 500
building tree 315 of 500
building tree 316 of 500
building tree 317 of 500
building tree 318 of 500
building tree 319 of 500
building tree 320 of 500

[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.4min finished
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished


[1.]


[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 272 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished


[[0.234 0.766]]


In [17]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial', n_jobs=-1,verbose=3).fit(X_train,Y_train)
print (clf.predict(X_train[:1]))
print (clf.predict_proba(X_train[:1]))

[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   21.8s finished


[0.]
[[0.5 0.5]]


In [18]:
clf = LogisticRegression(random_state=0,n_jobs=-1,verbose=3).fit(X_train,Y_train)
print (clf.predict(X_train[:1]))
print (clf.predict_proba(X_train[:1]))

  " = {}.".format(self.n_jobs))


[LibLinear][0.]
[[0.5 0.5]]


In [19]:
clf_RF = RandomForestClassifier(n_jobs=-1,max_depth=100, random_state=0,n_estimators=500,verbose=3,max_features=None)
clf_RF.fit(X_train,Y_train)

building tree 1 of 500building tree 2 of 500building tree 3 of 500building tree 4 of 500building tree 5 of 500building tree 6 of 500
building tree 7 of 500building tree 8 of 500






building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 10.5min


building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Sonia\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-e0b294d4dc40>", line 2, in <module>
    clf_RF.fit(X_train,Y_train)
  File "C:\Users\Sonia\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py", line 328, in fit
    for i, t in enumerate(trees))
  File "C:\Users\Sonia\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 789, in __call__
    self.retrieve()
  File "C:\Users\Sonia\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 699, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "C:\Users\Sonia\Anaconda3\lib\multiprocessing\pool.py", line 638, in get
    self.wait(timeout)
  File "C:\Users\Sonia\Anaconda3\lib\multiprocessing\pool.py", line 635, in wait
    self._event.wait(timeout)
  File "C:\Users\Sonia\Anaconda3\lib\threading.py", line 551, in

KeyboardInterrupt: 

In [20]:
print(np.amax(X_train))
print(np.amin(X_train))

80.0
-9.151806831359863


In [21]:
print (X_train[0])

[-6.59739673e-02  6.31461572e-03  1.76676270e-02 ...  0.00000000e+00
  4.10000000e+01  2.10000000e+01]


In [24]:
X_train_random = []
import random
for i in range(len(X_train)):
    X_train_random.append(np.append(X_train[i],np.array([random.randint(1,80)])))
print(X_train.shape)

(20000, 3360)


In [26]:
X_train_random = np.array(X_train_random)
print(X_train_random.shape)

(20000, 3361)


In [32]:
clf_RF = RandomForestClassifier(max_depth=100, random_state=0,n_estimators=500,verbose=3,n_jobs=-1)
clf_RF.fit(X_train_random,Y_train)

building tree 1 of 500building tree 2 of 500building tree 3 of 500building tree 4 of 500
building tree 5 of 500building tree 6 of 500
building tree 7 of 500
building tree 8 of 500




building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.3s


building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500building tree 36 of 500

building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
building tree 43 of 500
building tree 44 of 500
building tree 45 of 500
building tree 46 of 500
building tree 47 of 500
building tree 48 of 500
building tree 49 of 500
building tree 50 of 500
building tree 51 of 500
building tree 52 of 500
building tree 53 of 500
building tree 54 of 500
building tree 55 of 500
building tree 56 of 500
building tree 57 of 500
building tree 58 of 500
building tree 59 of 500
building tree 60 of 500
building tree 61 of 500
building tree 62 of 500
building tree 63 of 500
building tree 64 of 500
building tree 65

[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   29.7s



building tree 121 of 500
building tree 122 of 500
building tree 123 of 500
building tree 124 of 500
building tree 125 of 500
building tree 126 of 500
building tree 127 of 500
building tree 128 of 500building tree 129 of 500

building tree 130 of 500
building tree 131 of 500
building tree 132 of 500
building tree 133 of 500
building tree 134 of 500
building tree 135 of 500
building tree 136 of 500
building tree 137 of 500
building tree 138 of 500
building tree 139 of 500
building tree 140 of 500
building tree 141 of 500
building tree 142 of 500
building tree 143 of 500
building tree 144 of 500
building tree 145 of 500
building tree 146 of 500
building tree 147 of 500
building tree 148 of 500
building tree 149 of 500
building tree 150 of 500
building tree 151 of 500
building tree 152 of 500
building tree 153 of 500
building tree 154 of 500
building tree 155 of 500
building tree 156 of 500
building tree 157 of 500
building tree 158 of 500
building tree 159 of 500
building tree 160 of 500

[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.4min


building tree 280 of 500
building tree 281 of 500
building tree 282 of 500building tree 283 of 500

building tree 284 of 500
building tree 285 of 500
building tree 286 of 500
building tree 287 of 500
building tree 288 of 500
building tree 289 of 500
building tree 290 of 500
building tree 291 of 500
building tree 292 of 500
building tree 293 of 500
building tree 294 of 500
building tree 295 of 500
building tree 296 of 500building tree 297 of 500

building tree 298 of 500
building tree 299 of 500
building tree 300 of 500
building tree 301 of 500
building tree 302 of 500
building tree 303 of 500
building tree 304 of 500
building tree 305 of 500
building tree 306 of 500
building tree 307 of 500
building tree 308 of 500
building tree 309 of 500
building tree 310 of 500
building tree 311 of 500
building tree 312 of 500
building tree 313 of 500
building tree 314 of 500
building tree 315 of 500
building tree 316 of 500
building tree 317 of 500
building tree 318 of 500
building tree 319 of 500


[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  3.3min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=100, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=0, verbose=3, warm_start=False)

In [34]:
importance = clf_RF.feature_importances_
importance_i = []
for i in range(len(importance)):
    importance_i.append((importance[i],i))

meaningless = importance_i[-1]
print(meaningless)

(0.0004025596425334409, 3360)


In [37]:
importance = clf_RF.feature_importances_
meaningless = importance[-1]
print(meaningless)

print (len(importance))

features_index = []
dummy=[]
for i in range(len(importance) - 1):
    if importance[i] < meaningless:
        features_index.append(i)
    else:
        dummy.append(i)
        
print (len(features_index))
print (dummy)

0.0004025596425334409
3361
2934
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 21

In [38]:
print (len(dummy))

426


In [39]:
print(importance_i)

[(0.0011104435088609833, 0), (0.0009969374194456712, 1), (0.0008772856785959365, 2), (0.0009535558031429365, 3), (0.002262408600180496, 4), (0.0009431997974202156, 5), (0.0007485060505604918, 6), (0.000636194059288148, 7), (0.0008728143326385663, 8), (0.001845522868958355, 9), (0.0011863919787514522, 10), (0.0007680132921855197, 11), (0.0009134764381941141, 12), (0.0011525848601704645, 13), (0.002032820461153131, 14), (0.0007640279583414187, 15), (0.0013208074187690806, 16), (0.0009898402083126023, 17), (0.0008750807746353608, 18), (0.0010195445614205013, 19), (0.0009073503205158386, 20), (0.000997192066550147, 21), (0.0009278696966924716, 22), (0.0011953077948087274, 23), (0.0007358840987014151, 24), (0.000908297502356151, 25), (0.000725612642753275, 26), (0.0007965236587766691, 27), (0.0010592532209052553, 28), (0.002618311978474563, 29), (0.001107553211122022, 30), (0.0006665791185616742, 31), (0.0010530475446724687, 32), (0.0007972625474287184, 33), (0.001320762489343201, 34), (0.0




In [40]:
clf_RF = RandomForestClassifier(max_depth=15, random_state=0,n_estimators=100,verbose=3)
clf_RF.fit(X_train,Y_train)

building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s remaining:    0.0s


building tree 2 of 100


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.5min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=0, verbose=3, warm_start=False)

In [41]:
importance = clf_RF.feature_importances_
importance_i = []
for i in range(len(importance)):
    importance_i.append((importance[i],i))
importance_i.sort()
print(importance_i)

[(2.2840727679568994e-05, 3357), (4.4653728484262924e-05, 814), (4.7120758815693016e-05, 2249), (4.8928217965582544e-05, 2304), (5.4581825680436056e-05, 1114), (5.877551855926196e-05, 559), (5.9151052477624524e-05, 2269), (6.088717699426954e-05, 817), (6.259121018795748e-05, 1176), (6.304285877212332e-05, 1497), (6.349153155637008e-05, 3348), (6.356215975004297e-05, 2030), (6.494246264105714e-05, 3350), (6.505395363551953e-05, 3032), (6.703312236646799e-05, 977), (7.023582644713689e-05, 1509), (7.101418256444516e-05, 2266), (7.232323054879409e-05, 1385), (7.253788941835071e-05, 697), (7.373247410041483e-05, 613), (7.41776373166937e-05, 1711), (7.43238417842321e-05, 465), (7.433297300120435e-05, 430), (7.463383310517408e-05, 841), (7.539154053564674e-05, 558), (7.576976193175678e-05, 1354), (7.589590849820924e-05, 1847), (7.599602610655582e-05, 1069), (7.602076755668594e-05, 1310), (7.607963499207596e-05, 2991), (7.617929960379191e-05, 2455), (7.703213556016607e-05, 1177), (7.7400885306




In [42]:
for r in range(len(importance_i)):
    a = importance_i[r][1]
    if (a>=0 and a <300):
        print(r)

2809
2831
2888
2889
2892
2918
2929
2935
2948
2950
2951
2957
2961
2962
2978
2979
2981
2982
2984
2985
2989
2990
2991
2992
3001
3002
3007
3014
3017
3020
3022
3024
3025
3028
3029
3032
3035
3036
3037
3039
3040
3042
3043
3044
3046
3048
3049
3053
3055
3056
3059
3061
3062
3066
3067
3069
3074
3075
3076
3078
3079
3080
3081
3082
3083
3086
3087
3088
3090
3091
3093
3094
3097
3098
3100
3101
3102
3104
3105
3106
3108
3109
3110
3111
3114
3115
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3132
3133
3134
3135
3136
3137
3138
3140
3141
3143
3144
3146
3147
3148
3150
3151
3152
3153
3154
3155
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3171
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3188
3190
3191
3193
3194
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3211
3212
3214
3215
3216
3219
3220
3221
3223
3224
3225
3226
3227
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3246
3247
3248
3249
3250
3251
3254


In [43]:
print (len(importance_i))

3360
