In [2]:
import os

import pandas as pd
import numpy as np

from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [3]:
TRAIN_DATA = os.path.join('..', 'data', 'processed', 'train_balanced.csv')
TEST_DATA = os.path.join('..', 'data', 'processed', 'test.csv')

df_train = pd.read_csv(TRAIN_DATA)
df_test = pd.read_csv(TEST_DATA)
FEATURES = [
    # 'UserID',
    # 'UUID',
    # 'Version',
    # 'TimeStemp',
    'GyroscopeStat_x_MEAN',
    'GyroscopeStat_z_MEAN',
    'GyroscopeStat_COV_z_x',
    'GyroscopeStat_COV_z_y',
    'MagneticField_x_MEAN',
    'MagneticField_z_MEAN',
    'MagneticField_COV_z_x',
    'MagneticField_COV_z_y',
    'Pressure_MEAN',
    'LinearAcceleration_COV_z_x',
    'LinearAcceleration_COV_z_y',
    'LinearAcceleration_x_MEAN',
    'LinearAcceleration_z_MEAN',
    # 'attack'
    ]


x_train = df_train[FEATURES]
y_train = df_train['attack']

x_test = df_test[FEATURES]
y_test = df_test['attack']


In [5]:
clf = RandomForestClassifier(n_estimators=512, n_jobs=-1)


In [6]:
param_dist = {  "max_features": ['auto', 'sqrt'],
                "max_depth": [10,9,8,7,6,5,4,3,2,None],
                "min_samples_split": sp_randint(2, 50),
                "min_samples_leaf": sp_randint(1, 50),
                "bootstrap": [True, False],
                "criterion": ["gini","entropy"]}

random_search = RandomizedSearchCV(clf,
            scoring= 'f1_micro',
            param_distributions = param_dist,
            n_iter = 10, verbose=1)

random_search.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   37.7s finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators=512, n_jobs=-1,
                                                  

In [7]:
def report(results, n_top=3): # Función para mostrar resultados
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
report(random_search.cv_results_)

Model with rank: 1
Mean validation score: 0.908 (std: 0.019)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 21}

Model with rank: 2
Mean validation score: 0.904 (std: 0.022)
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 48}

Model with rank: 3
Mean validation score: 0.877 (std: 0.024)
Parameters: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 16, 'min_samples_split': 9}



In [13]:
clf_rf = RandomForestClassifier(n_estimators = 512, criterion = 'gini',#'entropy', 
                                max_depth=5, max_features = 'auto', 
                                min_samples_leaf = 2, min_samples_split = 21,
                                bootstrap=False, n_jobs=-1, 
                                class_weight=None)

clf_rf.fit(x_train, y_train) # Construcción del modelo

preds_rf = clf_rf.predict(x_test) # Test del modelo


In [14]:
from sklearn.metrics import classification_report

print("Random Forest: \n",
      classification_report(y_true=y_test, y_pred=preds_rf))

# Confussion Matrix

print("Confussion Matrixn:\n")
matriz = pd.crosstab(y_test, preds_rf, rownames=['actual'], colnames=['preds'])
print(matriz)

# Variables relevantes

print("Feature Relevance:\n")
print(pd.DataFrame({'Feature': FEATURES ,
              'Relevancy': clf_rf.feature_importances_}),"\n")
print("Maximum relevance RF :" , max(clf_rf.feature_importances_), "\n")

Random Forest: 
               precision    recall  f1-score   support

           0       1.00      0.96      0.98      4896
           1       0.04      1.00      0.08         8

    accuracy                           0.96      4904
   macro avg       0.52      0.98      0.53      4904
weighted avg       1.00      0.96      0.98      4904

Confussion Matrixn:

preds      0    1
actual           
0       4715  181
1          0    8
Feature Relevance:

                       Feature  Relevancy
0         GyroscopeStat_x_MEAN   0.121315
1         GyroscopeStat_z_MEAN   0.064074
2        GyroscopeStat_COV_z_x   0.097467
3        GyroscopeStat_COV_z_y   0.199771
4         MagneticField_x_MEAN   0.019682
5         MagneticField_z_MEAN   0.068737
6        MagneticField_COV_z_x   0.032932
7        MagneticField_COV_z_y   0.025690
8                Pressure_MEAN   0.026069
9   LinearAcceleration_COV_z_x   0.183760
10  LinearAcceleration_COV_z_y   0.115922
11   LinearAcceleration_x_MEAN   0.0210