In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_regression, mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [2]:
data = pd.read_csv(r"Datos/dataset.csv")
labels = pd.read_csv(r"Datos/labels.csv")

In [3]:
data_limpio = data.copy(deep=True)
data_limpio.drop('En.Anterior.', inplace=True, axis=1)

var_thr = VarianceThreshold(threshold = 0.01)

var_thr.fit(data_limpio.iloc[:,2: ])
var_thr.get_support()
concol = [column for column in data_limpio.iloc[:,2: ].columns 
          if column not in data_limpio.iloc[:,2: ].columns[var_thr.get_support()]]


data_limpio.drop(concol, inplace=True, axis=1)
data_numeric = pd.get_dummies(data['idEye'])
data_num = pd.concat([data , data_numeric], axis=1).reindex(data.index)
data_num.drop(columns=['idEye', 'En.Anterior.'], axis=1, inplace=True)
le = LabelEncoder()
data_le = data_limpio.copy(deep = True)
label = le.fit_transform(data_le['idEye'].values)
data_le.drop('idEye', axis=1, inplace=True)
data_le['idEye'] = label
X =data_le.iloc[:,2:].values
y= labels.iloc[:,-1].values
le = LabelEncoder()
for i in range(X.shape[1]):
    X[:,i] = le.fit_transform(X[:,i])
y = le.fit_transform(y)
type(X)
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)
classes = np.unique(y)
nClasses = len(classes)
print('Total number of outputs : ', nClasses)
print('Output classes : ', classes)
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.85,random_state=42)

Total number of outputs :  4
Output classes :  [0 1 2 3]


In [4]:
pca = PCA(n_components=4)
 
X_train_pca = pca.fit_transform(X_train) #fit para el entrenamiento
X_test_pca = pca.transform(X_test)

In [5]:
param_grid = {
    'n_estimators': (200, 300, 400, 500, 600, 700),
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=None)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                          cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_pca, y_train)

print("Parámetros:", grid_search.best_params_)
best_model1 = grid_search.best_estimator_

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Parámetros: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}


In [6]:
param_dist = {
    'n_estimators': randint(200, 700),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['sqrt', 'log2', None]
}

rf = RandomForestClassifier(random_state=None)
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                 n_iter=200, cv=3, n_jobs=-1, verbose=2)
random_search.fit(X_train_pca, y_train)

print("Parámetros:", random_search.best_params_)
best_model2 = random_search.best_estimator_

Fitting 3 folds for each of 200 candidates, totalling 600 fits
Parámetros: {'max_depth': 18, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 226}
