# Scikit-Learn

In [74]:
import numpy as np
from sklearn.model_selection import train_test_split

In [75]:
# Load the balanced dataset. The archive is opened once and the context
# manager closes the underlying file as soon as both arrays have been read
# (the original opened the same .npz twice and never closed either handle).
with np.load('particiones-datos-balanceados.npz') as datos_balanceados:
    X = datos_balanceados['X']
    Y = datos_balanceados['Y']

In [76]:
# Display the X array loaded from the balanced .npz file.
X

array([[-0.75275929,  1.11852895, -7.5592353 ],
       [ 2.70428584, -3.60506139, -0.0964618 ],
       [ 1.39196365, -2.07855351, -9.31222958],
       [ 0.59195091, -1.33638157,  8.18640804],
       [-2.06388816, -0.43930016, -4.82440037],
       [-2.06403288,  2.85175961,  3.25044569],
       [-2.65149833, -3.00326218, -3.76577848],
       [ 2.19705687,  0.14234438,  0.40136042],
       [ 0.60669007,  0.92414569,  0.93420559],
       [ 1.24843547, -4.53549587, -6.30291089],
       [-2.87649303,  1.07544852,  9.39169256],
       [ 2.81945911, -3.29475876,  5.50265647],
       [ 1.99465584, -4.34948407,  8.78997883],
       [-1.72596534,  4.48885537,  7.89654701],
       [-1.9090502 ,  4.65632033,  1.95799958],
       [-1.89957294,  3.08397348,  8.4374847 ],
       [-1.17454654, -1.95386231, -8.23014996],
       [ 0.14853859, -4.02327886, -6.08034275],
       [-0.40832989,  1.84233027, -9.09545422],
       [-1.25262516, -0.59847506, -3.49339338]])

In [77]:
# Display the Y array loaded from the balanced .npz file (one label per row of X).
Y

array([1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0])

In [78]:
# First split: 60% of the samples go to training, the remaining 40% is held out.
x_train, x_resto, y_train, y_resto = train_test_split(X, Y, test_size=0.4, random_state=42)

# Second split: the held-out part is halved into validation and test sets.
x_val, x_test, y_val, y_test = train_test_split(x_resto, y_resto, test_size=0.5, random_state=43)

# Sanity check: report the shape and the class balance of every partition.
particiones = (
    ('Dataset original', X, Y),
    ('Entrenamiento', x_train, y_train),
    ('Validación', x_val, y_val),
    ('Prueba', x_test, y_test),
)

print('Tamaños: ')
for nombre, x_part, y_part in particiones:
    print(f'\t{nombre}: ', x_part.shape, y_part.shape)

print('Proporciones categorías (0s/1s): ')
for nombre, _, y_part in particiones:
    # Fraction of samples labelled 0 and labelled 1 in this partition.
    frac_ceros = np.sum(y_part==0)/len(y_part)
    frac_unos = np.sum(y_part==1)/len(y_part)
    print(f'\t{nombre}: {frac_ceros}/{frac_unos}')

Tamaños: 
	Dataset original:  (20, 3) (20,)
	Entrenamiento:  (12, 3) (12,)
	Validación:  (4, 3) (4,)
	Prueba:  (4, 3) (4,)
Proporciones categorías (0s/1s): 
	Dataset original: 0.55/0.45
	Entrenamiento: 0.5/0.5
	Validación: 0.5/0.5
	Prueba: 0.75/0.25


In [79]:
# Load the unbalanced dataset, replacing the previous X and Y. The archive is
# opened once and closed by the context manager after both arrays are read
# (the original opened the same .npz twice and never closed either handle).
with np.load('particiones-datos-desbalanceados.npz') as datos_desbalanceados:
    X = datos_desbalanceados['X']
    Y = datos_desbalanceados['Y']

In [80]:
# Fraction of samples labelled 0 and labelled 1 in the unbalanced dataset.
n_muestras = len(Y)
frac_ceros = np.sum(Y==0)/n_muestras
frac_unos = np.sum(Y==1)/n_muestras

print('Proporciones categorías (0s/1s) set desbalanceado: ')
print(f'\t{frac_ceros}/{frac_unos}')

Proporciones categorías (0s/1s) set desbalanceado: 
	0.8/0.2


In [81]:
# Stratified partitioning: passing the labels as `stratify` asks
# train_test_split to keep the class proportions in both resulting halves.

# First split: 60% training / 40% held out, stratified on Y.
x_train, x_resto, y_train, y_resto = train_test_split(
    X, Y, test_size=0.4, random_state=20, stratify=Y
)

# Second split: halve the held-out part, stratified on its own labels.
x_val, x_test, y_val, y_test = train_test_split(
    x_resto, y_resto, test_size=0.5, random_state=321, stratify=y_resto
)

# Sanity check: report the shape and the class balance of every partition.
particiones = (
    ('Dataset original', X, Y),
    ('Entrenamiento', x_train, y_train),
    ('Validación', x_val, y_val),
    ('Prueba', x_test, y_test),
)

print('Tamaños: ')
for nombre, x_part, y_part in particiones:
    print(f'\t{nombre}: ', x_part.shape, y_part.shape)

print('Proporciones categorías (0s/1s): ')
for nombre, _, y_part in particiones:
    # Fraction of samples labelled 0 and labelled 1 in this partition.
    frac_ceros = np.sum(y_part==0)/len(y_part)
    frac_unos = np.sum(y_part==1)/len(y_part)
    print(f'\t{nombre}: {frac_ceros}/{frac_unos}')

Tamaños: 
	Dataset original:  (20, 3) (20,)
	Entrenamiento:  (12, 3) (12,)
	Validación:  (4, 3) (4,)
	Prueba:  (4, 3) (4,)
Proporciones categorías (0s/1s): 
	Dataset original: 0.8/0.2
	Entrenamiento: 0.8333333333333334/0.16666666666666666
	Validación: 0.75/0.25
	Prueba: 0.75/0.25


In [82]:
# Per-column minimum/maximum of each partition, before any scaling.
for nombre, particion in (('x_train', x_train), ('x_val', x_val), ('x_test', x_test)):
    minimos = particion.min(axis=0)
    maximos = particion.max(axis=0)
    print(f'{nombre}: {minimos}/{maximos}')

x_train: [-2.87649303 -4.34948407 -9.31222958]/[2.81945911 3.08397348 9.39169256]
x_val: [-2.65149833 -3.00326218 -9.09545422]/[2.19705687 1.84233027 0.40136042]
x_test: [-1.9090502  -4.53549587 -8.23014996]/[1.24843547 4.65632033 7.89654701]


In [83]:
# Min-max scaler configured to map each column onto the range [-1, 1].
# It holds no data statistics yet; those are learned when it is fitted.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1,1))

In [84]:
# fit_transform() on the training set: the scaler learns the per-column
# min/max from x_train and returns the scaled copy. Only the training set is
# used for fitting; validation and test are later passed through transform().
x_train_s = scaler.fit_transform(x_train)

In [85]:
# Compare the column-wise extremes of x_train with the statistics the scaler
# stored while fitting (data_min_ / data_max_); the two should coincide.
minimos_train = x_train.min(axis=0)
maximos_train = x_train.max(axis=0)
separador = '-'*50

print(f'Mínimos de "x_train": {minimos_train}')
print(f'Mínimos calculados por el escalador: {scaler.data_min_}')
print(separador)
print(f'Máximos de "x_train": {maximos_train}')
print(f'Máximos calculados por el escalador: {scaler.data_max_}')

Mínimos de "x_train": [-2.87649303 -4.34948407 -9.31222958]
Mínimos calculados por el escalador: [-2.87649303 -4.34948407 -9.31222958]
--------------------------------------------------
Máximos de "x_train": [2.81945911 3.08397348 9.39169256]
Máximos calculados por el escalador: [2.81945911 3.08397348 9.39169256]


In [86]:
# Per-column range of the scaled training set.
minimos_s = x_train_s.min(axis=0)
maximos_s = x_train_s.max(axis=0)
print(f'x_train_s: {minimos_s}/{maximos_s}')

x_train_s: [-1. -1. -1.]/[1. 1. 1.]


In [87]:
# Scale validation and test with the statistics already learned from the
# training set (transform only, no re-fitting).
x_val_s, x_test_s = scaler.transform(x_val), scaler.transform(x_test)

In [88]:
# Per-column range of the scaled validation and test sets. The scaler was
# fitted on the training set only, so these values are not guaranteed to stay
# inside [-1, 1].
print(f'x_val_s: {x_val_s.min(axis=0)}/{x_val_s.max(axis=0)}')
# Label fixed: it previously read 'x_test_', a truncated name for x_test_s.
print(f'x_test_s: {x_test_s.min(axis=0)}/{x_test_s.max(axis=0)}')

x_val_s: [-0.92099839 -0.63779388 -0.97682033]/[0.78145805 0.66593117 0.03866878]
x_test_: [-0.66030514 -1.05004718 -0.88429383]/[0.44837189 1.42304589 0.84012492]


In [89]:
# Import the estimator
from sklearn.ensemble import RandomForestClassifier

# Create the instance. A fixed random_state makes the forest's internal
# randomness repeatable, so the accuracies reported below are the same on
# every Restart & Run All (the data splits above are already seeded).
bosque = RandomForestClassifier(random_state=42)

In [90]:
# 2. Training
# NOTE(review): the model is fitted on the unscaled x_train; the scaled copy
# x_train_s computed earlier is not used here. The scoring and prediction
# cells that follow also use the unscaled partitions, so they are consistent.
bosque.fit(x_train, y_train)

In [91]:
# Bare attribute access: this only displays the bound `score` method object.
# The method is not called here, so no accuracy is computed by this cell.
bosque.score

<bound method ClassifierMixin.score of RandomForestClassifier()>

In [92]:
# Accuracy of the fitted forest on the training and validation partitions.
exactitud_train = bosque.score(x_train,y_train)
exactitud_val = bosque.score(x_val, y_val)

print(f'Exactitud promedio entrenamiento: {exactitud_train}')
print(f'Exactitud promedio validación: {exactitud_val}')

Exactitud promedio entrenamiento: 1.0
Exactitud promedio validación: 0.5


In [93]:
# Score of the fitted forest on the held-out test partition (displayed as the cell output).
bosque.score(x_test,y_test)

0.5

In [94]:
# Predicted class for each sample of the (unscaled) test partition.
y_pred = bosque.predict(x_test)

In [95]:
# Show the true test labels next to the forest's predictions; the first
# label is padded so both arrays line up in the output.
for etiqueta, categorias in (('Categorías reales:   ', y_test), ('Categorías predichas:', y_pred)):
    print(etiqueta, categorias)

Categorías reales:    [0 1 0 0]
Categorías predichas: [1 0 0 0]
