In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = make_classification(n_samples=10000, 
                           n_classes=5, 
                           n_features=20,
                           n_informative=10,
                           random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=10, 
                                 learning_rate=0.3, 
                                 max_depth=6)
gbc.fit(X_train, y_train)
accuracy_score(y_test, gbc.predict(X_test))

0.7688

In [5]:
from decision_tree_boosting import GradientBoostingClassifierFromScratch

gbcfs = GradientBoostingClassifierFromScratch(n_estimators=10, 
                                              learning_rate=0.3, 
                                              max_depth=6)
gbcfs.fit(X_train, y_train)
accuracy_score(y_test, gbcfs.predict(X_test))

0.7764

## Ejercicio 3: clasificación multiclase y arboles de decisión con boosting

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
df = pd.read_csv('Data/car.csv') 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   buying         1728 non-null   object
 1   maint          1728 non-null   object
 2   doors          1728 non-null   object
 3   persons        1728 non-null   object
 4   lug_boot       1728 non-null   object
 5   safety         1728 non-null   object
 6   acceptability  1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [2]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
display(df.doors.unique())
display(df.persons.unique())
display(df.acceptability.unique())

array(['2', '3', '4', '5more'], dtype=object)

array(['2', '4', 'more'], dtype=object)

array(['unacc', 'acc', 'vgood', 'good'], dtype=object)

Vamos a codificar los atributos usando un esquema One Hot, es decir, los consideraremos como variables categóricas. También vamos a codificar el target usando el `LabelEncoder`.

In [4]:
from sklearn.preprocessing import LabelEncoder

lab_enc = LabelEncoder()
lab_enc.fit(df['acceptability'])

In [5]:
y = lab_enc.transform(df['acceptability'])
X = pd.get_dummies(df.drop('acceptability', axis=1),drop_first=True)

In [6]:
# vemos la forma final la matriz de features
print(X.shape)
X.head()

(1728, 15)


Unnamed: 0,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
0,False,False,True,False,False,True,False,False,False,False,False,False,True,True,False
1,False,False,True,False,False,True,False,False,False,False,False,False,True,False,True
2,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False
3,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False
4,False,False,True,False,False,True,False,False,False,False,False,True,False,False,True


In [7]:
# vemos la forma final del vector target
print(y.shape)
y

(1728,)


array([2, 2, 2, ..., 2, 1, 3])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

Para que los resultados sean consistentes hay que exponer los modelos exactamente al mismo esquema de validación cruzada.

In [9]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
cv = StratifiedKFold(n_splits=5, random_state=41, shuffle=True)

### 1. Arbol de decisión

In [15]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=1)
dt

In [16]:
# Funcion que toma como input un estimador y un string con el nombre que le queremos poner, y ejecuta cross_val_score
from sklearn.metrics import f1_score, make_scorer

def evaluar_rendimiento(modelo, nombre, X, y, cv):
    # Define a custom scoring function using F1 score
    custom_scorer = make_scorer(f1_score, average='weighted')
    # Use cross_val_score with the custom scoring function
    s = cross_val_score(modelo, X, y, cv=cv, scoring=custom_scorer, n_jobs=-1)
    # Print the mean and standard deviation of F1 scores
    print("Rendimiento de {}:\t{:0.3} ± {:0.3}".format(
        nombre, s.mean().round(3), s.std().round(3)))

In [17]:
evaluar_rendimiento(dt,"Árbol de decisión", X_train, y_train, cv)

Rendimiento de Árbol de decisión:	0.883 ± 0.022


### 2. Gradient Boosting Classifier

Implementación de sklearn:

In [18]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()
gb

In [19]:
evaluar_rendimiento(gb, "GradientBoostingClassifier", X_train, y_train, cv)

Rendimiento de GradientBoostingClassifier:	0.946 ± 0.017


Implementación from scratch:

In [21]:
from decision_tree_boosting import GradientBoostingClassifierFromScratch

gbcfs = GradientBoostingClassifierFromScratch(n_estimators=100, 
                                              learning_rate=0.1, 
                                              max_depth=3)
evaluar_rendimiento(gb, "GradientBoostingClassifier", X_train, y_train, cv)

Rendimiento de GradientBoostingClassifier:	0.943 ± 0.015


Optimización de hiperparámetros de Gradient Boosting

### 3. Regresión logistica

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv('Data/car.csv')

# Encode the target variable using LabelEncoder
lab_enc = LabelEncoder()
y = lab_enc.fit_transform(df['acceptability'])

# One-hot encode the categorical features
X = pd.get_dummies(df.drop('acceptability', axis=1), drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

# Create logistic regression model
lr = LogisticRegression(max_iter=1000)

# Define a custom scoring function using F1 score
custom_scorer = make_scorer(f1_score, average='weighted')

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=5, random_state=41, shuffle=True)

# Use cross_val_score with the custom scoring function
s = cross_val_score(lr, X_train, y_train, cv=cv, scoring=custom_scorer, n_jobs=-1)

# Print the mean and standard deviation of F1 scores
print("Rendimiento de la regresión logística:")
print("Mean F1 score:", s.mean())
print("Std F1 score:", s.std())


Rendimiento de la regresión logística:
Mean F1 score: 0.8806650152276134
Std F1 score: 0.0134956733848372


### 4. MLP

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Cargar el conjunto de datos
df = pd.read_csv('Data/car.csv')

# Codificar la variable objetivo usando LabelEncoder
lab_enc = LabelEncoder()
y = lab_enc.fit_transform(df['acceptability'])

# Codificar las características categóricas usando One-Hot Encoding
X = pd.get_dummies(df.drop('acceptability', axis=1), drop_first=True)

# Dividir el conjunto de datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=41)

# Crear el modelo MLP
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=1)

# Definir una función de puntuación personalizada utilizando F1 score
custom_scorer = make_scorer(f1_score, average='weighted')

# Definir la estrategia de validación cruzada
cv = StratifiedKFold(n_splits=5, random_state=41, shuffle=True)

# Utilizar cross_val_score con la función de puntuación personalizada
s = cross_val_score(mlp, X_train, y_train, cv=cv, scoring=custom_scorer, n_jobs=-1)

# Imprimir la media y la desviación estándar de los puntajes F1
print("Rendimiento del MLP:")
print("Media del puntaje F1:", s.mean())
print("Desviación estándar del puntaje F1:", s.std())


Rendimiento del MLP:
Media del puntaje F1: 0.9754979730023294
Desviación estándar del puntaje F1: 0.00969024312869907
