In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Etapa 1. Procesamiento de datos

In [41]:
df = pd.read_csv('penguins_size.csv')
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [45]:
from sklearn.impute import SimpleImputer
### Eliminar los valores nulos
imputer = SimpleImputer(strategy='most_frequent')
df.iloc[:,:] = imputer.fit_transform(df)
df.isna().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [46]:
from sklearn.preprocessing import LabelEncoder
# Convertir variables categóricas a numéricas usando Label Encoding
label_encoders = {}
for col in ['island', 'sex','species']:  # Variables categóricas
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Guardamos los encoders por si se necesitan después

In [47]:
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
3,0,2,41.1,17.0,190.0,3800.0,1
4,0,2,36.7,19.3,193.0,3450.0,0


### Transformacion de datos

In [33]:
X = df.drop('species',axis = 1)
y = df['species']

In [34]:
X.head()

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,2,39.1,18.7,181.0,3750.0,1
1,2,39.5,17.4,186.0,3800.0,0
2,2,40.3,18.0,195.0,3250.0,0
3,2,,,,,2
4,2,36.7,19.3,193.0,3450.0,0


In [35]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int32

# Árbol de Decisión

In [36]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier

# Dividir datos en entrenamiento y prueba (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y entrenar un árbol de decisión
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluar el modelo
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Mostrar precisión del modelo
print(f'accuracy: {accuracy}')
print(f'recall: {recall}')
print(f'precision: {precision}')
print(f'f1: {f1}')

accuracy: 0.9565217391304348
recall: 0.9565217391304348
precision: 0.9560093690528473
f1: 0.9558827444820149


# XGBOOST

In [37]:
from xgboost import XGBClassifier

# Crear el modelo XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)

# Entrenar el modelo con los datos de entrenamiento
xgb_model.fit(X_train, y_train)

# Hacer predicciones en el conjunto de prueba
y_pred_xgb = xgb_model.predict(X_test)

# Evaluar el modelo
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Mostrar la precisión del modelo XGBoost
accuracy_xgb

0.9420289855072463

# Guardar los modelos

In [38]:
import joblib

# Guardar el modelo de Árbol de Decisión
joblib.dump(clf, 'decision_tree_model.pkl')

# Guardar el modelo XGBoost
joblib.dump(xgb_model, 'xgboost_model.pkl')

# Guardar los encoders
joblib.dump(label_encoders, 'label_encoders.pkl')

print("Modelos y encoders guardados correctamente.")


Modelos y encoders guardados correctamente.
