In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC, SVC
from sklearn import tree
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import classification_report, RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from xgboost import plot_tree
from sklearn import datasets
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_predict, GridSearchCV
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pylab import rcParams
import seaborn as sns
%matplotlib inline

In [None]:
# Global plot styling: seaborn dark grid, then matplotlib overrides applied on top of it.
sns.set_style("darkgrid")
plt.rcParams.update(
    {
        "font.size": 8,
        "figure.figsize": (4, 3),
        "figure.facecolor": "#00000000",  # RGBA hex with alpha 00
    }
)

In [None]:
# Load the wine dataset: features go into `x`, the integer target into a one-column frame `y`.
wine = datasets.load_wine()
x = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.DataFrame(wine.target, columns=["class"])
y_labels = wine.target_names

# Preview features and target side by side.
pd.concat((x, y), axis="columns").head()

In [None]:
# Column dtypes and non-null counts for features plus target.
pd.concat((x, y), axis="columns").info()

In [None]:
# Cross-validation splitter: 5 stratified, shuffled folds with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metric names understood by scikit-learn's `scoring` parameter, see
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
scoring = ["accuracy", "f1_macro", "recall_macro"]

# Hold out 20% of the rows, then show the size of each partition.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)
tuple(len(part) for part in (x_train, x_test, y_train, y_test))

In [None]:
# Create function for model
def function_model(model, x, y, scoring, cv=None, test_size=0.2, random_state=42, **kwargs):
    """Split the data, cross-validate `model` on each partition and summarise the means.

    Parameters
    ----------
    model : estimator
        Estimator to evaluate. It is fitted in place on the training partition.
    x, y : array-like
        Features and target. A single-column 2-D `y` is flattened to 1-D.
    scoring : str or list of str
        Passed to `cross_validate` as `scoring`.
    cv : cross-validation splitter, optional
        Defaults to `StratifiedKFold(n_splits=5, shuffle=True, random_state=42)`,
        so the function no longer depends on a global `kfold` being defined.
    test_size : float, default 0.2
        Passed to `train_test_split`.
    random_state : int, default 42
        Passed to `train_test_split`.
    **kwargs
        Accepted for backward compatibility; currently ignored.

    Returns
    -------
    pandas.DataFrame
        One row per `cross_validate` output key, with the per-fold mean in the
        columns "Train Mean" and "Test Mean".
    """
    if cv is None:
        cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

    # A column vector (n, 1) target makes scikit-learn emit conversion warnings; use 1-D.
    y_values = np.asarray(y)
    if y_values.ndim == 2 and y_values.shape[1] == 1:
        y = y_values.ravel()

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # Kept so the caller's `model` ends up fitted on the training partition;
    # the cross-validation calls below are not expected to reuse this fit.
    model.fit(x_train, y_train)

    train_scores = pd.DataFrame(cross_validate(model, x_train, y_train, cv=cv, scoring=scoring))
    # NOTE(review): this re-runs cross-validation inside the held-out partition; it is
    # not the score of the train-fitted model on the test data. Confirm this is intended.
    test_scores = pd.DataFrame(cross_validate(model, x_test, y_test, cv=cv, scoring=scoring))
    return pd.concat([train_scores.mean(), test_scores.mean()], keys=["Train Mean", "Test Mean"], axis=1)

In [None]:
# Logistic regression with a grid search over C and the penalty, wrapped in the outer CV.
param_grid = {"C": [0.1, 1, 10, 100], "penalty": ["l1", "l2"]}

# Author's note: fitting warns on the one-column target frame, hence squeeze().
logistic_model = LogisticRegression(solver="liblinear", max_iter=1000)
logistic_model.fit(x_train, y_train.squeeze())

train_scores = pd.DataFrame(
    cross_validate(
        GridSearchCV(logistic_model, param_grid),
        x_train,
        y_train.squeeze(),
        scoring=scoring,
        cv=kfold,
    )
)
test_scores = pd.DataFrame(
    cross_validate(
        GridSearchCV(logistic_model, param_grid),
        x_test,
        y_test.squeeze(),
        scoring=scoring,
        cv=kfold,
    )
)

# Mean of every cross_validate output, train vs. test partition.
pd.concat({"Train Mean": train_scores.mean(), "Test Mean": test_scores.mean()}, axis=1).round(3)

In [None]:
# Plain logistic regression (newton-cg solver), cross-validated on each partition.
# Author's note: fitting warns on the one-column target frame, hence squeeze().
logistic_model = LogisticRegression(solver="newton-cg", max_iter=1000)
logistic_model.fit(x_train, y_train.squeeze())

train_scores = pd.DataFrame(
    cross_validate(logistic_model, x_train, y_train.squeeze(), scoring=scoring, cv=kfold)
)
test_scores = pd.DataFrame(
    cross_validate(logistic_model, x_test, y_test.squeeze(), scoring=scoring, cv=kfold)
)
pd.concat({"Train Mean": train_scores.mean(), "Test Mean": test_scores.mean()}, axis=1).round(3)

In [None]:
# Example of a one-vs-rest ROC curve.
# This cell rebinds x, y, kfold and the train/test partitions: from here on the
# features are NumPy arrays and the target holds the string class names.
wine = datasets.load_wine()
target_names = wine.target_names
x, y = wine.data, wine.target
y = wine.target_names[y]  # integer targets -> class-name strings

# k folds
kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# split data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, stratify=y, random_state=0)

# Fit the model and collect out-of-fold class probabilities for the TRAINING rows
logistic_model = LogisticRegression(max_iter=1000, solver="newton-cg").fit(x_train, y_train)
train_scores = cross_val_predict(logistic_model, x_train, y_train, cv=kfold, method='predict_proba')

# One-hot encode the labels, shape (n_samples, n_classes)
label_binarizer = LabelBinarizer().fit(y_train)
# The scores above belong to the training rows, so the ROC must be drawn against
# the training labels. The test labels only had the same length by coincidence.
y_onehot_train = label_binarizer.transform(y_train)
y_onehot_test = label_binarizer.transform(y_test)  # kept for later inspection

class_of_interest = "class_2"
class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]

# Plot ROC (named roc_display so IPython's display() is not shadowed)
roc_display = RocCurveDisplay.from_predictions(y_onehot_train[:, class_id],
                                               train_scores[:, class_id],
                                               name=f"{class_of_interest} vs the rest",
                                               color="darkorange",
                                               plot_chance_level=True)

# "\n" replaces the invalid "\c" escape of the original title.
_ = roc_display.ax_.set(xlabel="False Positive Rate",
                        ylabel="True Positive Rate",
                        title="One-vs-Rest ROC curves:\nclass_2 vs (class_0 & class_1)")

In [None]:
# Logistic regression evaluated through the shared helper.
function_model(
    LogisticRegression(solver="newton-cg", max_iter=1000),
    x,
    y,
    scoring,
)

In [None]:
# Support vector classifier evaluated through the shared helper.
function_model(
    model=SVC(max_iter=1000, C=2),
    x=x,
    y=y,
    scoring=scoring,
)

In [None]:
# Gaussian Naive Bayes evaluated through the shared helper.
function_model(
    model=GaussianNB(),
    x=x,
    y=y,
    scoring=scoring,
)

In [None]:
# Gaussian Naive Bayes: cross-validation on the current train and test partitions.
gnb_model = GaussianNB()
gnb_model.fit(x_train, y_train)

train_scores = pd.DataFrame(cross_validate(gnb_model, x_train, y_train, scoring=scoring, cv=kfold))
test_scores = pd.DataFrame(cross_validate(gnb_model, x_test, y_test, scoring=scoring, cv=kfold))
pd.concat({"Train Mean": train_scores.mean(), "Test Mean": test_scores.mean()}, axis=1)

In [None]:
# Quadratic discriminant analysis: cross-validation on the current train and test partitions.
qda_model = QuadraticDiscriminantAnalysis()
qda_model.fit(x_train, y_train)

train_scores = pd.DataFrame(cross_validate(qda_model, x_train, y_train, scoring=scoring, cv=kfold))
test_scores = pd.DataFrame(cross_validate(qda_model, x_test, y_test, scoring=scoring, cv=kfold))
pd.concat({"Train Mean": train_scores.mean(), "Test Mean": test_scores.mean()}, axis=1)

In [None]:
# Decision tree (depth 3, seeded): cross-validation on the current train and test partitions.
tree_model = tree.DecisionTreeClassifier(max_depth=3, random_state=42).fit(x_train, y_train)
train_scores = pd.DataFrame(cross_validate(tree_model, x_train, y_train, cv=kfold, scoring=scoring))
test_scores = pd.DataFrame(cross_validate(tree_model, x_test, y_test, cv=kfold, scoring=scoring))
# Show the summary like the sibling model cells do (this line was commented out,
# so the cell computed the scores and displayed nothing).
pd.concat([train_scores.mean(), test_scores.mean()], keys=["Train Mean", "Test Mean"], axis=1)

In [None]:
# Random forest: cross-validation on the current train and test partitions.
# random_state is fixed so repeated runs give the same scores, matching the
# seeded splitter and decision-tree cell.
rfc_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)
train_scores = pd.DataFrame(cross_validate(rfc_model, x_train, y_train, cv=kfold, scoring=scoring))
test_scores = pd.DataFrame(cross_validate(rfc_model, x_test, y_test, cv=kfold, scoring=scoring))
pd.concat([train_scores.mean(), test_scores.mean()], keys=["Train Mean", "Test Mean"], axis=1)

In [None]:
# Gradient boosting: fit on the train partition, report per-class metrics on the test partition.
# random_state is fixed so the report is reproducible across runs.
gbc_model = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)
test_pred = gbc_model.predict(x_test)
print(classification_report(y_test, test_pred))

In [None]:
# Bagging classifier: fit on the train partition, report per-class metrics on the test partition.
# random_state is fixed so the report is reproducible across runs.
bg_model = BaggingClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)
test_pred = bg_model.predict(x_test)
print(classification_report(y_test, test_pred))

In [None]:
# Linear support vector classifier with an L1 penalty (dual=False).
lsvc_model = LinearSVC(penalty="l1", C=0.01, dual=False)
lsvc_model.fit(x_train, y_train)

# Per-class metrics on the test partition.
test_pred = lsvc_model.predict(x_test)
print(classification_report(y_test, test_pred))

## Pipeline

In [None]:
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load the dataset
iris = load_iris()
X, y = iris.data, iris.target

# Append a fake categorical feature.
# - A seeded generator makes the column (and therefore the score) reproducible.
# - Both parts are cast to object before stacking; stacking a float array with a
#   string array would otherwise turn every numeric column into strings.
fake_category = np.random.default_rng(42).choice(['A', 'B', 'C'], size=(X.shape[0], 1))
X = np.hstack((X.astype(object), fake_category.astype(object)))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Transformations for the numeric features
numeric_features = [0, 1, 2, 3]  # column indices of the numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Transformations for the categorical feature
categorical_features = [4]  # column index of the categorical feature
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine both transformations with a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Pipeline that chains the preprocessing and the model (seeded for reproducibility)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out half
accuracy = pipeline.score(X_test, y_test)
print("Accuracy:", accuracy)


## Missing Values

In [None]:
from sklearn.datasets import load_iris
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,KNNImputer, IterativeImputer
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

iris = load_iris()

# Work on a copy: without it X is the same array object as iris.data, so the NaN
# assignment below would also overwrite the reference values in iris.data.
X, y = iris.data.copy(), iris.target

# Blank out five randomly chosen rows (seeded; drawn with replacement)
np.random.seed(42)
values_random = np.random.choice(len(X), 5)
X[values_random] = np.nan
print("Values miss id: ", values_random)
print("Number of Values nan: ", np.isnan(X).sum())

In [None]:
# Rows that were blanked out; indexed by `values_random` rather than a hard-coded copy of it.
X[values_random]

In [None]:
# Mean imputation. `missing_values` names the placeholder to replace;
# `fill_value` is only used with strategy="constant" and was ignored here.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X_sin_nan = imputer.fit_transform(X)
print("X 2 number values nan: ", np.isnan(X_sin_nan).sum())

In [None]:
# The previously blank rows after mean imputation.
X_sin_nan[values_random]

## Missing Values with KNNImputer and IterativeImputer

In [None]:
# KNN imputation. Fit and transform in one pass; previously the fit_transform
# result was discarded and the same data transformed a second time.
imputer = KNNImputer(n_neighbors=3, weights="uniform")
X_knn = imputer.fit_transform(X)
X_knn[values_random]

In [None]:
# Iterative imputation (author's note: works better than KNN here).
# Fit and transform in one pass; previously the fit_transform result was
# discarded and the same data transformed a second time.
imputer = IterativeImputer(random_state=42, max_iter=10)
X_iterative = imputer.fit_transform(X)
X_iterative[values_random]

In [None]:
# Compare the imputed matrix with a freshly loaded reference, so the ground truth is
# NaN-free even if an earlier cell modified `iris.data` in place.
mean_squared_error(load_iris().data, X_iterative)