### Se trata de un modelo de clasificación, ya que trata de predecir valores 0--> blanco ó 1--> tinto

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.model_selection import train_test_split

In [15]:
df = pd.read_csv('wine_limpio.csv')

In [17]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red,white
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8,6,0,1
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5,6,0,1
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1,6,0,1
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6,0,1
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5,5,1,0
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.531215,11.2,6,1,0
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0,6,1,0
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2,5,1,0


In [18]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'red', 'white'],
      dtype='object')

In [19]:
X = df.drop(columns=['red', 'white'], axis=1)
y = df['red']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('AdaBoost', AdaBoostClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('CatBoost', CatBoostClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier())
]


In [24]:
for model_name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    print(f"Modelo: {model_name}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print("------------------------")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Modelo: Logistic Regression
Accuracy: 0.9715384615384616
Precision: 0.9570957095709571
Recall: 0.9235668789808917
------------------------
Modelo: Support Vector Machine
Accuracy: 0.926923076923077
Precision: 0.9294117647058824
Recall: 0.7547770700636943
------------------------
Modelo: Random Forest
Accuracy: 0.9961538461538462
Precision: 1.0
Recall: 0.9840764331210191
------------------------
Modelo: K-Nearest Neighbors
Accuracy: 0.9361538461538461
Precision: 0.9052631578947369
Recall: 0.821656050955414
------------------------
Modelo: AdaBoost
Accuracy: 0.9915384615384616
Precision: 0.9809523809523809
Recall: 0.9840764331210191
------------------------
Modelo: Decision Tree
Accuracy: 0.9884615384615385
Precision: 0.9869706840390879
Recall: 0.964968152866242
------------------------
Learning rate set to 0.020824
0:	learn: 0.6431602	total: 152ms	remaining: 2m 31s
1:	learn: 0.6026469	total: 157ms	remaining: 1m 18s
2:	learn: 0.5594750	total: 162ms	remaining: 53.9s
3:	learn: 0.5175542	to

#### De acuerdo con las métricas, el RandomForestClassifier resulta el modelo mas interesante a explorar. Por lo que mediante el método GridSearch() busco los mejores hiperparámetros.

In [25]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier()
gs = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

gs.fit(X_train, y_train)

In [26]:
best_params = gs.best_params_
print("Mejores hiperparámetros:", best_params)

Mejores hiperparámetros: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}


Guardo estos hiper parametros para el EntranamientoML

In [27]:
best_model = gs.best_estimator_
best_model

In [28]:
y_pred = best_model.predict(X_test)
y_pred

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [29]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

In [30]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9969230769230769
Precision: 1.0
Recall: 0.9872611464968153


Estas metricas tan buenas se deben a que el dataframe esta muy bien preparado y muy limpio