## Scikit-learn

Choose a dataset of your choice from https://archive.ics.uci.edu/datasets so that it contains both ordinal and numerical features. After cleaning up and preprocessing the dataset apply:
  * an AutoML pipeline as in the example shown in the lecture
  * an ensemble learning pipeline as in the example of the lecture

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier  # Third Model
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import VotingClassifier

from sklearn.pipeline import Pipeline

In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544) 
  
# data (as pandas dataframes) 
X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features 
y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets 
  
# metadata 
print(estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.metadata) 
  
# variable information 
print(estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.variables) 


{'uci_id': 544, 'name': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition ', 'repository_url': 'https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition', 'data_url': 'https://archive.ics.uci.edu/static/public/544/data.csv', 'abstract': 'This dataset include data for the estimation of obesity levels in individuals from the countries of Mexico, Peru and Colombia, based on their eating habits and physical condition. ', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 2111, 'num_features': 16, 'feature_types': ['Integer'], 'demographics': ['Gender', 'Age'], 'target_col': ['NObeyesdad'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2019, 'last_updated': 'Tue Sep 10 2024', 'dataset_doi': '10.24432/C5H31Z', 'creators': [], 'intro_paper': {'ID': 358, 'type': 

In [None]:
# Detect categorical and numerical columns
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(exclude=["object"]).columns

# Encode categorical features
oe = OrdinalEncoder()
X[categorical_columns] = oe.fit_transform(X[categorical_columns])

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=69)

# Scale numerical features
scaler = StandardScaler()
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])




# Logistic Regression - Hyperparameter Optimization
log_reg_param_grid = {
    'C': [0.1, 1, 10],  # Regularization strength
    'solver': ['lbfgs', 'liblinear'],  # Solvers
    'multi_class': ['ovr', 'multinomial']
}
log_reg = GridSearchCV(LogisticRegression(random_state=31), log_reg_param_grid, cv=5)
log_reg.fit(X_train, y_train)
best_log_reg = log_reg.best_estimator_
print("Best Logistic Regression Params:", log_reg.best_params_)




# K-Nearest Neighbor - Hyperparameter Optimization
knn_param_grid = {"n_neighbors": range(2, 11), "p": [1, 2]}  # L1 and L2 distances
knn = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=5)
knn.fit(X_train, y_train)
best_knn = knn.best_estimator_
print("Best KNN Params:", knn.best_params_)


# Gradient Boosting - Hyperparameter Optimization
gb_param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7]
}
gb = GridSearchCV(GradientBoostingClassifier(random_state=13), gb_param_grid, cv=5)
gb.fit(X_train, y_train)
best_gb = gb.best_estimator_
print("Best Gradient Boosting Params:", gb.best_params_)



# Evaluation for All Models
models = {
    'Logistic Regression': best_log_reg,
    'KNN': best_knn,
    'Gradient Boosting': best_gb
}

for name, model in models.items():
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n{name} - Accuracy: {acc}")
    print(f"{name} - Confusion Matrix:\n{cm}")

# Ensemble Learning: Voting Classifier
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(estimators=[
    ('log_reg', best_log_reg),
    ('knn', best_knn),
    ('gb', best_gb)
], voting='hard')
ensemble.fit(X_train, y_train)

# Evaluate Ensemble
y_pred_ensemble = ensemble.predict(X_test)
ensemble_acc = accuracy_score(y_test, y_pred_ensemble)
ensemble_cm = confusion_matrix(y_test, y_pred_ensemble)

print(f"\nEnsemble Model - Accuracy: {ensemble_acc}")
print(f"Ensemble Model - Confusion Matrix:\n{ensemble_cm}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[categorical_columns] = oe.fit_transform(X[categorical_columns])
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear

Best Logistic Regression Params: {'C': 10, 'multi_class': 'multinomial', 'solver': 'lbfgs'}


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Best KNN Params: {'n_neighbors': 2, 'p': 1}


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [None]:
X_train

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
804,1.0,0.363037,1.058557,0.005567,0.0,1.0,0.441719,-0.276283,2.0,0.0,-0.022341,0.0,1.181924,-0.897889,3.0,0.0
1147,0.0,-0.119117,-0.030468,-0.244041,1.0,1.0,-0.775388,0.388469,2.0,0.0,0.610739,0.0,-0.089415,1.158364,2.0,3.0
1351,0.0,2.282017,-1.247357,-0.260036,1.0,1.0,-0.737173,-1.900765,2.0,0.0,-0.134556,0.0,1.144006,-1.057149,2.0,0.0
829,0.0,-0.493663,0.438454,-0.302633,1.0,1.0,0.176333,-2.209175,2.0,0.0,-0.022341,0.0,1.323624,-1.057149,2.0,3.0
637,1.0,-0.986427,1.525210,-1.020945,1.0,1.0,1.088602,1.687290,2.0,0.0,-0.022341,0.0,1.181924,-1.057149,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,1.0,1.478765,-0.137690,-0.107854,1.0,0.0,-0.775388,0.388469,2.0,0.0,1.613930,0.0,1.181924,-1.057149,3.0,0.0
1780,1.0,0.342364,-0.134887,0.621892,1.0,1.0,0.435098,-2.053184,2.0,0.0,-1.407795,0.0,-0.929193,-0.036763,3.0,3.0
866,0.0,-0.986427,-2.186249,-1.196847,0.0,1.0,-0.775388,0.388469,2.0,0.0,-1.209099,1.0,-1.032205,0.555943,2.0,3.0
74,1.0,-0.061980,-0.030682,-0.069809,1.0,1.0,-0.775388,0.388469,1.0,0.0,1.613930,0.0,-1.186004,0.591262,1.0,3.0
