# Machine Learning - Homework 5.2


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.base import is_regressor
from sklearn.compose import make_column_transformer
from sklearn.ensemble import (
    BaggingClassifier,
    AdaBoostClassifier,
    RandomForestClassifier,
    BaggingRegressor,
    AdaBoostRegressor,
    RandomForestRegressor
)
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [None]:
# Estimators whose class name is listed here are evaluated by
# `train_test_and_visualize_model` with regression metrics (mean squared error
# and coefficient of determination).
REGRESSION_MODEL_NAMES = [
    'BaggingRegressor', 'AdaBoostRegressor', 'RandomForestRegressor',
]

def read_csv_and_analyze_dataset(path_to_csv):
  """Load a CSV file, print a short overview of it and return it.

  The overview consists of the DataFrame itself followed by, for every
  non-numeric column, each distinct value together with the number of rows
  holding it. Missing values are counted as well.

  Args:
    path_to_csv: Path (or anything else accepted by `pd.read_csv`) of the CSV
      file to load.

  Returns:
    The loaded `pd.DataFrame`, unmodified.
  """
  dataframe = pd.read_csv(path_to_csv)

  print("The dataset:")
  print(dataframe, end='\n\n')

  print("All the unique values for all columns that contain discrete values:")

  for column_name in dataframe:
    column = dataframe[column_name]

    if pd.api.types.is_numeric_dtype(column):
      continue

    print(f"\n{column_name}:")

    for unique_value in column.unique():
      # NaN never compares equal to itself, so an equality test would always
      # report 0 occurrences for missing values; count those with isna().
      if pd.isna(unique_value):
        occurrences = int(column.isna().sum())
      else:
        occurrences = int((column == unique_value).sum())

      print(f"{unique_value} ({occurrences})")

  print()

  return dataframe

def transform_binary_and_nominal_columns(dataframe, column_names):
  """One-hot encode the given columns and pass every other column through.

  Binary columns are encoded into a single indicator column
  (`drop='if_binary'`); columns with more categories get one indicator column
  per category.

  Args:
    dataframe: The `pd.DataFrame` to transform. It is not modified.
    column_names: Names of the binary/nominal columns to one-hot encode.

  Returns:
    A new `pd.DataFrame` with the same row index as `dataframe`. The column
    names are the ones reported by the transformer's
    `get_feature_names_out()`, so passthrough columns carry a `remainder__`
    prefix (e.g. `remainder__price`).
  """
  transformer = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), column_names),
    remainder='passthrough',
    # Always return a dense array: with the default threshold a mostly-zero
    # result comes back as a SciPy sparse matrix, which cannot be wrapped in a
    # regular DataFrame below.
    sparse_threshold=0
  )

  transformed = transformer.fit_transform(dataframe)

  return pd.DataFrame(
      transformed,
      columns=transformer.get_feature_names_out(),
      # Keep the caller's row labels so the result still aligns with any
      # Series/DataFrame derived from the original frame.
      index=dataframe.index
  )

def transform_ordinal_columns(dataframe, column_names):
  """Replace the given columns with their ordinal (numeric) encoding.

  Each listed column is encoded with a freshly fitted `OrdinalEncoder`.

  Args:
    dataframe: The `pd.DataFrame` holding the columns. It is left untouched;
      the encoding is applied to a copy.
    column_names: List of names of the columns to encode.

  Returns:
    A copy of `dataframe` in which the listed columns hold the encoded values.
  """
  # Work on a copy so the caller's frame is not silently modified.
  encoded_dataframe = dataframe.copy()

  encoded_dataframe[column_names] = OrdinalEncoder().fit_transform(
      encoded_dataframe[column_names]
  )

  return encoded_dataframe

def train_test_and_visualize_model(X, y, model, test_size=0.2,
                                   random_state=None):
  """Fit `model` on a random train split and print its test-set metrics.

  Regressors are reported with the mean squared error and the coefficient of
  determination; every other estimator is reported with its accuracy
  (`model.score`) and its confusion matrix.

  Args:
    X: Feature matrix.
    y: Target values, aligned with `X`.
    model: Unfitted estimator exposing `fit`, `predict` and (for classifiers)
      `score`. It is fitted in place.
    test_size: Share of the samples held out for testing (default 0.2).
    random_state: Seed forwarded to `train_test_split`. Pass an integer to get
      the same split, and therefore comparable metrics, on every run; the
      default `None` draws a different split each time.

  Returns:
    None. The metrics are printed.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X,
      y,
      test_size=test_size,
      shuffle=True,
      random_state=random_state
  )

  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  # The explicit name list is checked first so the estimators it names behave
  # exactly as before; is_regressor() additionally recognises any other
  # regressor (which would otherwise be scored as a classifier).
  is_regression_model = (
      type(model).__name__ in REGRESSION_MODEL_NAMES
      or is_regressor(model)
  )

  if is_regression_model:
    print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
  else:
    print("Accuracy: %.2f" % model.score(X_test, y_test))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

## Apply bagging, boosting and random forests in R or Python for the Pima Indians Diabetes data set

In [None]:
# Load the Pima Indians Diabetes data and print an overview of it.
diabetes_df = read_csv_and_analyze_dataset(path_to_csv='diabetes.csv')

The dataset:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1     

In [None]:
# Split off the target: keep 'Outcome' as y and remove it from the feature
# frame, so that `diabetes_df` holds only the predictors from here on.
diabetes_y = diabetes_df['Outcome']
del diabetes_df['Outcome']

In [None]:
# Bagging with scikit-learn's default settings.
train_test_and_visualize_model(
    X=diabetes_df,
    y=diabetes_y,
    model=BaggingClassifier()
)

Accuracy: 0.81
Confusion matrix:
[[95 10]
 [19 30]]


In [None]:
# Boosting (AdaBoost) with scikit-learn's default settings.
train_test_and_visualize_model(
    X=diabetes_df,
    y=diabetes_y,
    model=AdaBoostClassifier()
)

Accuracy: 0.78
Confusion matrix:
[[81 14]
 [20 39]]


In [None]:
# Random forest with scikit-learn's default settings.
train_test_and_visualize_model(
    X=diabetes_df,
    y=diabetes_y,
    model=RandomForestClassifier()
)

Accuracy: 0.69
Confusion matrix:
[[76 17]
 [31 30]]


## Apply bagging and boosting in R or Python for regression on the Boston data set

In [None]:
# Load the Boston housing data and print an overview of it.
boston_df = read_csv_and_analyze_dataset(path_to_csv='Boston.csv')

The dataset:
     Unnamed: 0     crim    zn  indus  chas    nox     rm   age     dis  rad  \
0             1  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1   
1             2  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2   
2             3  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2   
3             4  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3   
4             5  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3   
..          ...      ...   ...    ...   ...    ...    ...   ...     ...  ...   
501         502  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1   
502         503  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1   
503         504  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1   
504         505  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1   
505         506  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1   

     tax  ptratio   black 

In [None]:
# 'Unnamed: 0' is the row number that was saved into Boston.csv (1, 2, 3, ...),
# not a property of the houses, so it must not be used as a feature.
# errors='ignore' keeps this working for a CSV that lacks the column.
boston_df = boston_df.drop(columns=['Unnamed: 0'], errors='ignore')

# Split off the target: 'medv' becomes y, the remaining columns are the
# predictors.
boston_y = boston_df.pop('medv')

In [None]:
# Bagging with scikit-learn's default settings.
train_test_and_visualize_model(
    X=boston_df,
    y=boston_y,
    model=BaggingRegressor()
)

Mean squared error: 10.73
Coefficient of determination: 0.85


In [None]:
# Boosting (AdaBoost) with scikit-learn's default settings.
train_test_and_visualize_model(
    X=boston_df,
    y=boston_y,
    model=AdaBoostRegressor()
)

Mean squared error: 12.81
Coefficient of determination: 0.75


## Apply random forests in R or Python for the car price data set regression task

In [None]:
# Load the car price data and print an overview of it.
car_df = read_csv_and_analyze_dataset(path_to_csv='CarPrice.csv')

The dataset:
     car_ID  symboling                   CarName fueltype aspiration  \
0         1          3        alfa-romero giulia      gas        std   
1         2          3       alfa-romero stelvio      gas        std   
2         3          1  alfa-romero Quadrifoglio      gas        std   
3         4          2               audi 100 ls      gas        std   
4         5          2                audi 100ls      gas        std   
..      ...        ...                       ...      ...        ...   
200     201         -1           volvo 145e (sw)      gas        std   
201     202         -1               volvo 144ea      gas      turbo   
202     203         -1               volvo 244dl      gas        std   
203     204         -1                 volvo 246   diesel      turbo   
204     205         -1               volvo 264gl      gas      turbo   

    doornumber      carbody drivewheel enginelocation  wheelbase  ...  \
0          two  convertible        rwd          f

In [None]:
# Drop the columns that must not be used as predictors: 'car_ID' is only the
# row identifier (1, 2, 3, ...) and 'CarName' is the free-text model name.
car_df = car_df.drop(columns=['car_ID', 'CarName'])

# Categorical columns without a meaningful order; they are one-hot encoded.
binary_and_nominal_column_names = ['fueltype', 'aspiration', 'doornumber',
                                   'carbody', 'drivewheel', 'enginelocation',
                                   'enginetype', 'cylindernumber', 'fuelsystem']

car_df = transform_binary_and_nominal_columns(
    car_df,
    binary_and_nominal_column_names
)

# Columns that were passed through unchanged come back with a 'remainder__'
# prefix, hence the name of the target column here.
car_y = car_df.pop('remainder__price')

In [None]:
# Random forest with scikit-learn's default settings.
train_test_and_visualize_model(
    X=car_df,
    y=car_y,
    model=RandomForestRegressor()
)

Mean squared error: 2459397.42
Coefficient of determination: 0.94


## Research XGBoost (gradient boosting), explain its concept of weak learners training on residuals and apply it on the Iris data set in either R or Python

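XGBoost (eXtreme Gradient Boosting) is a powerful algorithm that employs an ensemble of shallow decision trees as weak learners. What sets it apart from bagging is its sequential training process: in each step, it builds a new tree that is fitted not to the original targets but to the errors (residuals) of the current ensemble — more generally, to the negative gradient of the loss function with respect to the current predictions, which for squared-error loss is exactly the residual. The new tree's predictions, scaled by a learning rate, are then added to the ensemble. This sequential learning, which amounts to gradient descent in function space, together with XGBoost's regularization of tree complexity, makes it highly effective at gradually improving its predictive performance.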

In [None]:
# Load the Iris data and print an overview of it.
iris_df = read_csv_and_analyze_dataset(path_to_csv='iris.csv')

The dataset:
     sepal.length  sepal.width  petal.length  petal.width    variety
0             5.1          3.5           1.4          0.2     Setosa
1             4.9          3.0           1.4          0.2     Setosa
2             4.7          3.2           1.3          0.2     Setosa
3             4.6          3.1           1.5          0.2     Setosa
4             5.0          3.6           1.4          0.2     Setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  Virginica
146           6.3          2.5           5.0          1.9  Virginica
147           6.5          3.0           5.2          2.0  Virginica
148           6.2          3.4           5.4          2.3  Virginica
149           5.9          3.0           5.1          1.8  Virginica

[150 rows x 5 columns]

All the unique values for all columns that contain discrete values:

variety:
Setosa (50)
Versicolor (50)
Virginica (50)



In [None]:
# 'variety' is a nominal class label rather than an ordinal feature. The
# 'ordinal' transformation is applied to it anyway, purely to turn the class
# names into numbers so that the column can serve as 'y' in a multiclass
# classification problem.
iris_df = transform_ordinal_columns(
    dataframe=iris_df,
    column_names=['variety']
)

# Split off the (now numeric) target; the remaining columns are the features.
iris_y = iris_df.pop('variety')

In [None]:
# One class per distinct encoded label in the target.
iris_class_count = len(set(iris_y))

# Multiclass gradient-boosted trees that predict the class label directly.
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=iris_class_count
)

train_test_and_visualize_model(X=iris_df, y=iris_y, model=xgb_model)

Accuracy: 1.00
Confusion matrix:
[[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]
