# Machine Learning - Homework 5.1


In [None]:
!pip install dtreeviz

Collecting dtreeviz
  Downloading dtreeviz-2.2.2-py3-none-any.whl (91 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/91.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m61.4/91.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.8/91.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dtreeviz
Successfully installed dtreeviz-2.2.2


In [None]:
import logging

import dtreeviz
import pandas as pd
from sklearn.base import is_regressor
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [None]:
# Only let ERROR-and-above records from matplotlib's font_manager logger
# through (presumably to hide font-lookup warnings while the trees are
# rendered -- confirm).
logging.getLogger(name="matplotlib.font_manager").setLevel(level=logging.ERROR)

In [None]:
def read_csv_and_analyze_dataset(path_to_csv):
  """Load a CSV file, print an overview of it and return it as a DataFrame.

  The overview consists of the frame itself followed by, for every
  non-numeric column, each distinct value together with the number of rows
  holding it (in order of first appearance). Missing values are listed and
  counted as well.

  Args:
    path_to_csv: Path (or any other source accepted by ``pd.read_csv``) of
      the CSV file to load.

  Returns:
    The loaded ``pandas.DataFrame``, unmodified.
  """
  dataframe = pd.read_csv(path_to_csv)

  print("The dataset:")
  print(dataframe, end='\n\n')

  print("All the unique values for all columns that contain discrete values:")

  for column_name in dataframe:
    column = dataframe[column_name]

    if pd.api.types.is_numeric_dtype(column):
      continue

    print(f"\n{column_name}:")

    for unique_value in column.unique():
      # NaN never compares equal to itself, so an equality mask would always
      # report zero rows for a missing value; count those with isna().
      if pd.isna(unique_value):
        occurrences = column.isna().sum()
      else:
        occurrences = (column == unique_value).sum()

      print(f"{unique_value} ({occurrences})")

  print()

  return dataframe

def transform_binary_and_nominal_columns(dataframe, column_names):
  """One-hot encode the given columns and pass all other columns through.

  Args:
    dataframe: Input ``pandas.DataFrame``.
    column_names: Names of the binary / nominal columns to encode. A column
      with exactly two categories becomes a single 0/1 column
      (``drop='if_binary'``); any other column gets one column per category.

  Returns:
    A new DataFrame labelled with the transformer's output feature names
    (passed-through columns carry a ``remainder__`` prefix) and indexed like
    ``dataframe``.
  """
  transformer = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), column_names),
    remainder='passthrough',
    # OneHotEncoder produces sparse output by default, and the column
    # transformer keeps the stacked result sparse whenever its density drops
    # below sparse_threshold (0.3 by default). The DataFrame constructor
    # below expects a dense 2-D array, so force dense output.
    sparse_threshold=0
  )

  transformed = transformer.fit_transform(dataframe)

  return pd.DataFrame(
      transformed,
      columns=transformer.get_feature_names_out(),
      # Keep the caller's row labels instead of silently resetting them.
      index=dataframe.index
  )

def train_test_and_visualize_model(X, y, model, name, random_state=None):
  """Fit ``model`` on an 80/20 split, print test metrics and save a tree plot.

  Regressors are scored with mean squared error and R^2 on the held-out
  20 %; any other estimator is treated as a classifier and scored with
  accuracy and a confusion matrix. If the fitted model exposes ``tree_`` and
  ``feature_importances_`` it is rendered with dtreeviz and written to
  ``"<name>.svg"``.

  Args:
    X: Feature matrix; when it has a ``columns`` attribute those names are
      used as feature names in the plot.
    y: Target values aligned with ``X``.
    model: Unfitted scikit-learn style estimator; it is fitted in place.
    name: Base name (without extension) of the SVG file to write.
    random_state: Seed forwarded to ``train_test_split``. The default
      ``None`` keeps the previous behaviour of an unseeded shuffle.

  Returns:
    None.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X,
      y,
      test_size=0.2,
      shuffle=True,
      random_state=random_state
  )

  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  # Ask scikit-learn for the estimator kind instead of comparing the class
  # name, so subclasses and other regressors take the regression branch too.
  if is_regressor(model):
    print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
  else:
    print("Accuracy: %.2f" % model.score(X_test, y_test))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

  if hasattr(model, 'tree_') and hasattr(model, 'feature_importances_'):
    # Plain lists rather than a pandas Index / numpy array: dtreeviz
    # documents both arguments as lists (class_names may also be a mapping).
    feature_names = list(X.columns) if hasattr(X, 'columns') else None
    class_names = (
        [str(label) for label in model.classes_]
        if hasattr(model, 'classes_') else None
    )

    # Visualize with the data the tree was actually fitted on, so the
    # per-node statistics in the plot describe the fitted model rather than
    # a mix of training and held-out rows.
    viz = dtreeviz.model(model,
                         X_train,
                         y_train,
                         target_name='label/target',
                         feature_names=feature_names,
                         class_names=class_names)

    v = viz.view()

    v.save(f"{name}.svg")

## Implement in R or Python a decision tree model for the Pima Indians Diabetes dataset

In [None]:
# Load the diabetes data and print an overview of it.
diabetes_df = read_csv_and_analyze_dataset(path_to_csv='diabetes.csv')

The dataset:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1     

In [None]:
# Split the target off the features. `pop` removes 'Outcome' from diabetes_df
# in place, so diabetes_df holds only the feature columns afterwards, and
# re-running this cell without reloading the data raises a KeyError.
diabetes_y = diabetes_df.pop('Outcome')

In [None]:
# Fit and evaluate a decision tree classifier with default settings.
train_test_and_visualize_model(
    X=diabetes_df,
    y=diabetes_y,
    model=DecisionTreeClassifier(),
    name='diabetes',
)

Accuracy: 0.68
Confusion matrix:
[[75 26]
 [24 29]]




## Build a decision tree for the tips data set regression task in either R or Python

In [None]:
# Load the tips data and print an overview of it.
tips_df = read_csv_and_analyze_dataset(path_to_csv='tips.csv')

The dataset:
     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]

All the unique values for all columns that contain discrete values:

sex:
Female (87)
Male (157)

smoker:
No (151)
Yes (93)

day:
Sun (76)
Sat (87)
Thur (62)
Fri (19)

time:
Dinner (176)
Lunch (68)



In [None]:
# Encode the weekday as a number (Thur=4 ... Sun=7).
tips_df = tips_df.replace({
    'day': {'Thur': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7},
})

# The remaining text columns each take two values; one-hot encode them.
binary_column_names = ['sex', 'smoker', 'time']

tips_df = transform_binary_and_nominal_columns(
    dataframe=tips_df,
    column_names=binary_column_names,
)

# Passed-through columns come back prefixed with 'remainder__'. `pop` removes
# the target from tips_df in place, leaving only the features.
y = tips_df.pop('remainder__tip')

In [None]:
# Fit and evaluate a decision tree regressor with default settings.
train_test_and_visualize_model(
    X=tips_df,
    y=y,
    model=DecisionTreeRegressor(),
    name='tips',
)

Mean squared error: 1.31
Coefficient of determination: 0.37




## Build a decision tree for a car price data set regression task in either R or Python

In [None]:
# Load the car price data and print an overview of it.
car_df = read_csv_and_analyze_dataset(path_to_csv='CarPrice.csv')

The dataset:
     car_ID  symboling                   CarName fueltype aspiration  \
0         1          3        alfa-romero giulia      gas        std   
1         2          3       alfa-romero stelvio      gas        std   
2         3          1  alfa-romero Quadrifoglio      gas        std   
3         4          2               audi 100 ls      gas        std   
4         5          2                audi 100ls      gas        std   
..      ...        ...                       ...      ...        ...   
200     201         -1           volvo 145e (sw)      gas        std   
201     202         -1               volvo 144ea      gas      turbo   
202     203         -1               volvo 244dl      gas        std   
203     204         -1                 volvo 246   diesel      turbo   
204     205         -1               volvo 264gl      gas      turbo   

    doornumber      carbody drivewheel enginelocation  wheelbase  ...  \
0          two  convertible        rwd          f

In [None]:
# Drop the columns that identify a row rather than describe the car: car_ID is
# a running row number (keeping it lets the tree split on file order instead
# of on real properties) and CarName is a free-text model name.
car_df = car_df.drop(columns=['car_ID', 'CarName'])

binary_and_nominal_column_names = ['fueltype', 'aspiration', 'doornumber',
                                   'carbody', 'drivewheel', 'enginelocation',
                                   'enginetype', 'cylindernumber', 'fuelsystem']

car_df = transform_binary_and_nominal_columns(
    car_df,
    binary_and_nominal_column_names
)

# Passed-through columns come back prefixed with 'remainder__'. `pop` removes
# the target from car_df in place, leaving only the features.
car_y = car_df.pop('remainder__price')

In [None]:
# Fit and evaluate a decision tree regressor with default settings.
train_test_and_visualize_model(
    X=car_df,
    y=car_y,
    model=DecisionTreeRegressor(),
    name='car_prices',
)

Mean squared error: 7506797.07
Coefficient of determination: 0.91


