# Machine Learning - Homework 3

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score, classification_report

## Implement in R or Python a model of an SVM classifier for Wisconsin breast cancer diagnosis

- Drop first column as non-informative
- Treat missing values (delete, change to 0 etc.)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import RFE

# Load the Wisconsin breast-cancer table. Missing measurements appear as '?'
# in the file; parsing them as NaN at read time keeps every column numeric,
# whereas replacing '?' with 0 afterwards leaves a mixed str/int object column.
# The missing values are then imputed with 0, one of the options the task allows.
data = pd.read_csv("breast-cancer-wisconsin.csv", na_values='?').fillna(0)

# The first column is dropped as non-informative; the last remaining column is
# used as the class label and everything in between as features.
data = data.iloc[:, 1:]
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Fixed seed so the 80/20 split, and therefore the reported metrics, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scalers compared by build_and_test_model, keyed by the name printed in its report.
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler(),
}

def scale_data(X_train, X_test, scaler):
    """Scale both splits with ``scaler``, fitting it on the training split only.

    Args:
        X_train: Training features; used to fit the scaler and then transformed.
        X_test: Test features; transformed with the already-fitted scaler.
        scaler: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        A ``(scaled_train, scaled_test)`` tuple.
    """
    scaled_train = scaler.fit_transform(X_train)
    # The test split reuses the statistics learned from the training split.
    return scaled_train, scaler.transform(X_test)

def build_and_test_model(svm_model, X_train, X_test, y_train, y_test, num_features_to_select=5):
    """Evaluate ``svm_model`` once for every scaler in the module-level ``scalers`` dict.

    For each scaler the features are scaled (the scaler is fitted on the
    training split only), reduced with recursive feature elimination (RFE),
    and the passed-in model is then fitted on the selected training features
    and scored on the test split. The selected-feature mask, the RFE ranking,
    the accuracy and the classification report are printed.

    Args:
        svm_model: Estimator handed to RFE and also fitted in place for the
            final prediction.
            NOTE(review): RFE presumably needs coefficient-style importances
            from it (e.g. a linear-kernel SVC) — confirm.
        X_train, X_test: Feature splits.
        y_train, y_test: Label splits.
        num_features_to_select: Number of features RFE keeps. Defaults to 5,
            the value that was previously hard-coded inside the loop.

    Returns:
        None. Results are only printed.
    """
    for scaler_name, scaler in scalers.items():
        X_train_scaled, X_test_scaled = scale_data(X_train, X_test, scaler)

        # Feature selection is fitted on the training split only and then
        # applied unchanged to the test split.
        rfe = RFE(estimator=svm_model, n_features_to_select=num_features_to_select)
        X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)
        X_test_rfe = rfe.transform(X_test_scaled)

        svm_model.fit(X_train_rfe, y_train)

        y_pred = svm_model.predict(X_test_rfe)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        print(f"{scaler_name}:")
        print("Selected Features:", rfe.support_)
        print("Feature Ranking:", rfe.ranking_)
        print("Accuracy:", accuracy)
        print("Classification Report:\n", report)

# Run the scaler comparison with a linear-kernel SVC on the breast-cancer split.
# NOTE(review): the linear kernel is presumably required so that RFE can rank
# features by the model's coefficients — confirm.
build_and_test_model(SVC(kernel='linear'), X_train, X_test, y_train, y_test)

StandardScaler:
Selected Features: [ True False  True  True False  True  True False False]
Feature Ranking: [1 4 1 1 3 1 1 5 2]
Accuracy: 0.9642857142857143
Classification Report:
               precision    recall  f1-score   support

           2       0.96      0.99      0.97        95
           4       0.98      0.91      0.94        45

    accuracy                           0.96       140
   macro avg       0.97      0.95      0.96       140
weighted avg       0.96      0.96      0.96       140

MinMaxScaler:
Selected Features: [ True False  True False  True  True  True False False]
Feature Ranking: [1 4 1 3 1 1 1 5 2]
Accuracy: 0.9571428571428572
Classification Report:
               precision    recall  f1-score   support

           2       0.96      0.98      0.97        95
           4       0.95      0.91      0.93        45

    accuracy                           0.96       140
   macro avg       0.96      0.95      0.95       140
weighted avg       0.96      0.96      0.96       140

## Implement in R or Python an SVM model for regression for the Tips data set from the previous lecture


In [None]:
def read_csv_and_analyze_dataset(path_to_csv):
  """Load a CSV file, print an overview of it, and return it as a DataFrame.

  The overview consists of the frame itself followed by, for every
  non-numeric column, each distinct value together with the number of rows
  that hold it (in order of first appearance).

  Args:
    path_to_csv: Path passed straight to ``pd.read_csv``.

  Returns:
    The loaded ``pandas.DataFrame``.
  """
  dataframe = pd.read_csv(path_to_csv)

  print("The dataset:")
  print(dataframe, end='\n\n')

  print("All the unique values for all columns that contain discrete values:")

  for column_name in dataframe:
    column = dataframe[column_name]
    if pd.api.types.is_numeric_dtype(column):
      continue

    print(f"\n{column_name}:")

    for unique_value in column.unique():
      # NaN never compares equal to itself, so an equality mask would report
      # missing entries as occurring 0 times; count them with isna() instead.
      if pd.isna(unique_value):
        occurrences = int(column.isna().sum())
      else:
        occurrences = int((column == unique_value).sum())
      print(f"{unique_value} ({occurrences})")

  print()

  return dataframe

def transform_binary_and_nominal_columns(dataframe, column_names):
  """One-hot encode ``column_names`` and pass every other column through.

  Binary columns collapse to a single indicator column (``drop='if_binary'``).
  The input frame is not modified.

  Args:
    dataframe: Source ``pandas.DataFrame``.
    column_names: Columns to one-hot encode.

  Returns:
    A new DataFrame whose columns are named by the transformer's
    ``get_feature_names_out()`` and whose index matches ``dataframe``.
  """
  transformer = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), column_names),
    remainder='passthrough',
    # Always return a dense array: with many categories the stacked result
    # could otherwise come back as a SciPy sparse matrix, which the
    # DataFrame constructor below cannot take directly.
    sparse_threshold=0
  )

  transformed = transformer.fit_transform(dataframe)

  return pd.DataFrame(
      transformed,
      columns=transformer.get_feature_names_out(),
      # Keep the caller's row labels instead of silently resetting them.
      index=dataframe.index
  )

def transform_ordinal_columns(dataframe, column_names):
  """Return a copy of ``dataframe`` with ``column_names`` ordinal-encoded.

  The encoding is produced by a freshly fitted ``OrdinalEncoder``. The input
  frame is left untouched, so the cell that calls this can be re-run safely
  and the behaviour matches ``transform_binary_and_nominal_columns``.

  Args:
    dataframe: Source ``pandas.DataFrame``.
    column_names: Columns to replace with their ordinal codes.

  Returns:
    A new DataFrame with the encoded columns; other columns are unchanged.
  """
  encoded = dataframe.copy()
  encoded[column_names] = OrdinalEncoder().fit_transform(
      dataframe[column_names]
  )

  return encoded

def train_and_test_model(X, y, model, test_size=0.2, random_state=42):
  """Fit ``model`` on a shuffled train split and print test-split regression metrics.

  Args:
    X: Feature matrix.
    y: Target values.
    model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
    test_size: Fraction of rows held out for testing. Defaults to 0.2, the
      value that was previously hard-coded.
    random_state: Seed for the shuffle so that the printed metrics are
      reproducible between runs. Pass ``None`` for a different split on
      every call.

  Returns:
    None. The mean squared error and the coefficient of determination are
    printed.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X,
      y,
      test_size=test_size,
      shuffle=True,
      random_state=random_state
  )

  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
  print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))


### Analyze the dataset

In [None]:
# Load the tips data; the helper also prints the frame and, for each
# non-numeric column, every distinct value with its row count.
tips_df = read_csv_and_analyze_dataset('tips.csv')

The dataset:
     total_bill   tip     sex smoker   day    time  size
0         16.99  1.01  Female     No   Sun  Dinner     2
1         10.34  1.66    Male     No   Sun  Dinner     3
2         21.01  3.50    Male     No   Sun  Dinner     3
3         23.68  3.31    Male     No   Sun  Dinner     2
4         24.59  3.61  Female     No   Sun  Dinner     4
..          ...   ...     ...    ...   ...     ...   ...
239       29.03  5.92    Male     No   Sat  Dinner     3
240       27.18  2.00  Female    Yes   Sat  Dinner     2
241       22.67  2.00    Male    Yes   Sat  Dinner     2
242       17.82  1.75    Male     No   Sat  Dinner     2
243       18.78  3.00  Female     No  Thur  Dinner     2

[244 rows x 7 columns]

All the unique values for all columns that contain discrete values:

sex:
Female (87)
Male (157)

smoker:
No (151)
Yes (93)

day:
Sun (76)
Sat (87)
Thur (62)
Fri (19)

time:
Dinner (176)
Lunch (68)



### Preprocess the dataset

In [None]:
# Encode the weekday as a number (Thur=4 ... Sun=7) so that its order is kept.
# Series.map on the single column is used instead of DataFrame.replace:
# replacing strings with integers relies on implicit object-dtype downcasting,
# which recent pandas versions deprecate. Note that a day missing from the
# mapping becomes NaN here rather than being left as text.
DAY_TO_NUMBER = {'Thur': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7}
tips_df = tips_df.assign(day=tips_df['day'].map(DAY_TO_NUMBER))

# Two-valued text columns; each collapses to one 0/1 indicator column.
binary_column_names = ['sex', 'smoker', 'time']

tips_df = transform_binary_and_nominal_columns(tips_df, binary_column_names)

tips_df

Unnamed: 0,onehotencoder__sex_Male,onehotencoder__smoker_Yes,onehotencoder__time_Lunch,remainder__total_bill,remainder__tip,remainder__day,remainder__size
0,0.0,0.0,0.0,16.99,1.01,7.0,2.0
1,1.0,0.0,0.0,10.34,1.66,7.0,3.0
2,1.0,0.0,0.0,21.01,3.50,7.0,3.0
3,1.0,0.0,0.0,23.68,3.31,7.0,2.0
4,0.0,0.0,0.0,24.59,3.61,7.0,4.0
...,...,...,...,...,...,...,...
239,1.0,0.0,0.0,29.03,5.92,6.0,3.0
240,0.0,1.0,0.0,27.18,2.00,6.0,2.0
241,1.0,1.0,0.0,22.67,2.00,6.0,2.0
242,1.0,0.0,0.0,17.82,1.75,6.0,2.0


### Train and test the model

In [None]:
# pop() removes the target column from tips_df in place and returns it, so
# tips_df is left holding only the feature columns. Re-running this cell on
# its own raises KeyError because the column is already gone.
# NOTE(review): this `y` overwrites the `y` defined in the breast-cancer section.
y = tips_df.pop('remainder__tip')

In [None]:
# Fit an SVR with default hyper-parameters on the encoded tips features and
# print the mean squared error and R² measured on the held-out split.
# NOTE(review): the features are not scaled before being passed to SVR —
# confirm this is intended.
train_and_test_model(tips_df, y, SVR())

Mean squared error: 1.17
Coefficient of determination: 0.46
