# Machine Learning - Homework 4

In [None]:
import pandas as pd
from sklearn.base import is_regressor
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.svm import SVC

In [None]:
def read_csv_and_analyze_dataset(path_to_csv):
  """Load a CSV file, print it, and summarise its non-numeric columns.

  For every column whose dtype is not numeric, each distinct value is printed
  together with the number of rows that hold it. Missing values are counted
  as well.

  Args:
    path_to_csv: Location of the CSV file, passed straight to
      ``pandas.read_csv``.

  Returns:
    The loaded ``pandas.DataFrame``.
  """
  dataframe = pd.read_csv(path_to_csv)

  print("The dataset:")
  print(dataframe, end='\n\n')

  print("All the unique values for all columns that contain discrete values:")

  for column_name in dataframe:
    column = dataframe[column_name]

    if pd.api.types.is_numeric_dtype(column):
      continue

    print(f"\n{column_name}:")

    for unique_value in column.unique():
      # NaN never compares equal to itself, so an equality test would report
      # missing values with a count of 0; count them with isna() instead.
      if pd.isna(unique_value):
        occurrences = column.isna().sum()
      else:
        occurrences = (column == unique_value).sum()

      print(f"{unique_value} ({occurrences})")

  print()

  return dataframe

def normalize_values(dataframe):
  """Min-max scale every column of ``dataframe`` with ``MinMaxScaler``.

  The scaler is fitted on the frame it transforms, so each column is rescaled
  using its own minimum and maximum.

  Args:
    dataframe: A ``pandas.DataFrame`` whose values the scaler can process
      (i.e. numeric data).

  Returns:
    A new ``pandas.DataFrame`` with the scaled values and the same column
    labels and row index as the input.
  """
  scaler = MinMaxScaler()

  normalized_values = scaler.fit_transform(dataframe.values)

  # Carry the row index over as well; otherwise the result silently gets a
  # fresh 0..n-1 index and no longer aligns with data split off the same frame.
  return pd.DataFrame(
      normalized_values,
      columns=dataframe.columns,
      index=dataframe.index
  )

def transform_ordinal_columns(dataframe, column_names):
  """Replace the given columns with their ``OrdinalEncoder`` codes.

  Note that the columns are overwritten on the frame that was passed in; the
  same (mutated) object is returned for convenience.

  Args:
    dataframe: The ``pandas.DataFrame`` to modify.
    column_names: List of column labels to encode.

  Returns:
    ``dataframe`` itself, with the selected columns encoded.
  """
  encoder = OrdinalEncoder()
  encoded_values = encoder.fit_transform(dataframe[column_names])

  dataframe[column_names] = encoded_values

  return dataframe

def transform_binary_and_nominal_columns(dataframe, column_names):
  """One-hot encode the given columns and pass the others through unchanged.

  Columns with exactly two categories are reduced to a single indicator column
  (``drop='if_binary'``). Output column names are the ones reported by the
  column transformer's ``get_feature_names_out()``.

  Args:
    dataframe: The ``pandas.DataFrame`` to transform. It is not modified.
    column_names: List of column labels to one-hot encode.

  Returns:
    A new ``pandas.DataFrame`` holding the encoded columns followed by the
    untouched remaining columns, with the same row index as the input.
  """
  transformer = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), column_names),
    remainder='passthrough',
    # Always stack the result as a dense array. With the default threshold the
    # transformer may return a SciPy sparse matrix when the output is sparse
    # enough, which the DataFrame construction below is not written for.
    sparse_threshold=0
  )

  transformed = transformer.fit_transform(dataframe)

  return pd.DataFrame(
      transformed,
      columns=transformer.get_feature_names_out(),
      index=dataframe.index
  )

def train_and_test_model(X, y, model, random_state=None):
  """Fit ``model`` on 80% of the data and print metrics for the other 20%.

  Regressors are reported with the mean squared error and the coefficient of
  determination; every other estimator is treated as a classifier and reported
  with its ``score`` (printed as accuracy) and a confusion matrix.

  Args:
    X: Feature matrix.
    y: Target values, aligned row by row with ``X``.
    model: An unfitted scikit-learn estimator. It is fitted in place.
    random_state: Optional seed forwarded to ``train_test_split`` so that the
      split can be reproduced. The default ``None`` gives a different random
      split on every call.

  Returns:
    None. The metrics are printed.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X,
      y,
      test_size=0.2,
      shuffle=True,
      random_state=random_state
  )

  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)

  # Ask scikit-learn what kind of estimator this is rather than comparing the
  # class name, so that regressors other than MLPRegressor are reported with
  # regression metrics too.
  if is_regressor(model):
    print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))
  else:
    print("Accuracy: %.2f" % model.score(X_test, y_test))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))


## Implement in R or Python a neural network model for the Pima Indians Diabetes problem

In [None]:
# Load the diabetes data set and print an overview of its contents.
diabetes_df = read_csv_and_analyze_dataset(path_to_csv='diabetes.csv')

The dataset:
     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1     

In [None]:
# Split off the target before scaling so that only the features go through
# the scaler and the class labels keep their original integer values. This is
# the same order of operations as in the iris preparation cell.
diabetes_y = diabetes_df.pop('Outcome')

diabetes_df = normalize_values(diabetes_df)

In [None]:
# Three hidden layers of 10 units each, trained with Adam. The seed fixes the
# weight initialisation and the solver's shuffling, so the network itself is
# built the same way on every run. Note: the train/test split is drawn
# separately and is not controlled by this seed.
neural_network_model = MLPClassifier(
    solver='adam',
    hidden_layer_sizes=(10, 10, 10),
    max_iter=3000,
    random_state=42
)

train_and_test_model(diabetes_df, diabetes_y, neural_network_model)

Accuracy: 0.80
Confusion matrix:
[[87 12]
 [19 36]]


## Implement in R or Python a support vector machine model for the iris data set to be compared to the result of the neural network from this lecture

In [None]:
# Load the iris data set and print an overview of its contents.
iris_df = read_csv_and_analyze_dataset(path_to_csv='iris.csv')

The dataset:
     sepal.length  sepal.width  petal.length  petal.width    variety
0             5.1          3.5           1.4          0.2     Setosa
1             4.9          3.0           1.4          0.2     Setosa
2             4.7          3.2           1.3          0.2     Setosa
3             4.6          3.1           1.5          0.2     Setosa
4             5.0          3.6           1.4          0.2     Setosa
..            ...          ...           ...          ...        ...
145           6.7          3.0           5.2          2.3  Virginica
146           6.3          2.5           5.0          1.9  Virginica
147           6.5          3.0           5.2          2.0  Virginica
148           6.2          3.4           5.4          2.3  Virginica
149           5.9          3.0           5.1          1.8  Virginica

[150 rows x 5 columns]

All the unique values for all columns that contain discrete values:

variety:
Setosa (50)
Versicolor (50)
Virginica (50)



In [None]:
# 'variety' is the class label of this multiclass problem, not an ordinal
# feature, so it is encoded with LabelEncoder (the encoder intended for target
# values). This yields one integer code per class.
iris_labels = iris_df.pop('variety')

iris_y = pd.Series(
    LabelEncoder().fit_transform(iris_labels),
    index=iris_labels.index,
    name='variety'
)

# Only the remaining feature columns are scaled.
iris_df = normalize_values(iris_df)

In [None]:
# Support vector classifier with scikit-learn's default settings.
support_vector_model = SVC()

train_and_test_model(X=iris_df, y=iris_y, model=support_vector_model)

Accuracy: 0.97
Confusion matrix:
[[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]


## Build a neural network for a car price data set regression task in either R or Python

In [None]:
# Load the car price data set and print an overview of its contents.
car_df = read_csv_and_analyze_dataset(path_to_csv='CarPrice.csv')

The dataset:
     car_ID  symboling                   CarName fueltype aspiration  \
0         1          3        alfa-romero giulia      gas        std   
1         2          3       alfa-romero stelvio      gas        std   
2         3          1  alfa-romero Quadrifoglio      gas        std   
3         4          2               audi 100 ls      gas        std   
4         5          2                audi 100ls      gas        std   
..      ...        ...                       ...      ...        ...   
200     201         -1           volvo 145e (sw)      gas        std   
201     202         -1               volvo 144ea      gas      turbo   
202     203         -1               volvo 244dl      gas        std   
203     204         -1                 volvo 246   diesel      turbo   
204     205         -1               volvo 264gl      gas      turbo   

    doornumber      carbody drivewheel enginelocation  wheelbase  ...  \
0          two  convertible        rwd          f

In [None]:
# 'CarName' is free text and 'car_ID' is just a running row identifier; neither
# is a meaningful predictor of the price, so both are removed before modelling.
car_df = car_df.drop(['CarName', 'car_ID'], axis=1)

binary_and_nominal_column_names = ['fueltype', 'aspiration', 'doornumber',
                                   'carbody', 'drivewheel', 'enginelocation',
                                   'enginetype', 'cylindernumber', 'fuelsystem']

car_df = transform_binary_and_nominal_columns(
    car_df,
    binary_and_nominal_column_names
)

# Every column, including the price, is min-max scaled here, so the regression
# metrics reported later are expressed in scaled price units.
car_df = normalize_values(car_df)

# The column transformer prefixes passed-through columns with 'remainder__'.
car_y = car_df.pop('remainder__price')

In [None]:
# Default multi-layer perceptron regressor. The seed fixes the weight
# initialisation and the solver's shuffling, so the network itself is built the
# same way on every run. Note: the train/test split is drawn separately and is
# not controlled by this seed.
neural_network_regression_model = MLPRegressor(random_state=42)

train_and_test_model(car_df, car_y, neural_network_regression_model)

Mean squared error: 0.01
Coefficient of determination: 0.82
