In [59]:
# Import the necessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [60]:
# Load various printing functions

# Pretty prints a data frame without display limits
def print_df(df):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        print(df)


# Pretty prints the results of the classifier performance evaluation
def print_classifier_results(accuracy, precision, recall, f1, mode):
    print()
    print('~~~~~~~~~~~~~~~~ CLASSIFICATION RESULTS (' + mode.upper() + ') ~~~~~~~~~~~~~~~~')
    print(' Accuracy: ' + str(accuracy))
    print('Precision: ' + str(precision))
    print('   Recall: ' + str(recall))
    print(' F1 Score: ' + str(f1))


# Pretty prints the results of the regressor performance evaluation
def print_regressor_results(rmse, mae, r_squared, mode):
    print()
    print('~~~~~~~~~~~~~~~~ REGRESSION RESULTS (' + mode.upper() + ') ~~~~~~~~~~~~~~~~')
    print('Root Mean Squared Error (RMSE): ' + str(rmse))
    print('     Mean Absolute Error (MAE): ' + str(mae))
    print('               R-squared Score: ' + str(r_squared))


# Pretty prints the results of the multiple classifications
def print_fitting_results(accuracy_list):

    print()
    print('~~~~~~~~~~~~~~~~ CLASSIFICATION RESULTS ~~~~~~~~~~~~~~~~')

    for accuracy_pair in accuracy_list:
        print('Training Set Accuracy 1: ' + str(accuracy_pair[0]))
        print('    Test Set Accuracy 1: ' + str(accuracy_pair[1]))
        print()


# Pretty prints the results of the clustering algorithm
def print_clustering_results(silhouette_score):

    print()
    print('~~~~~~~~~~~~~~~~ CLUSTERING RESULTS ~~~~~~~~~~~~~~~~')
    print('Silhouette Score: ' + str(silhouette_score))


**Exercise 1: Classification**


In [61]:
# Load the dataset from the local session folder
classification_dataset = pd.read_csv("diabetes.csv")

# Take a look at the first entries of the dataset
print_df(classification_dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [62]:
# Extract the features from the dataset
classification_X = dataset.iloc[:, :-1]
# print_df(classification_X.head())

# Extract the labels from the dataset
classification_y = dataset.iloc[:, -1:]
# print_df(classification_y.head())

In [63]:
# Split the data into a training set and a test set with a 80-20 ratio
classification_X_train, classification_X_test, classification_y_train, classification_y_test = train_test_split(classification_X, classification_y, test_size=0.2, random_state=42)

In [64]:
# Build a classifier (less important: a decision tree will be used)
classifier = DecisionTreeClassifier(random_state=42)

# Fit data from the training set to the classifier
classifier.fit(classification_X_train, classification_y_train)

# Make predictions on the test set
classification_y_pred = classifier.predict(classification_X_test)

Task 2.A.

Evaluate the performance of a classifier by manually computing accuracy, precision, recall and F1 score.

In [65]:
# Computes the accuracy of the model using the confusion matrix
def compute_accuracy(cm):
    # TODO - TASK A
    accuracy = None

    return accuracy

In [66]:
# Computes the precision of the model using the confusion matrix
def compute_precision(cm):
    # TODO - TASK A
    precision = None

    return precision

In [67]:
# Computes the recall of the model using the confusion matrix
def compute_recall(cm):
    # TODO - TASK A
    recall = None

    return recall

In [68]:
# Computes the F1 score of the model using the precision and recall
def compute_f1_score(precision, recall):
    # TODO - TASK A
    f1 = None

    return f1

In [69]:
# TODO - TASK A
cm = None

task_a_accuracy = compute_accuracy(cm)
task_a_precision = compute_precision(cm)
task_a_recall = compute_recall(cm)
task_a_f1 = compute_f1_score(precision, recall)

print_classifier_results(task_a_accuracy, task_a_precision, task_a_recall, task_a_f1, 'dumb')


~~~~~~~~~~~~~~~~ CLASSIFICATION RESULTS (DUMB) ~~~~~~~~~~~~~~~~
 Accuracy: None
Precision: None
   Recall: None
 F1 Score: None


Task 2.B.

Evaluate the performance of a classifier using code written by others.

In [70]:
# TODO - TASK B
task_b_accuracy = None
task_b_precision, task_b_recall, task_b_f1, _ = None, None, None, None

print_classifier_results(task_b_accuracy, task_b_precision, task_b_recall, task_b_f1, 'smart')


~~~~~~~~~~~~~~~~ CLASSIFICATION RESULTS (SMART) ~~~~~~~~~~~~~~~~
 Accuracy: None
Precision: None
   Recall: None
 F1 Score: None


**Exercise 2: Linear Regression**


In [71]:
# Load the dataset from the local session folder
regression_dataset = pd.read_csv('diabetes.csv', dtype=np.float64)

# Take a look at the first entries of the dataset
print_df(regression_dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0      0.0  33.6   
1          1.0     85.0           66.0           29.0      0.0  26.6   
2          8.0    183.0           64.0            0.0      0.0  23.3   
3          1.0     89.0           66.0           23.0     94.0  28.1   
4          0.0    137.0           40.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627  50.0      1.0  
1                     0.351  31.0      0.0  
2                     0.672  32.0      1.0  
3                     0.167  21.0      0.0  
4                     2.288  33.0      1.0  


In [72]:
# Extract the features from the dataset
regression_X = regression_dataset.iloc[:, :-1]
# print_df(regression_X.head())

# Extract the labels from the dataset
regression_y = regression_dataset.iloc[:, -1:]
# print_df(regression_y.head())

In [73]:
# Split the data into a training set and a test set with a 80-20 ratio
regression_X_train, regression_X_test, regression_y_train, regression_y_test = train_test_split(regression_X, regression_y, test_size=0.2, random_state=42)

In [74]:
# Build a linear regressor
regressor = LinearRegression()

# Fit data from the training set to the regressor
regressor.fit(X_train, y_train)

# Make predictions on the test set
regression_y_pred = regressor.predict(X_test)

Task 1.A.

Evaluate the performance of a regressor by manually computing RMSE, MAE and R2 score.

In [75]:
# Computes the RMSE of the model
def compute_rmse(y_test, y_pred):
    # TODO - TASK A
    return None

In [76]:
# Computes the MAE of the model
def compute_mae(y_test, y_pred):
    # TODO - TASK A
    return None

In [77]:
# Computes the R-squared score of the model
def compute_r2_score(y_test, y_pred):
    # TODO - TASK A
    return None

In [78]:
task_a_rmse = compute_rmse(y_test, y_pred)
task_a_mae = compute_mae(y_test, y_pred)
task_a_r_squared = compute_r2_score(y_test, y_pred)

print_regressor_results(task_a_rmse, task_a_mae, task_a_r_squared, 'dumb')


~~~~~~~~~~~~~~~~ REGRESSION RESULTS (DUMB) ~~~~~~~~~~~~~~~~
Root Mean Squared Error (RMSE): None
     Mean Absolute Error (MAE): None
               R-squared Score: None


Task 1.B.

Evaluate the performance of a regressor using code written by others

In [79]:
# TODO - TASK B
task_b_rmse = None
task_b_mae = None
task_b_r_squared = None

In [80]:
print_regressor_results(task_b_rmse, task_b_mae, task_b_r_squared, 'smart')


~~~~~~~~~~~~~~~~ REGRESSION RESULTS (SMART) ~~~~~~~~~~~~~~~~
Root Mean Squared Error (RMSE): None
     Mean Absolute Error (MAE): None
               R-squared Score: None


Task 1.C.

Tweak some parameters and draw some plots

In [81]:
# Plots the evolution of the RMSE when the dataset size varies
def plot_rmse_evolution(X, y, chunks, min_chunk_size, max_chunk_size):
    # TODO - TASK C
    pass

In [82]:
# TODO - TASK C
CHUNKS = 100
MIN_CHUNK_SIZE = 1000
MAX_CHUNK_SIZE = 1000000

plot_rmse_evolution(regression_X, regression_y, CHUNKS, MIN_CHUNK_SIZE, MAX_CHUNK_SIZE)

**Exercise 3: Fitting Behaviour**


In [86]:
# Load the dataset from the local session folder
fitting_dataset = pd.read_csv("diabetes.csv")

# Take a look at the first entries of the dataset
print_df(fitting_dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          6.0    148.0           72.0           35.0      0.0  33.6   
1          1.0     85.0           66.0           29.0      0.0  26.6   
2          8.0    183.0           64.0            0.0      0.0  23.3   
3          1.0     89.0           66.0           23.0     94.0  28.1   
4          0.0    137.0           40.0           35.0    168.0  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627  50.0      1.0  
1                     0.351  31.0      0.0  
2                     0.672  32.0      1.0  
3                     0.167  21.0      0.0  
4                     2.288  33.0      1.0  


In [88]:
# Extract the features from the dataset
fitting_X = dataset.iloc[:, :-1]
# print_df(fitting_X.head())

# Extract the labels from the dataset
fitting_y = dataset.iloc[:, -1:]
# print_df(fitting_y.head())

In [89]:
# Split the data into a training set and a test set with a 80-20 ratio
fitting_X_train, fitting_X_test, fitting_y_train, fitting_y_test = train_test_split(fitting_X, fitting_y, test_size=0.2, random_state=42)

In [91]:
# Build 3 different trained models
classifiers = [DecisionTreeClassifier(max_depth=1, random_state=42).fit(X_train, y_train),
               DecisionTreeClassifier(max_depth=5, random_state=42).fit(X_train, y_train),
               DecisionTreeClassifier(max_depth=32, random_state=42).fit(X_train, y_train)]

Task 3.A.

Write the code for generating 3 different fitting behaviours.

In [94]:
# Makes prediction on both the training set and test set
# HINT 1: You can reuse some code above
# HINT 2: The models are already trained
def make_predictions(clf, X_train, X_test, y_train, y_test):
    # TODO - TASK A
    train_accuracy = None
    test_accuracy = None

    return train_accuracy, test_accuracy

In [95]:
# Make predictions on the training and test sets and evaluate the performance of each model
accuracy_list = []
for clf in classifiers:
    accuracy_list.append(make_predictions(clf, fitting_X_train, fitting_X_test, fitting_y_train, fitting_y_test))

# Print the performance evaluation results
print_fitting_results(accuracy_list)


~~~~~~~~~~~~~~~~ CLASSIFICATION RESULTS ~~~~~~~~~~~~~~~~
Training Set Accuracy 1: None
    Test Set Accuracy 1: None

Training Set Accuracy 1: None
    Test Set Accuracy 1: None

Training Set Accuracy 1: None
    Test Set Accuracy 1: None



Task 3.B.

Write your conclusions.

In [96]:
"""
TODO - TASK B

Your conclusion here...
"""

'\nTODO - TASK B\n\nYour conclusion here...\n'

**Exercise 4: Clustering**
