# Supervised Learning Model Evaluation Lab

Complete the exercises below to solidify your knowledge and understanding of supervised learning model evaluation.

In [1]:
import pandas as pd

## Regression Model Evaluation

In [3]:
from sklearn.datasets import load_boston

data = load_boston()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
# X = data.iloc[:,:13]
y = pd.DataFrame(data["target"], columns=['MEDV'])
# y = data.MEDV
data = pd.concat([X, y], axis=1)

## 1. Split this data set into training (80%) and testing (20%) sets.

The `MEDV` field represents the median value of owner-occupied homes (in $1000's) and is the target variable that we will want to predict.

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

## 2. Train a `LinearRegression` model on this data set and generate predictions on both the training and the testing set.

In [30]:
from sklearn import linear_model
model = linear_model.LinearRegression()
model.fit(X_train,y_train) 
y_pred_test = model.predict(X_test)
print(y_pred_test)
print('')
y_pred_train = model.predict(X_train)
print(y_pred_train)

[ 9.18315027 18.9093325  13.53692072 30.71993081 31.2915764  20.82547173
 15.07855504 20.76858078 23.24916514 19.17029971 22.52989547 33.49235372
 17.73275119 24.71265536 14.56789235 21.74337097 26.35063591 26.0125025
 21.57751527 28.98534614 22.6281822  17.63112776 13.8470178  19.88874749
 21.18132126 17.35913103 16.74955323 22.49127866 36.08556481 13.72783755
 15.83971617 25.73638889 34.77119985 25.52643986 16.64416557 32.70952522
 22.59397893 13.14973956 17.75050926 31.016355   14.16363369 27.93297166
 21.53567873 27.14764496 19.73746788 30.46806127 12.35010316 29.99373069
 20.04447157 12.59093734 36.24050596 11.3193826  29.59153122 19.62756602
 15.84696669 33.05014138 18.87768126 21.1424275  13.73173903 -5.2566788
 24.08894135 16.9998752  16.63254278 19.81949336 20.22611502 17.08365964
 13.29564658 22.61965103 22.90846965 29.2570923  25.53880443 12.29692537
 39.35655721 20.27037065  6.2990659  16.27037514 20.58948251 15.74184524
 13.05540534 19.90528397 20.4186742  19.45902411 31.9

## 3. Calculate and print R-squared for both the training and the testing set.

In [23]:
from sklearn.metrics import r2_score
r_test = r2_score(y_test, y_pred_test)
print(r_test)
r_train = r2_score(y_train, y_pred_train) 
print(r_train)

0.6924466811059868
0.7442073369477029


## 4. Calculate and print mean squared error for both the training and the testing set.

In [35]:
from sklearn.metrics import mean_squared_error

r_error_test = mean_squared_error(y_test, y_pred_test) 
print(r_error_test)
r_error_train = mean_squared_error(y_train, y_pred_train) 
print(r_error_train)

18.68388688483125
23.04091359014181


## 5. Calculate and print mean absolute error for both the training and the testing set.

In [38]:
from sklearn.metrics import mean_absolute_error

r_errabs_test = mean_absolute_error(y_test, y_pred_test)
print(r_errabs_test)
r_errabs_train = mean_absolute_error(y_train, y_pred_train) 
print(r_errabs_train)

3.205410622663202
3.3361195201959033


## Classification Model Evaluation

In [39]:
from sklearn.datasets import load_iris

data = load_iris()

X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = pd.DataFrame(data["target"], columns=["class"])

data = pd.concat([X, y], axis=1)

## 6. Split this data set into training (80%) and testing (20%) sets.

The `class` field represents the type of flower and is the target variable that we will want to predict.

In [40]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

## 7. Train a `LogisticRegression` model on this data set and generate predictions on both the training and the testing set.

In [46]:
from sklearn.linear_model import LogisticRegression
model = linear_model.LogisticRegression()
model.fit(X_train,y_train) 
y_pred_test = model.predict(X_test)
print(y_pred_test)
print('')
y_pred_train = model.predict(X_train)
print(y_pred_train)

[2 2 2 2 0 0 1 0 1 2 2 0 1 0 2 1 0 0 0 2 1 1 0 1 2 1 2 1 1 0]

[0 2 0 1 2 0 2 1 2 1 1 1 1 2 2 2 0 2 2 2 2 2 1 0 0 1 2 2 0 2 1 1 2 2 2 2 2
 0 0 2 1 1 2 0 0 0 2 1 1 0 0 2 2 0 1 0 0 2 1 2 2 0 0 1 1 0 2 2 2 2 0 2 1 0
 1 0 0 0 2 2 1 1 0 0 2 0 0 0 0 0 1 1 0 2 1 0 0 2 2 0 0 1 2 1 2 2 2 0 1 2 1
 2 1 0 2 1 2 1 2 0]


  y = column_or_1d(y, warn=True)


## 8. Calculate and print the accuracy score for both the training and the testing set.

In [48]:
from sklearn.metrics import accuracy_score

acc_test = accuracy_score(y_test, y_pred_test)
print(acc_test)
acc_train = accuracy_score(y_train, y_pred_train)
print(acc_train)

0.9333333333333333
0.95


## 9. Calculate and print the balanced accuracy score for both the training and the testing set.

In [50]:
from sklearn.metrics import balanced_accuracy_score

balacc_test = balanced_accuracy_score(y_test, y_pred_test)
print(balacc_test)
balacc_train = balanced_accuracy_score(y_train, y_pred_train)
print(balacc_train)

0.9444444444444445
0.9473684210526315


## 10. Calculate and print the precision score for both the training and the testing set.

In [52]:
from sklearn.metrics import precision_score

pres_sco_test = precision_score(y_test, y_pred_test, average='micro')
print(pres_sco_test)
pres_sco_train = precision_score(y_train, y_pred_train, average='micro')
print(pres_sco_train)

0.9333333333333333
0.95


## 11. Calculate and print the recall score for both the training and the testing set.

In [56]:
from sklearn.metrics import recall_score

recall_test = recall_score(y_test, y_pred_test, average='micro')  
print(recall_test)
recall_train = recall_score(y_train, y_pred_train, average='micro')
print(recall_train)

0.9333333333333333
0.95


## 12. Calculate and print the F1 score for both the training and the testing set.

In [57]:
from sklearn.metrics import f1_score

f1_test = f1_score(y_test, y_pred_test, average='micro')  
print(f1_test)
f1_train = f1_score(y_train, y_pred_train, average='micro')  
print(f1_train)

0.9333333333333333
0.9500000000000001


## 13. Generate confusion matrices for both the training and the testing set.

In [62]:
from sklearn.metrics import confusion_matrix 

matrix_test = confusion_matrix(y_test,y_pred_test)
print(matrix_test)
print('')
matrix_train = confusion_matrix(y_train,y_pred_train)
print(matrix_train)

[[10  0  0]
 [ 0 10  2]
 [ 0  0  8]]

[[40  0  0]
 [ 0 32  6]
 [ 0  0 42]]


## Bonus: For each of the data sets in this lab, try training with some of the other models you have learned about, recalculate the evaluation metrics, and compare to determine which models perform best on each data set.