In [None]:
'''
Generalizability
Refers to an algorithm's ability to give accurate predictions for
new, previously unseen data

Assumptions
Future unseen data (the test set) will have the same properties as the
current training set

Models that are accurate on the training set are expected to be accurate
on the test set

Overfitting:
When a model is fit too closely to the particularities
of the training set, so it does not generalize well to new data

Underfitting:
The model is too simple and fails to capture all aspects of, and variability
in, the data
'''

In [None]:
'''
Goal: Optimization
We want to improve accuracy without making the model so complex
that it compromises the generalizability of the algorithm

Hyperparameter tuning (aka optimization / regularization) allows us to optimize our models
It's all about choosing parameter values that produce the best possible predictions

The complexity of models constructed using an algorithm can be changed by tuning
the specific hyperparameters of that algorithm
'''

In [2]:
import pandas as pd
from sklearn.datasets import load_digits

# Fetch scikit-learn's bundled handwritten-digits dataset.
digits = load_digits()

# One row per sample: the feature columns come from digits.data, and the
# class label from digits.target is appended as a 'target' column.
df = pd.DataFrame(digits.data).assign(target=digits.target)

# Separate the label to be predicted from the model inputs.
target = df['target']
features = df.drop(columns='target')

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out part of the data for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=3000,
)

# k-nearest-neighbours classifier; n_neighbors is the hyperparameter being tuned.
knn = KNeighborsClassifier(n_neighbors=7)

# Fit on the training portion only.
knn.fit(X_train, y_train)

# Predicted labels for the held-out samples.
predicted = knn.predict(X_test)

# Report accuracy as a percentage on seen vs. unseen data; a large gap
# between the two numbers is a sign of overfitting.
print('Prediction accuracy on the training data:', format(100 * knn.score(X_train, y_train), '.2f'))
print('Prediction accuracy on the test data:', format(100 * knn.score(X_test, y_test), '.2f'))

Prediction accuracy on the training data: 99.26
Prediction accuracy on the test data: 98.44


In [None]:
'''
Support Vector Machines

The strength of the regularization, or tuning, is determined by C
By default C = 1

Larger Values of C : Less Regularization
- Fit the training data as well as possible
- Each individual data point is important to classify correctly
- Increased model complexity

Smaller Values of C : More Regularization
- More tolerant of errors on individual data points

'''

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# Partition the data into training and test subsets (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=3000,
)

# Linear support vector classifier.
# Decreasing C strengthens the regularization, which yields a simpler model.
model = LinearSVC(C=0.001, max_iter=1000000, random_state=3000)

# Learn from the training subset only.
model.fit(X_train, y_train)

# Report accuracy as a percentage on seen vs. unseen data.
print('Prediction accuracy on the training data:', format(100 * model.score(X_train, y_train), '.2f'))
print('Prediction accuracy on the test data:', format(100 * model.score(X_test, y_test), '.2f'))

Prediction accuracy on the training data: 98.74
Prediction accuracy on the test data: 96.67


In [None]:
'''
Decision Trees Tuning:

max_depth : controls the maximum depth of the tree (the number of split levels between the root and the deepest leaf)
--> Most common way to reduce tree complexity and overfitting

Increasing max_depth leads to increased model complexity
--> more likely to overfit
'''

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Split data into training and test sets (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=3000)

# Select a classifier.
# Lowering max_depth gives a shallower, simpler tree; raising it increases
# model complexity and the risk of overfitting.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree (and the accuracies printed
# below) can differ from run to run.
model = DecisionTreeClassifier(max_depth=10, random_state=3000)

# Create a model by fitting the training data.
model.fit(X=X_train, y=y_train)

# Make a prediction on the test set.
predicted = model.predict(X=X_test)

# Report accuracy as a percentage on seen vs. unseen data; a large gap
# between the two numbers is a sign of overfitting.
print('Prediction accuracy on the training data:', format(model.score(X_train, y_train) * 100, '.2f'))
print('Prediction accuracy on the test data:', format(model.score(X_test, y_test) * 100, '.2f'))

Prediction accuracy on the training data: 97.92
Prediction accuracy on the test data: 83.33
