In [2]:
import sklearn
import numpy

print(numpy.__version__)
print(sklearn.__version__)



2.1.3
1.5.2


In [3]:
# Importing the Iris dataset from scikit-learn. This dataset contains data about three species of iris flowers.
from sklearn.datasets import load_iris

# Load the Iris dataset and store it in a variable called iris.
iris = load_iris()

# X represents the features (inputs) of the model. In this case, X contains data about the physical measurements of the flowers.
# Each row in X is an individual flower, and each column corresponds to one of the four features:
# - Sepal length
# - Sepal width
# - Petal length
# - Petal width
X = iris.data  # type is of numpy.ndarray

# y represents the target (output) ("labels") of the model. For each flower in the dataset, y tells us the flower's species.
# The target values are numerical, with each value corresponding to a different species. This is the answer we want from our
# model.
# - 0: Setosa
# - 1: Versicolor
# - 2: Virginica
y = iris.target

# feature_names gives us the names of the four features used to describe each flower. These are the columns in the X dataset.
# These features will be the input variables for our machine learning model.
feature_names = iris.feature_names

# target_names tells us the names of the three species of flowers in the dataset. These will correspond to the values in y.
target_names = iris.target_names

# Printing the names of the features (measurements of the flowers)
print("Feature names:", feature_names)

# Printing the names of the target classes (types of flowers)
print("Target names:", target_names)


Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target names: ['setosa' 'versicolor' 'virginica']


In [4]:
from sklearn.model_selection import train_test_split
# klearn.model_selection.train_test_split(
#                            *arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)

# Importing the train_test_split function from scikit-learn. This function helps us split our dataset into training and 
# testing sets.
from sklearn.model_selection import train_test_split

# Splitting the dataset (X, y) into training and testing sets.
# - X represents the features (measurements of the flowers).
# - y represents the target (the species of the flowers).
# - We are using 80% of the data for training and 20% for testing (test_size=0.2 means 20% for testing).
#   This is a common practice to ensure the model is evaluated on data it hasn't seen during training.
# - The train_test_split function automatically shuffles the data and splits it.
# - The random_state argument ensures reproducibility by fixing the random split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# X_train is the set of features (inputs) that will be used to train the model.
# y_train is the set of labels (targets) that correspond to X_train, and will teach the model the correct outputs.
# X_test is the set of features the model will use to make predictions, but it hasn't seen this data during training.
# y_test is the true target values corresponding to X_test. After training, we compare the model's predictions to y_test 
# to evaluate its performance.
# Training Set (X_train and y_train): The model learns the relationship between the input features (X_train) and the target 
# labels (y_train).
# Testing Set (X_test and y_test): This subset is used to evaluate the model's performance after training. The model makes 
# predictions on X_test, and these predictions are compared to the true values in y_test to see how well the model generalizes 
# to new, unseen data

# Printing the shapes of the training and testing datasets to understand how the data has been split.
# The shape will show the number of samples (rows) and the number of features (columns) in each set.
print("Training features (X_train) shape:", X_train.shape)  # Shows how many samples and features are in the training set
print("Testing features (X_test) shape:", X_test.shape)    # Shows how many samples and features are in the test set


Training features (X_train) shape: (120, 4)
Testing features (X_test) shape: (30, 4)
Training verification features (y_train) shape: (120,)
Testing  verification features (y_test) shape: (30,)


In [4]:
# Importing the KNeighborsClassifier from sklearn.neighbors
# KNeighborsClassifier is a machine learning algorithm used for classification based on the proximity of data points.
# It classifies new data points by looking at the 'k' nearest data points in the training set and using their majority class.
from sklearn.neighbors import KNeighborsClassifier

# n_neighbors is a parameter that defines how many of the nearest data points (neighbors) the algorithm should consider
# when making a prediction for a new data point. 
# In this case, we are setting n_neighbors to 3, meaning the model will consider the 3 closest data points in the training set.
# It will then assign the majority class label from these 3 neighbors as the predicted class for the new data point.
knn = KNeighborsClassifier(n_neighbors=3)

# The `fit` method is used to train the model using the training data (`X_train` for the features and `y_train` for the target labels).
# `X_train` contains the input features (e.g., sepal length, petal width) that describe each data point (flower).
# `y_train` contains the correct class labels (the actual species of the flowers, such as Setosa, Versicolor, or Virginica).
# During training, the model "learns" the relationship between `X_train` and `y_train` so it can make predictions later.
knn.fit(X_train, y_train)

# After training, the model now has learned how the input features (`X_train`) correspond to the target labels (`y_train`).
# It has stored the training data in a way that allows it to classify new data points based on the closest neighbors.
# Now we can use the trained model to predict the species of new flowers based on their features in `X_test`.
y_pred = knn.predict(X_test)

# The variable `y_pred` will now contain the predicted species labels for the new data points in `X_test`.
# These predictions are based on the majority class of the 3 nearest neighbors of each flower in `X_test` (as determined by the model).


In [5]:
# Importing the metrics module from sklearn to evaluate the performance of the trained model.
# The metrics module provides various functions for assessing the accuracy, precision, recall, etc., of a model's predictions.
from sklearn import metrics

# `accuracy_score` is a function from the metrics module that calculates the accuracy of the model's predictions.
# Accuracy is the proportion of correct predictions out of all predictions made.
# It compares the true labels (`y_test`) with the predicted labels (`y_pred`) to compute the proportion of matches.
print(metrics.accuracy_score(y_test, y_pred))

# In this case, `y_test` contains the true species labels for the test data (flowers).
# `y_pred` contains the predicted species labels for the test data (flowers) generated by the KNN model.
# The `accuracy_score` function compares each element in `y_test` with the corresponding element in `y_pred`.
# It returns a value between 0 and 1, where 1 means 100% accuracy (all predictions are correct),
# and a value closer to 0 means the model performed poorly (many predictions are incorrect).


0.9666666666666667


In [6]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.9666666666666667


In [9]:
# Sample input data. Each sublist represents a single sample with 4 features:
# [sepal length, sepal width, petal length, petal width]
sample = [[3, 5, 4, 2], [2, 4, 3, 5], [2, 3, 4, 5]]

# Use the previously trained KNN model (knn) to predict the species of each sample.
# The .predict() method returns an array of predicted labels (target indices) based on the sample input.
predictions = knn.predict(sample)

# Map the predicted target indices (predictions) to the actual species names.
# iris.target_names is an array of species names, and we use each predicted index to access the correct species name.
# The model outputs the index of the predicted species in iris.target_names, so the result of pred_species is the index of the prediction name from
# iris.target_names. This list comprehension then takes that list of indexes and for each "p" in predictions, pulls the name from iris.target_names.
pred_species = [iris.target_names[p] for p in predictions]

# Print out the predicted species names.
# The f-string formats and prints the result in a readable way.
print(f"Predictions: {pred_species}")


Predictions: [np.str_('versicolor'), np.str_('virginica'), np.str_('virginica')]


In [17]:
# Model persistence: Instead of re-training the model (using knn.fit and knn.predict) each time new data is added,
# we can save (persist) the trained model to a file. This allows us to load the trained model later and make predictions 
# without having to train it again.

# Importing the 'joblib' library, which allows us to save and load Python objects (such as models).
import joblib

# Saving the trained KNN model to a file named 'mlbrain.joblib'.
# 'joblib.dump()' serializes the model and stores it in a binary format.
# The filename ('mlbrain.joblib') will be used later to load the model back into memory.
joblib.dump(knn, 'mlbrain.joblib')  # stores model in a binary file

# Loading the saved model back into memory.
# 'joblib.load()' reads the binary file and recreates the trained model.
model = joblib.load('mlbrain.joblib')

# Using the loaded model to make predictions on new data (X_test).
# The model has already been trained on some data, and now we test it with new input data (X_test).
model.predict(X_test)

# Sample data: each sublist contains 4 feature values representing individual iris flowers
sample = [[3, 5, 4, 2], [2, 4, 3, 5], [2, 3, 4, 5]]

# Using the loaded model to predict the species of the new samples (based on their features).
# The model.predict() method returns the indices of the predicted species.
predictions = model.predict(sample)

# Mapping the predicted species indices (from model.predict()) to the actual species names.
# iris.target_names is a list that contains the species names. 
# We use the predicted indices to retrieve the corresponding species names.
pred_species = [iris.target_names[p] for p in predictions]

# Printing out the predicted species names in a user-friendly format.
print(f"Predictions: {pred_species}")


Predictions: [np.str_('versicolor'), np.str_('virginica'), np.str_('virginica')]
