# Training Machine Learning Models with scikit-learn

### Basics


In [1]:
import pandas as pd

# (*) Toy datasets of hand-written digits and 20 newsgroups text
from sklearn.datasets import load_digits, fetch_20newsgroups # (*)
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC


Building a 'Naive Bayes' model to classify the hand-written digits dataset


In [2]:
# Fetch the bundled hand-written digits dataset, then separate the feature
# matrix (one flattened image per row) from the vector of labels.
digits = load_digits()
digits_data, digits_targets = digits.data, digits.target

# Every sample is stored flat; reshaping the first one to 8 x 8 shows it
# in its original image layout.
print(
    'First handwritten digit 8 x 8 matrix:',
    digits_data[0].reshape(8, 8),
    sep='\n',
)
print(f'Label of first handwritten digit: {digits_targets[0]}')


First handwritten digit 8 x 8 matrix:
[[ 0.  0.  5. 13.  9.  1.  0.  0.]
 [ 0.  0. 13. 15. 10. 15.  5.  0.]
 [ 0.  3. 15.  2.  0. 11.  8.  0.]
 [ 0.  4. 12.  0.  0.  8.  8.  0.]
 [ 0.  5.  8.  0.  0.  9.  8.  0.]
 [ 0.  4. 11.  0.  1. 12.  7.  0.]
 [ 0.  2. 14.  5. 10. 12.  0.  0.]
 [ 0.  0.  6. 13. 10.  0.  0.  0.]]
Label of first handwritten digit: 0


In [3]:
# Hold out part of the digits for evaluation. The fixed random_state makes the
# shuffle, and therefore the reported accuracy, reproducible.
(
    training_data,
    testing_data,
    training_targets,
    testing_targets,
) = train_test_split(digits_data, digits_targets, random_state=0)

# Fit a Gaussian Naive Bayes classifier on the training portion only
# (fit() returns the estimator itself, so the calls can be chained).
digits_model_gnb = GaussianNB().fit(training_data, training_targets)

# Predict labels for the samples the model has not seen
predicted_targets = digits_model_gnb.predict(testing_data)

# Fraction of held-out samples whose predicted label matches the true label
accuracy = accuracy_score(y_true=testing_targets, y_pred=predicted_targets)
print(accuracy)


0.8333333333333334


Building a 'Multinomial Naive Bayes' model to classify the 20 newsgroups text dataset, using sklearn pipelines

In [4]:
# Restrict the 20 newsgroups corpus to four topics
categories = [
    "soc.religion.christian",
    "talk.religion.misc",
    "comp.sys.mac.hardware",
    "sci.crypt",
]

# Fetch the predefined "train" and "test" subsets with identical settings,
# training subset first.
newsgroups_training, newsgroups_testing = (
    fetch_20newsgroups(subset=subset, categories=categories, random_state=0)
    for subset in ("train", "test")
)


In [5]:
# Display the first raw post to get a feel for the text the model will see
# (headers, quoted replies and signature are all part of the string).

newsgroups_training["data"][0]


"From: sandvik@newton.apple.com (Kent Sandvik)\nSubject: Re: Ignorance is BLISS, was Is it good that Jesus died?\nOrganization: Cookamunga Tourist Bureau\nLines: 17\n\nIn article <f1682Ap@quack.kfu.com>, pharvey@quack.kfu.com (Paul Harvey)\nwrote:\n> In article <sandvik-170493104859@sandvik-kent.apple.com> \n> sandvik@newton.apple.com (Kent Sandvik) writes:\n> >Ignorance is not bliss!\n \n> Ignorance is STRENGTH!\n> Help spread the TRUTH of IGNORANCE!\n\nHuh, if ignorance is strength, then I won't distribute this piece\nof information if I want to follow your advice (contradiction above).\n\n\nCheers,\nKent\n---\nsandvik@newton.apple.com. ALink: KSAND -- Private activities on the net.\n"

In [6]:
# Build a two-step pipeline (TF-IDF features feeding a multinomial Naive Bayes
# classifier) and fit it on the raw training posts in one go.
newsgroups_model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(
    newsgroups_training.data, newsgroups_training.target
)

# Predict a newsgroup for every post in the testing subset
predicted_targets = newsgroups_model.predict(newsgroups_testing.data)

# Overall fraction of correctly classified test posts
accuracy = accuracy_score(
    y_true=newsgroups_testing.target, y_pred=predicted_targets
)
print(f'The accuracy is {accuracy}')

# Confusion matrix: rows are the true newsgroups, columns the predicted ones.
# Wrapping it in a labelled DataFrame makes the per-class errors readable.
confusion = confusion_matrix(newsgroups_testing.target, predicted_targets)
confusion_df = pd.DataFrame(
    confusion,
    index=newsgroups_testing.target_names,
    columns=newsgroups_testing.target_names,
).rename_axis(index="True", columns="Predicted")
confusion_df


The accuracy is 0.8314685314685315


Predicted,comp.sys.mac.hardware,sci.crypt,soc.religion.christian,talk.religion.misc
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
comp.sys.mac.hardware,354,19,12,0
sci.crypt,3,390,3,0
soc.religion.christian,1,2,395,0
talk.religion.misc,1,19,181,50


Applying cross-validation to the hand-written digits dataset

In [7]:
# Cross-validate the Gaussian Naive Bayes estimator on the complete digits
# dataset; one accuracy score is returned per fold.
score = cross_val_score(
    estimator=digits_model_gnb,
    X=digits_data,
    y=digits_targets,
)

print(f'The accuracy score of the five folds: {score}')
print(f'The mean of the last accuracy scores:: {score.mean()}')


The accuracy score of the five folds: [0.78055556 0.78333333 0.79387187 0.8718663  0.80501393]
The mean of the last accuracy scores:: 0.8069281956050759


### Classifying data with Naive Bayes models


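A Gaussian Naive Bayes model assumes that, within each class, every feature is drawn from a Gaussian distribution. The 'naive' label refers to the additional assumption that the features are conditionally independent of one another given the class.
To train our model, we compute the mean and the variance (the square of the standard deviation) of each feature for each class.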



In [8]:
# Per-class parameters learned by the Gaussian Naive Bayes model for the
# 64 pixels of the 8x8 pixel matrix. Row 0 belongs to the digit zero.
print("Mean of each pixel for digit zero")
print(digits_model_gnb.theta_[0])

# The fitted model stores per-class *variances*, not standard deviations.
# The attribute is called `var_` in current scikit-learn; the old name
# `sigma_` was deprecated in 1.0 and removed in 1.2, so it is only used as a
# fallback for older installations.
pixel_variances = (
    digits_model_gnb.var_
    if hasattr(digits_model_gnb, "var_")
    else digits_model_gnb.sigma_
)

# Take the square root so the printed values really are standard deviations.
# NOTE: scikit-learn adds a small smoothing term to the variances
# (`var_smoothing`), so pixels that are constant within the class show a tiny
# positive value rather than exactly zero.
print("Standard deviation of each pixel for digit zero")
print(pixel_variances[0] ** 0.5)


Mean of each pixel for digit zero
[0.00000000e+00 2.83687943e-02 4.12765957e+00 1.29716312e+01
 1.13049645e+01 2.96453901e+00 3.54609929e-02 0.00000000e+00
 0.00000000e+00 9.50354610e-01 1.25035461e+01 1.37021277e+01
 1.16453901e+01 1.12765957e+01 9.00709220e-01 0.00000000e+00
 0.00000000e+00 3.79432624e+00 1.43758865e+01 5.57446809e+00
 2.13475177e+00 1.23049645e+01 3.43971631e+00 0.00000000e+00
 0.00000000e+00 5.31205674e+00 1.27517730e+01 2.06382979e+00
 1.34751773e-01 9.26241135e+00 6.45390071e+00 0.00000000e+00
 0.00000000e+00 5.78723404e+00 1.16737589e+01 1.00000000e+00
 5.67375887e-02 8.89361702e+00 7.10638298e+00 0.00000000e+00
 0.00000000e+00 3.41843972e+00 1.33687943e+01 1.82269504e+00
 1.69503546e+00 1.12127660e+01 5.90070922e+00 0.00000000e+00
 0.00000000e+00 7.80141844e-01 1.29787234e+01 1.02056738e+01
 1.06382979e+01 1.32340426e+01 2.53191489e+00 0.00000000e+00
 0.00000000e+00 7.09219858e-03 4.15602837e+00 1.35602837e+01
 1.33049645e+01 5.46099291e+00 2.83687943e-01 0.000

### Classifying data with support vector machines


scikit-learn also provides SVM estimators that follow the same estimator API, so the training process is similar to the one used for the Naive Bayes models.

In [9]:

# Support vector classifier with scikit-learn's default settings
digit_model_svm = SVC()

# Cross-validate it on the complete digits dataset; one accuracy score is
# returned per fold.
score = cross_val_score(
    estimator=digit_model_svm,
    X=digits_data,
    y=digits_targets,
)

print(f'The accuracy score of the five folds: {score}')
print(f'The mean of the last accuracy scores:: {score.mean()}')


The accuracy score of the five folds: [0.96111111 0.94444444 0.98328691 0.98885794 0.93871866]
The mean of the last accuracy scores:: 0.9632838130609718


scikit-learn provides a class called GridSearchCV that automatically searches for the best hyperparameters of an estimator.
Here we use it to tune the kernel function and the 'C' parameter of our SVM estimator.

In [11]:
# Candidate hyperparameters: every combination of regularisation strength C
# and kernel function is evaluated with cross-validation.
params_grid = {
    "C": [1, 10, 100, 1000],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
}
grid = GridSearchCV(estimator=SVC(), param_grid=params_grid)

# Run the exhaustive search on the complete digits dataset
grid.fit(digits_data, digits_targets)

# Report the winning combination and its mean cross-validated score
print(f"The best parameters are: {grid.best_params_}")
print(f"The best score is: {grid.best_score_}")


The best parameters are: {'C': 10, 'kernel': 'rbf'}
The best score is: 0.9738502011761063
