In [1]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')

In [2]:
"""
Split Data into Training and Test Sets
Typically the train test split is 80% training and 20% test. 
In this case, I chose 6/7th of the data to be training and 1/7th of the data to be in the test set.
"""
from sklearn.model_selection import train_test_split
# test_size: what proportion of original data is used for test set
train_img, test_img, train_lbl, test_lbl = train_test_split( mnist.data, mnist.target, test_size=1/7.0, random_state=0)

In [3]:
##Standardize the Data
##Note you fit on the training set and transform on the training and test set.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(train_img)
# Apply transform to both the training set and the test set.
train_img = scaler.transform(train_img)
test_img = scaler.transform(test_img)

In [4]:
"""
Notice the code below has .95 for the number of components parameter. 
It means that scikit-learn choose the minimum number of principal components such that 95% of the variance is retained.
"""
from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.95)

In [5]:
lower_dimensional_data=pca.fit(train_img)

In [6]:
"""
Note: You can find out how many components PCA choose after fitting.
In this case, 95% of the variance amounts to 330 principal components.
"""
pca.n_components_

327

In [7]:
#Apply the mapping (transform) to both the training set and the test set.
train_img = pca.transform(train_img)
test_img = pca.transform(test_img)

In [8]:
#Apply Logistic Regression to the Transformed Data
from sklearn.linear_model import LogisticRegression
# all parameters not specified are set to their defaults
# default solver is incredibly slow which is why it was changed to 'lbfgs'
logisticRegr = LogisticRegression(solver = 'lbfgs')

In [9]:
#Model is learning the relationship between digits and labels
logisticRegr.fit(train_img, train_lbl)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
pred=logisticRegr.predict(test_img)
pred.shape

(10000,)

In [11]:
#Calculating accuracy for simplicity
logisticRegr.score(test_img, test_lbl)

0.9201

In [12]:
"""
Demonstrated using PCA to compress high dimensional data to lower dimensional data.

"""

'\nDemonstrated using PCA to compress high dimensional data to lower dimensional data.\n\n'