In [1]:
import pickle
import numpy as np
from typing import Type, Dict
from numpy.typing import NDArray
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator
from sklearn.model_selection import (
    ShuffleSplit,
    cross_validate,
    KFold,
)
from typing import Any
from sklearn.ensemble import RandomForestClassifier
import utils as u
import new_utils as nu

In [9]:
X, y, Xtest, ytest = u.prepare_data()

... Is MNIST dataset local?
X.shape:  (70000, 784)
y.shape:  (70000,)


In [10]:
X.shape

(60000, 784)

In [32]:
X[0:1000, :].shape

(1000, 784)

In [11]:
X, y = u.filter_out_7_9s(X, y)

In [12]:
X.shape

(12214, 784)

In [13]:
12214*0.9

10992.6

In [3]:
X.shape

(60000, 784)

In [4]:
y.shape

(60000,)

In [5]:
Xtest.shape

(10000, 784)

In [6]:
ytest.shape

(10000,)

In [7]:
Xtrain, ytrain = u.filter_out_7_9s(X, y)

In [8]:
Xtrain.shape

(12214, 784)

In [9]:
ytrain.shape

(12214,)

In [10]:
Xtest, ytest = u.filter_out_7_9s(Xtest, ytest)

In [11]:
Xtest.shape

(2037, 784)

In [12]:
ytest.shape

(2037,)

In [13]:
ytest

array([7, 9, 9, ..., 7, 7, 9], dtype=int32)

In [14]:
def scale_data(X):
    X = X.astype(float)
    X = X / X.max()
    return X

In [15]:
len(Xtrain)

12214

In [16]:
np.max(Xtrain)

1.0

In [17]:
Xtrain.max()

1.0

In [18]:
Xtest.max()

1.0

In [19]:
results=u.train_simple_classifier_with_cv(Xtrain=Xtrain,ytrain=ytrain,
                                          clf=DecisionTreeClassifier(random_state=60),
                                          cv=ShuffleSplit(n_splits=5,random_state=60))

In [20]:
results

{'fit_time': array([0.98312712, 0.98978615, 0.97525692, 1.05130315, 0.92358923]),
 'score_time': array([0.00201583, 0.00100207, 0.00097203, 0.00116992, 0.00132298]),
 'test_score': array([0.97626841, 0.96972177, 0.97217676, 0.96890344, 0.97708674])}

In [21]:
from sklearn.model_selection import GridSearchCV
clf=RandomForestClassifier(random_state=60)
cv=ShuffleSplit(n_splits=5,random_state=60)
param_grid = { 'criterion': ['gini', 'entropy'],
                        'max_depth': [10, 20, None],
                        'min_samples_split': [2, 5, 10],
                        'min_samples_leaf': [1, 2, 4],
                        'max_features': ['auto', 'sqrt', 'log2', None]
                    }
grid_search = GridSearchCV(clf, param_grid, cv=cv, n_jobs=-1, verbose=2)

grid_search.fit(Xtrain,ytrain)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


KeyboardInterrupt: 

In [None]:
grid_search.get_params()

In [None]:
grid_search.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
# Get the best estimator with optimal hyperparameters
best_clf = grid_search.best_estimator_

# Train the best classifier on the entire training dataset
best_clf.fit(Xtrain,ytrain)

# Evaluate performance on training dataset
y_pred_train = best_clf.predict(Xtrain)
confusion_matrix_train = confusion_matrix(ytrain, y_pred_train)
accuracy_train = accuracy_score(ytrain, y_pred_train)

# Evaluate performance on test dataset
y_pred_test = best_clf.predict(Xtest)
confusion_matrix_test = confusion_matrix(ytest, y_pred_test)
accuracy_test = accuracy_score(ytest, y_pred_test)

# Construct the answer dictionary
answer = {
"clf": clf,
"default_parameters": clf.get_params(),
"best_estimator": best_clf,
"grid_search": grid_search,
"mean_accuracy_cv": grid_search.best_score_,
"confusion_matrix_train_orig": confusion_matrix_train,
"confusion_matrix_train_best": confusion_matrix_train,
"confusion_matrix_test_orig": confusion_matrix_test,
"confusion_matrix_test_best": confusion_matrix_test,
"accuracy_orig_full_training": accuracy_train,
"accuracy_best_full_training": accuracy_train,
"accuracy_orig_full_testing": accuracy_test,
"accuracy_best_full_testing": accuracy_test
}

In [None]:
import part_1_template_solution as p

In [None]:
f_answer=p.Section1(seed=42).partF(Xtrain,ytrain)

In [None]:
f_answer

In [19]:
import pickle

# Load the contents of the .pkl file
with open('section1.pkl', 'rb') as file:
    data = pickle.load(file)



In [20]:
data

{'1A': 0,
 '1B': ({'length_Xtrain': 12214,
   'length_Xtest': 2037,
   'length_ytrain': 12214,
   'length_ytest': 2037,
   'max_Xtrain': 1.0,
   'max_Xtest': 1.0},
  array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  array([9, 7, 9, ..., 7, 9, 9], dtype=int32),
  array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]]),
  array([7, 9, 9, ..., 7, 7, 9], dtype=int32)),
 '1C': {'clf': DecisionTreeClassifier(random_state=42),
  'cv': KFold(n_splits=5, random_state=42, shuffle=True),
  'scores': {'mean_fit_time': 0.0018794536590576172,
   'std_fit_time': 0.0002117507600533784,
   'mean_accuracy': 

In [14]:
data['3B']

{'length_Xtrain': 6859,
 'length_Xtest': 1128,
 'length_ytrain': 6859,
 'length_ytest': 1128,
 'max_Xtrain': 1.0,
 'max_Xtest': 1.0}

In [15]:
data['3C']

{'scores': {'mean_accuracy': 0.9759450763737817,
  'mean_recall': 0.9198197108594408,
  'mean_precision': 0.9275316513774694,
  'mean_f1': 0.9233437888081066,
  'std_accuracy': 0.00441747131049589,
  'std_recall': 0.01712214593612047,
  'std_precision': 0.01652890854663595,
  'std_f1': 0.013776682683587546},
 'cv': StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
 'clf': SVC(kernel='linear', random_state=42),
 'is_precision_higher_than_recall': 'Precision',
 'confusion_matrix_train': array([[6247,   18],
        [  24,  570]]),
 'confusion_matrix_test': array([[1011,   17],
        [  14,   86]])}

In [None]:
data['1F']

In [None]:
Xtrain, ytrain, Xtest, ytest = u.prepare_data()
Xtrain = nu.scale_data(Xtrain)
Xtest = nu.scale_data(Xtest)

answer = {}
answer["nb_classes_train"] = len(np.unique(ytrain))
answer["nb_classes_test"] = len(np.unique(ytest))
answer["class_count_train"] = np.bincount(ytrain)
answer["class_count_test"] = np.bincount(ytest)
answer["length_Xtrain"] = len(Xtrain)
answer["length_Xtest"] = len(Xtest)
answer["length_ytrain"] = len(ytrain)
answer["length_ytest"] = len(ytest)
answer["max_Xtrain"] = Xtrain.max()
answer["max_Xtest"] = Xtest.max()

In [None]:
answer

In [17]:
import pickle

# Load the contents of the .pkl file
with open('section3.pkl', 'rb') as file:
    data = pickle.load(file)



In [18]:
data

{'3A': {1: {'score_train': 0.9388666666666666, 'score_test': 0.9259},
  2: {'score_train': 0.9773833333333334, 'score_test': 0.9714},
  3: {'score_train': 0.9892666666666666, 'score_test': 0.9849},
  4: {'score_train': 0.9944166666666666, 'score_test': 0.992},
  5: {'score_train': 0.9972166666666666, 'score_test': 0.9952},
  'clf': LogisticRegression(max_iter=300, random_state=42),
  'plot_k_vs_score_train': [(1, 0.9388666666666666),
   (2, 0.9773833333333334),
   (3, 0.9892666666666666),
   (4, 0.9944166666666666),
   (5, 0.9972166666666666)],
  'plot_k_vs_score_test': [(1, 0.9259),
   (2, 0.9714),
   (3, 0.9849),
   (4, 0.992),
   (5, 0.9952)],
  'text_rate_accuracy_change': 'The model consistently demonstrates positive improvements in accuracy as the value of k increases for the testing data, suggesting that the model becomes increasingly proficient in predicting the top-k classes',
  'text_is_topk_useful_and_why': "The top-k accuracy metric is valuable for evaluating the model's pe

In [21]:
int(0.1)

0

In [27]:
int(0.9*1)

0

In [25]:
0.9*549

494.1

In [26]:
int(494.9)

494

In [None]:
data['2B']

In [None]:
ytest == 9

In [None]:
nine_idx = (ytest == 9)

In [None]:
ytest.shape

In [None]:
ytest[nine_idx][:int((ytest[nine_idx].shape[0])*0.1),:]

In [None]:
ytest[nine_idx].shape

In [None]:
import numpy as np
from numpy.typing import NDArray
from typing import Any

"""
   In the first two set of tasks, we will narrowly focus on accuracy - 
   what fraction of our predictions were correct. However, there are several 
   popular evaluation metrics. You will learn how (and when) to use these evaluation metrics.
"""


# ======================================================================
class Section3:
    def __init__(
        self,
        normalize: bool = True,
        frac_train=0.2,
        seed=42,
    ):
        self.seed = seed
        self.normalize = normalize

    def analyze_class_distribution(self, y: NDArray[np.int32]) -> dict[str, Any]:
        """
        Analyzes and prints the class distribution in the dataset.

        Parameters:
        - y (array-like): Labels dataset.

        Returns:
        - dict: A dictionary containing the count of elements in each class and the total number of classes.
        """
        # Your code here to analyze class distribution
        # Hint: Consider using collections.Counter or numpy.unique for counting

        uniq, counts = np.unique(y, return_counts=True)
        print(f"{uniq=}")
        print(f"{counts=}")
        print(f"{np.sum(counts)=}")

        return {
            "class_counts": {},  # Replace with actual class counts
            "num_classes": 0,  # Replace with the actual number of classes
        }

    # --------------------------------------------------------------------------
    """
    A. Using the same classifier and hyperparameters as the one used at the end of part 2.B. 
       Get the accuracies of the training/test set scores using the top_k_accuracy score for k=1,2,3,4,5. 
       Make a plot of k vs. score for both training and testing data and comment on the rate of accuracy change. 
       Do you think this metric is useful for this dataset?
    """

    def partA(
        self,
        Xtrain: NDArray[np.floating],
        ytrain: NDArray[np.int32],
        Xtest: NDArray[np.floating],
        ytest: NDArray[np.int32],
    ) -> tuple[
        dict[Any, Any],
        NDArray[np.floating],
        NDArray[np.int32],
        NDArray[np.floating],
        NDArray[np.int32],
    ]:
        """ """
        # Enter code and return the `answer`` dictionary

        answer = {}

        """
        # `answer` is a dictionary with the following keys:
        - integers for each topk (1,2,3,4,5)
        - "clf" : the classifier
        - "plot_k_vs_score_train" : the plot of k vs. score for the training data, 
                                    a list of tuples (k, score) for k=1,2,3,4,5
        - "plot_k_vs_score_test" : the plot of k vs. score for the testing data
                                    a list of tuples (k, score) for k=1,2,3,4,5

        # Comment on the rate of accuracy change for testing data
        - "text_rate_accuracy_change" : the rate of accuracy change for the testing data

        # Comment on the rate of accuracy change
        - "text_is_topk_useful_and_why" : provide a description as a string

        answer[k] (k=1,2,3,4,5) is a dictionary with the following keys: 
        - "score_train" : the topk accuracy score for the training set
        - "score_test" : the topk accuracy score for the testing set
        """

        return answer, Xtrain, ytrain, Xtest, ytest

    # --------------------------------------------------------------------------
    """
    B. Repeat part 1.B but return an imbalanced dataset consisting of 90% of all 9s removed.  Also convert the 7s to 0s and 9s to 1s.
    """

    def partB(
        self,
        X: NDArray[np.floating],
        y: NDArray[np.int32],
        Xtest: NDArray[np.floating],
        ytest: NDArray[np.int32],
    ) -> tuple[
        dict[Any, Any],
        NDArray[np.floating],
        NDArray[np.int32],
        NDArray[np.floating],
        NDArray[np.int32],
    ]:
        """"""
        # Enter your code and fill the `answer` dictionary
        answer = {}

        #X, y, Xtest, ytest = u.prepare_data()
        X, y = u.filter_out_7_9s(X, y)
        Xtest, ytest = u.filter_out_7_9s(Xtest, ytest)
        
        X,y=nu.remove_90_9s(X,y)
        Xtest,ytest=nu.remove_90_9s(Xtest,ytest)
        
        X,y=nu.convert_7_0(X,y)
        Xtest,ytest=nu.convert_7_0(Xtest,ytest)
        
        X,y=nu.convert_9_1(X,y)
        Xtest,ytest=nu.convert_9_1(Xtest,ytest)

        answer["length_Xtrain"] = len(X)  
        answer["length_Xtest"] = len(Xtest)
        answer["length_ytrain"] = len(y)
        answer["length_ytest"] = len(ytest)
        answer["max_Xtrain"] = X.max()
        answer["max_Xtest"] = Xtest.max()

        # Answer is a dictionary with the same keys as part 1.B

        return answer, X, y, Xtest, ytest

    # --------------------------------------------------------------------------
    """
    C. Repeat part 1.C for this dataset but use a support vector machine (SVC in sklearn). 
        Make sure to use a stratified cross-validation strategy. In addition to regular accuracy 
        also print out the mean/std of the F1 score, precision, and recall. As usual, use 5 splits. 
        Is precision or recall higher? Explain. Finally, train the classifier on all the training data 
        and plot the confusion matrix.
        Hint: use the make_scorer function with the average='macro' argument for a multiclass dataset. 
    """

    def partC(
        self,
        X: NDArray[np.floating],
        y: NDArray[np.int32],
        Xtest: NDArray[np.floating],
        ytest: NDArray[np.int32],
    ) -> dict[str, Any]:
        """"""

        # Enter your code and fill the `answer` dictionary
        answer = {}

        """
        Answer is a dictionary with the following keys: 
        - "scores" : a dictionary with the mean/std of the F1 score, precision, and recall
        - "cv" : the cross-validation strategy
        - "clf" : the classifier
        - "is_precision_higher_than_recall" : a boolean
        - "explain_is_precision_higher_than_recall" : a string
        - "confusion_matrix_train" : the confusion matrix for the training set
        - "confusion_matrix_test" : the confusion matrix for the testing set
        
        answer["scores"] is dictionary with the following keys, generated from the cross-validator:
        - "mean_accuracy" : the mean accuracy
        - "mean_recall" : the mean recall
        - "mean_precision" : the mean precision
        - "mean_f1" : the mean f1
        - "std_accuracy" : the std accuracy
        - "std_recall" : the std recall
        - "std_precision" : the std precision
        - "std_f1" : the std f1
        """

        return answer