Import the necessary libraries and load the dataset:

In [7]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits

# Load the scikit-learn digits bunch and build one frame holding the pixel
# feature columns plus the digit label in a `target` column.
digits = load_digits()
df = pd.DataFrame(
    digits.data,
    columns=digits.feature_names,
).assign(target=digits.target)


Filter the dataset to only include instances that correspond to the digit "2" or "3":

In [8]:
# Keep only the rows whose label is the digit 2 or the digit 3.
df = df.loc[df['target'].isin([2, 3])]


Select a subset of variables ("pixel_0_0", "pixel_0_2", "pixel_0_4", "pixel_0_6", and "pixel_1_0") to use for classification:

In [9]:
# Narrow the frame to the label plus the five pixel columns used as features.
df = df.loc[
    :,
    ['target', 'pixel_0_0', 'pixel_0_2', 'pixel_0_4', 'pixel_0_6', 'pixel_1_0'],
]


Split the dataset into training and test sets:

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.2,
    random_state=42,
)

Fit a linear regression model to the training data using all variables except the target label:

In [11]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares regression of the numeric digit label (2 or 3)
# on the selected pixel columns.
lin_mod = LinearRegression()
lin_mod.fit(X_train, y_train)


Predict the target labels for the training and test data using the linear regression model, and calculate the error rate for each:

In [12]:
# The regression output is continuous; 2.5 is the midpoint between the two
# labels, so scores above it are classified as 3 and everything else as 2.
scores_lin_train = lin_mod.predict(X_train)
pred_vals_lin_train = np.where(scores_lin_train > 2.5, 3, 2)
error_rate_lin_train = (pred_vals_lin_train != y_train).mean()

scores_lin_test = lin_mod.predict(X_test)
pred_vals_lin_test = np.where(scores_lin_test > 2.5, 3, 2)
error_rate_lin_test = (pred_vals_lin_test != y_test).mean()


In [15]:
# Report the test-set misclassification rate of the thresholded linear model.
print(error_rate_lin_test)

0.3611111111111111


Perform KNN classification on the training and test data using k values of 1, 3, 5, 7, and 15, and calculate the error rate for each:

In [16]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to compare, and the per-k misclassification rates
# collected for each split (same order as k_values).
k_values = [1, 3, 5, 7, 15]
error_rate_knn_train_list = []
error_rate_knn_test_list = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    # Score the fitted model on the training split first, then the test
    # split, appending each error rate to its own list.
    for features, labels, error_rates in (
        (X_train, y_train, error_rate_knn_train_list),
        (X_test, y_test, error_rate_knn_test_list),
    ):
        error_rates.append(np.mean(knn.predict(features) != labels))


Create two data frames containing the error rates for the KNN classification on the training and test data, respectively, for each value of k:

In [17]:
# One row per k: the neighbourhood size next to the error rate measured on
# the training split and on the test split, respectively.
error_rate_knn_train_df = pd.DataFrame(
    dict(k=k_values, train_error=error_rate_knn_train_list)
)
error_rate_knn_test_df = pd.DataFrame(
    dict(k=k_values, test_error=error_rate_knn_test_list)
)


In [18]:
# Display the table of test-set error rates, one row per value of k.
error_rate_knn_test_df

Unnamed: 0,k,test_error
0,1,0.388889
1,3,0.402778
2,5,0.513889
3,7,0.416667
4,15,0.402778


Neural Networks

In [14]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling, and therefore the reported error rate,
# are repeatable from run to run.
set_random_seed(42)

# Load and preprocess the data: keep only digits 2 and 3 and the five pixel
# columns used as features.
digits = load_digits()
df = pd.DataFrame(digits.data, columns=digits.feature_names)
df['target'] = digits.target

df = df[df['target'].isin([2, 3])]
df = df[['target', 'pixel_0_0', 'pixel_0_2', 'pixel_0_4', 'pixel_0_6', 'pixel_1_0']]

# Split the dataset into training and test sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.2, random_state=42)

# Create a neural network model: one hidden ReLU layer of 16 units feeding a
# single sigmoid unit that outputs the probability of the positive class.
model = Sequential()
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model.
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train the model.
# Binary cross-entropy needs 0/1 labels, so digit 3 is encoded as 1 and
# digit 2 as 0.
y_train_binary = (y_train == 3).astype(int)
y_test_binary = (y_test == 3).astype(int)
model.fit(X_train, y_train_binary, epochs=50, batch_size=16, verbose=0)

# Evaluate the model; evaluate() returns [loss, accuracy] and only the
# accuracy is kept.
_, train_accuracy = model.evaluate(X_train, y_train_binary, verbose=0)
_, test_accuracy = model.evaluate(X_test, y_test_binary, verbose=0)

# Calculate the error rates as the complement of accuracy
error_rate_train = 1 - train_accuracy
error_rate_test = 1 - test_accuracy

print("Error rate on the test set:", error_rate_test)




Error rate on the test set: 0.3055555820465088


ExtraTreesClassifier

In [17]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier

# Rebuild the working frame from scratch so this cell runs on its own:
# digits 2 and 3 only, restricted to the label and five pixel columns.
digits = load_digits()
df = pd.DataFrame(digits.data, columns=digits.feature_names).assign(
    target=digits.target
)
df = df.loc[
    df['target'].isin([2, 3]),
    ['target', 'pixel_0_0', 'pixel_0_2', 'pixel_0_4', 'pixel_0_6', 'pixel_1_0'],
]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.2,
    random_state=42,
)

# Build and fit a 100-tree extremely-randomised forest (seeded)
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Label both splits with the fitted ensemble
pred_vals_extra_trees_train = extra_trees.predict(X_train)
pred_vals_extra_trees_test = extra_trees.predict(X_test)

# Error rate = share of rows whose predicted label differs from the true one
error_rate_extra_trees_train = (pred_vals_extra_trees_train != y_train).mean()
error_rate_extra_trees_test = (pred_vals_extra_trees_test != y_test).mean()

print("Error rate on the test set:", error_rate_extra_trees_test)


Error rate on the test set: 0.4305555555555556
